In [39]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [9]:
# Loader for scikit-learn's bundled breast cancer dataset (used by later cells).
from sklearn.datasets import load_breast_cancer

# Chemical descriptor table, read from a CSV in the working directory.
df = pd.read_csv("Chemical_descriptors.csv")

In [10]:
# Load the breast cancer dataset; later cells read its .data, .target and
# .feature_names attributes.
breast = load_breast_cancer()

In [11]:
# Feature matrix of the breast cancer dataset (one row per sample).
breast_data = breast.data

In [12]:
# Show (rows, columns) of the breast cancer feature matrix.
breast_data.shape

(569, 30)

In [13]:
# Show (rows, columns) of the chemical descriptor table.
df.shape

(55, 220)

In [14]:
# NOTE(review): this raises AttributeError -- the DataFrame read from CSV has
# no `target` column/attribute (only the scikit-learn dataset object does).
df.target

AttributeError: 'DataFrame' object has no attribute 'target'

In [15]:
# NOTE(review): this raises AttributeError -- the DataFrame has no `features`
# column/attribute, so `features` is not assigned by this cell.
features = df.features

AttributeError: 'DataFrame' object has no attribute 'features'

In [16]:
# NOTE(review): the comma makes this a 2-tuple of (DataFrame, (220, 1)), not a
# reshaped array -- confirm whether a plain `features = df` was intended.
features = (df, (220, 1))

In [17]:
# Display `features`; it is a tuple whose first element is the whole
# DataFrame, so the full table is printed.
features


(    Unnamed: 0          #ID                                name     reference  \
 0            1       MNXM01                                 PMF       mnx:PMF   
 1            4        MNXM1                                H(+)    mnx:PROTON   
 2            5       MNXM10                                NADH   chebi:57945   
 3        99546  MNXM1092518                       cis-aconitate   chebi:16383   
 4       109565  MNXM1104266                          acetyl-CoA   chebi:57288   
 5       109687  MNXM1104491     6-phospho-D-glucono-1,5-lactone   chebi:57955   
 6       109839  MNXM1104774                        succinyl-CoA   chebi:57292   
 7       110046  MNXM1105029                           D-glucose    chebi:4167   
 8       112445  MNXM1107192                          (S)-malate   chebi:15589   
 9       113068  MNXM1107753                             citrate   chebi:16947   
 10      113423  MNXM1108073  (2R)-3-phospho-glyceroyl phosphate   chebi:57604   
 11      113444 

In [18]:
# Preview the last rows of the chemical descriptor table.
df.tail()

Unnamed: 0.1,Unnamed: 0,#ID,name,reference,formula,charge,mass,InChI,InChIKey,SMILES,...,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,BiGG,Species_ID
50,826038,MNXM77,dihydroxyacetone phosphate,chebi:57642,C3H5O6P,-2.0,167.98347,"InChI=1S/C3H7O6P/c4-1-3(5)2-9-10(6,7)8/h4H,1-2...",InChIKey=GNGACRATGGDKBX-UHFFFAOYSA-L,O=C(CO)COP(=O)([O-])[O-],...,0,0,0,0,0,0,0,0,dhap,M_dhap_c
51,856684,MNXM8,NAD(+),chebi:57540,C21H26N7O14P2,-1.0,662.10185,InChI=1S/C21H27N7O14P2/c22-17-12-19(25-7-24-17...,InChIKey=BAWFJGJZGIEFAR-NNYOXOHSSA-M,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,...,0,0,0,0,0,0,0,0,f6p,M_f6p_c
52,961123,MNXM89661,isocitrate(3-),chebi:16087,C6H5O7,-3.0,189.00517,InChI=1S/C6H8O7/c7-3(8)1-2(5(10)11)4(9)6(12)13...,InChIKey=ODBLHEXUDAPZAU-UHFFFAOYSA-K,O=C([O-])CC(C(=O)[O-])C(O)C(=O)[O-],...,0,0,0,0,0,0,0,0,icit,M_icit_c
53,964543,MNXM9,phosphate,chebi:43474,HO4P,-2.0,95.96234,"InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4)/p-2",InChIKey=NBIIXXVUZAFLBC-UHFFFAOYSA-L,O=P([O-])([O-])O,...,0,0,0,0,0,0,0,0,pi,M_pi_c
54,1066989,WATER,H2O,mnx:WATER,H2O,0.0,18.01056,InChI=1S/H2O/h1H2,InChIKey=XLYOFNOQVPJJNP-UHFFFAOYSA-N,[H]O[H],...,0,0,0,0,0,0,0,0,h20,M_h2o_c


# Replace labels based on string or numerical identity

In [21]:
# Replace every exact "LOL" entry in the `name` column with "CO2".
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `df["name"]` is chained assignment, which pandas deprecates and which
# leaves `df` unchanged when Copy-on-Write is enabled.
df["name"] = df["name"].replace("LOL", "CO2")

In [22]:
# Per-sample class codes of the breast cancer dataset (later cells treat them
# as 0/1 values).
breast_labels = breast.target

In [23]:
# Turn the label vector into a single column so it can be joined to the
# feature matrix. -1 lets NumPy infer the row count instead of hard-coding the
# 569 samples of this particular dataset.
labels = np.reshape(breast_labels, (-1, 1))

In [24]:
# Join the label column to the right-hand side of the feature matrix.
final_breast_data = np.concatenate((breast_data, labels), axis=1)

In [25]:
# Show (rows, columns) after appending the label column.
final_breast_data.shape

(569, 31)

In [26]:
# Wrap the combined features-plus-label array in a DataFrame; columns get
# default integer names until they are renamed in a later cell.
breast_dataset = pd.DataFrame(data=final_breast_data)


In [27]:
# Names of the feature columns, as provided by the dataset object.
breast_features = breast.feature_names

In [28]:
# Display the array of feature names.
breast_features

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [29]:
# Column names for the combined table: every feature name followed by "label".
features_labels = np.append(arr=breast_features, values="label")

In [30]:
# Rename the DataFrame's columns in place to the feature names plus "label".
breast_dataset.columns = features_labels

In [31]:
# Preview the first rows of the labelled breast cancer table.
breast_dataset.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [32]:
# Convert the numeric class codes to readable names in one step.
# scikit-learn's breast cancer dataset encodes 0 = malignant and 1 = benign
# (see `breast.target_names`), so the codes are mapped accordingly.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on `breast_dataset["label"]`, which is chained
# assignment and does not update the frame under pandas Copy-on-Write.
breast_dataset["label"] = breast_dataset["label"].replace(
    {0: "Malignant", 1: "Benign"}
)

In [34]:
# List the column names of the chemical descriptor table.
df.columns

Index(['Unnamed: 0', '#ID', 'name', 'reference', 'formula', 'charge', 'mass',
       'InChI', 'InChIKey', 'SMILES',
       ...
       'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole',
       'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'BiGG',
       'Species_ID'],
      dtype='object', length=220)

In [42]:
# Every column of the descriptor table as a NumPy array.
x = df.loc[:, df.columns].to_numpy()

# Second CSV; only its column names are used here, to pick columns out of df.
df2 = pd.read_csv("chemical_properties.csv")

# Values of the df columns named in df2's header. `.loc` raises KeyError if
# any of those names is missing from df.
xx = df.loc[:, df2.columns].to_numpy()

In [43]:
# Standardise each column of xx with a freshly fitted StandardScaler and
# rebind xx to the scaled array.
# NOTE(review): this needs every selected column to be numeric -- confirm
# against the columns listed in chemical_properties.csv.
xx = StandardScaler().fit(xx).transform(xx)