Goal: Generate 2D descriptors of the molecules in the smiles strings

In [98]:
# Read in 1000 molecule csv
import pandas as pd
import numpy as np

In [23]:
# Check to make sure everything imported correctly
sample_mols_df = pd.read_csv('rdf_sampled_dirads.csv')
sample_mols_df.head(5)

Unnamed: 0.1,Unnamed: 0,rid,molecule,bond_index,fragment1,fragment2,bde,bdfe,bdscfe,cart,conj
0,319292,,CC(C)N1CCOC1=O,3,[CH2]COC(=O)[N]C(C)C,,85.25902,80.915403,89.557518,4.593091,9999
1,267257,,CC(C)[C@@H](C)SC1COC1,7,[CH2]C(C[O])S[C@H](C)C(C)C,,58.543452,57.581481,62.370026,3.701196,9999
2,253493,,CCC1=CNCCC1,7,[CH2]CCNC=[C]CC,,97.037992,91.984662,100.944869,5.10723,9999
3,192932,,c1nnc2n1CCCCC2,5,[CH2]CCCc1nncn1[CH2],,81.884277,76.576806,86.934183,5.37044,9999
4,323812,,O=C1CC[C@H](CO)O1,2,[CH2]C(=O)O[C@H]([CH2])CO,,79.525471,76.566766,83.731931,4.610139,9999


In [28]:
# Pick 10 strings from the 1000 randomly sampled
subset = sample_mols_df.sample(n=10)
subset

Unnamed: 0.1,Unnamed: 0,rid,molecule,bond_index,fragment1,fragment2,bde,bdfe,bdscfe,cart,conj
2135,451102,,CSCCC1OCCO1,8,CSCC[CH]OCC[O],,82.610305,79.514803,86.575422,4.734607,9999
6489,209533,,CCC(=O)C1(O)CCCC1,8,[CH2]CCC([CH2])(O)C(=O)CC,,80.488069,76.234186,85.476282,5.002763,9999
4688,311598,,C1=CC12OCCO2,3,[CH2]COC1([O])C=C1,,67.604055,63.514578,72.195599,4.101081,9999
515,1502984,,NC1=NC[C@H](C(=O)O)N1,7,[NH]C(N)=NC[CH]C(=O)O,,67.860078,65.692035,71.459339,4.325361,9999
4629,192445,,O=c1c2c(c1=O)CC=CC2,5,[CH2]C=CCc1[c]c(=O)c1=O,,91.872965,88.053945,95.023589,5.54424,9999
4567,301053,,CCN1CCCNC1=O,4,[CH2]CN(CC)C(=O)N[CH2],,80.37198,76.435616,85.134358,4.353713,9999
5566,266177,,C=CS(=O)(=O)NC1COC1,7,[CH2]C(C[O])NS(=O)(=O)C=C,,58.913055,56.574329,62.861593,3.000856,9999
6741,357147,,C/C=C/CN1CCOCC1,8,[CH2]OCCN([CH2])C/C=C/C,,79.390556,74.471513,82.904691,5.211926,9999
5540,251121,,CCN1CC[C@@H](O)C1,2,[CH2]C[C@@H](O)C[N]CC,,71.628897,67.071927,75.985841,4.920386,9999
6222,393180,,CCO/C=C1/C=CCCC1,4,[CH]=CCCC[C]=COCC,,110.724591,107.143397,114.063749,5.285669,9999


In [54]:
# Save the 10 sampled to a csv so that they are not randomly sampled everytime the code runs
subset.to_csv('sampled_mols.csv')

In [55]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [56]:
# Make sure the sampled mols are looking right
sampled_mols_df = pd.read_csv('sampled_mols.csv')
sampled_mols_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,rid,molecule,bond_index,fragment1,fragment2,bde,bdfe,bdscfe,cart,conj
0,2135,451102,,CSCCC1OCCO1,8,CSCC[CH]OCC[O],,82.610305,79.514803,86.575422,4.734607,9999
1,6489,209533,,CCC(=O)C1(O)CCCC1,8,[CH2]CCC([CH2])(O)C(=O)CC,,80.488069,76.234186,85.476282,5.002763,9999
2,4688,311598,,C1=CC12OCCO2,3,[CH2]COC1([O])C=C1,,67.604055,63.514578,72.195599,4.101081,9999
3,515,1502984,,NC1=NC[C@H](C(=O)O)N1,7,[NH]C(N)=NC[CH]C(=O)O,,67.860078,65.692035,71.459339,4.325361,9999
4,4629,192445,,O=c1c2c(c1=O)CC=CC2,5,[CH2]C=CCc1[c]c(=O)c1=O,,91.872965,88.053945,95.023589,5.54424,9999
5,4567,301053,,CCN1CCCNC1=O,4,[CH2]CN(CC)C(=O)N[CH2],,80.37198,76.435616,85.134358,4.353713,9999
6,5566,266177,,C=CS(=O)(=O)NC1COC1,7,[CH2]C(C[O])NS(=O)(=O)C=C,,58.913055,56.574329,62.861593,3.000856,9999
7,6741,357147,,C/C=C/CN1CCOCC1,8,[CH2]OCCN([CH2])C/C=C/C,,79.390556,74.471513,82.904691,5.211926,9999
8,5540,251121,,CCN1CC[C@@H](O)C1,2,[CH2]C[C@@H](O)C[N]CC,,71.628897,67.071927,75.985841,4.920386,9999
9,6222,393180,,CCO/C=C1/C=CCCC1,4,[CH]=CCCC[C]=COCC,,110.724591,107.143397,114.063749,5.285669,9999


In [57]:
# Generating the 2D descriptors for the
class RDKit_2D:
    def __init__(self, smiles):
        self.mols = [Chem.MolFromSmiles(i) for i in smiles]
        self.smiles = smiles
        
    def compute_2Drdkit(self, name):
        rdkit_2D_desc = []
        calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
        header = calc.GetDescriptorNames()
        for i in range(len(self.mols)):
            ds = calc.CalcDescriptors(self.mols[i])
            rdkit_2D_desc.append(ds)
        df = pd.DataFrame(rdkit_2D_desc, columns = header)
        df.insert(loc = 0, column = 'smiles', value = self.smiles)
        df.to_csv(name[:-4]+'_RDKit_2D.csv', index = False)

In [58]:
# List of all the 2D descriptors created by RDKit
descriptors_list = [x[0] for x in Descriptors._descList]
print(descriptors_list)

['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'Slo

In [59]:
# Call for the entire list of 2D descriptors
calc = MoleculeDescriptors.MolecularDescriptorCalculator([x [0] for x in Descriptors._descList])
calc

<rdkit.ML.Descriptors.MoleculeDescriptors.MolecularDescriptorCalculator at 0x132f48340>

In [60]:
# Create header for the output of the molecular descriptors
header = calc.GetDescriptorNames()
header

('MaxEStateIndex',
 'MinEStateIndex',
 'MaxAbsEStateIndex',
 'MinAbsEStateIndex',
 'qed',
 'MolWt',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'NumValenceElectrons',
 'NumRadicalElectrons',
 'MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'FpDensityMorgan1',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW',
 'BalabanJ',
 'BertzCT',
 'Chi0',
 'Chi0n',
 'Chi0v',
 'Chi1',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Chi2v',
 'Chi3n',
 'Chi3v',
 'Chi4n',
 'Chi4v',
 'HallKierAlpha',
 'Ipc',
 'Kappa1',
 'Kappa2',
 'Kappa3',
 'LabuteASA',
 'PEOE_VSA1',
 'PEOE_VSA10',
 'PEOE_VSA11',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA14',
 'PEOE_VSA2',
 'PEOE_VSA3',
 'PEOE_VSA4',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'PEOE_VSA9',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA2',
 'SMR_VSA3',
 'SMR_VSA4',
 'SMR_VSA5',
 'SMR_VSA6',
 'SMR_VSA7',
 'SMR_

In [61]:
# Compute the descriptors for one SMILES string to ensure code above is working correctly
mol = Chem.MolFromSmiles('CC[C@]1(C(=O)O)CCO1')
ds = calc.CalcDescriptors(mol)
print(ds)

(10.409583333333334, -0.8229166666666667, 10.409583333333334, 0.5717592592592592, 0.5962602563502525, 130.14299999999997, 120.06299999999997, 130.06299418, 52, 0, 0.33554596669446907, -0.4791056629297487, 0.4791056629297487, 0.33554596669446907, 1.8888888888888888, 2.4444444444444446, 2.4444444444444446, 16.547737291094627, 9.921513033095788, 2.436219689382986, -2.3618651124224446, 2.2906249665455847, -2.4956855600690933, 5.7775878535582175, -0.1942021352442073, 2.56829198106863, 120.75488750216347, 6.905777393935817, 5.3850305199873265, 5.3850305199873265, 4.211142625940433, 3.0847437851817503, 3.0847437851817503, 2.251458265335147, 2.251458265335147, 1.8451317680504684, 1.8451317680504684, 0.9471965851081656, 0.9471965851081656, -0.5700000000000001, 73.06242953469413, 6.548623962040332, 1.9882386060247925, 0.8115463822431909, 53.62743152803001, 9.843390348640755, 0.0, 5.601050810983688, 0.0, 0.0, 5.969305287951849, 0.0, 4.794537184071822, 0.0, 0.0, 6.923737199690624, 6.42082162292600

In [62]:
# Now iterate for all the mols and organize them into pandas dataframe
def compute_2Drdkit(self, name):
    rdkit_2d_desc = []
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    header = calc.GetDescriptorNames()
    for i in range(len(self.mols)):
        ds = calc.CalcDescriptors(self.mols[i])
        rdkit_2d_desc.append(ds)
    df = pd.DataFrame(rdkit_2d_desc, columns=header)
    df.insert(loc = 0, column = 'smiles', value = self.smiles)
    df.to_csv(name[:-4]+'_RDKit_2D.csv', index = False)

In [63]:
from molvs import standardize_smiles

In [66]:
# Now import and use the class to compute the RDKit 2D descriptors
# They will save as their own csv to be uploaded
def main():
    filename = 'sampled_mols.csv'
    df = pd.read_csv('sampled_mols.csv')
    smiles = [standardize_smiles(i) for i in df['molecule'].values]
    
    # Compute RDKit_2D Fingerprints and export a csv file
    RDKit_descriptor = RDKit_2D(smiles)
    RDKit_descriptor.compute_2Drdkit('sampled_mols.csv')
    
if __name__ =='__main__':
    main()

### Now we want to merge the two dataframes with the smiles and the descriptors (use inner join)

In [88]:
dirads = pd.read_csv('rdf_sampled_dirads_df.csv') 
parents = pd.read_csv('rdf_parent_desc.csv')     

In [89]:
descriptors = [dirads, parents]
result = pd.concat(descriptors)

In [90]:
dirads

Unnamed: 0.1,Unnamed: 0,rid,molecule,bond_index,fragment1,fragment2,bde,bdfe,bdscfe,cart,conj
0,319292,,CC(C)N1CCOC1=O,3,[CH2]COC(=O)[N]C(C)C,,85.259020,80.915403,89.557518,4.593091,9999
1,267257,,CC(C)[C@@H](C)SC1COC1,7,[CH2]C(C[O])S[C@H](C)C(C)C,,58.543452,57.581481,62.370026,3.701196,9999
2,253493,,CCC1=CNCCC1,7,[CH2]CCNC=[C]CC,,97.037992,91.984662,100.944869,5.107230,9999
3,192932,,c1nnc2n1CCCCC2,5,[CH2]CCCc1nncn1[CH2],,81.884277,76.576806,86.934183,5.370440,9999
4,323812,,O=C1CC[C@H](CO)O1,2,[CH2]C(=O)O[C@H]([CH2])CO,,79.525471,76.566766,83.731931,4.610139,9999
...,...,...,...,...,...,...,...,...,...,...,...
9995,296557,,C[C@@H]1CN2CC[C@@H]1O2,5,[CH2]CN1C[C@@H](C)[CH]O1,,76.236696,72.335472,80.380980,4.179693,9999
9996,276237,,CSC[C@H]1CCC[C@@H](C)C1,8,[CH2][C@H](CCC[CH]C)CSC,,86.507136,80.937993,91.039056,6.338257,9999
9997,317679,,N[C@@H]1CCO[C@@H]1O,1,[CH2]CO[C@H](O)[CH]N,,75.934864,72.090116,79.830818,4.676913,9999
9998,360951,,C[C@@H]1CCC(=O)OC1,7,[CH2]OC(=O)CC[CH]C,,77.233180,72.222521,81.584260,5.156724,9999


In [91]:
parents

Unnamed: 0,smiles,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC(C)N1CCOC1=O,10.719444,-0.175926,10.719444,0.175926,0.524873,129.159,118.071,129.078979,52,...,0,0,0,0,0,0,0,0,0,0
1,CC(C)[C@@H](C)SC1COC1,5.094650,0.786019,5.094650,0.786019,0.625460,160.282,144.154,160.092186,60,...,1,0,0,0,0,0,0,0,0,0
2,CCC1=CNCCC1,3.223704,1.170139,3.223704,1.170139,0.542358,111.188,98.084,111.104799,46,...,0,0,0,0,0,0,0,0,0,0
3,c1nnc2n1CCCCC2,4.029722,1.111111,4.029722,1.111111,0.533848,137.186,126.098,137.095297,54,...,0,0,0,0,0,0,0,0,0,0
4,O=C1CC[C@H](CO)O1,10.279028,-0.219907,10.279028,0.035972,0.477612,116.116,108.052,116.047344,46,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,C[C@@H]1CN2CC[C@@H]1O2,5.417824,0.564815,5.417824,0.564815,0.456866,113.160,102.072,113.084064,46,...,0,0,0,0,0,0,0,0,0,0
9996,CSC[C@H]1CCC[C@@H](C)C1,2.395324,1.005463,2.395324,1.005463,0.594564,158.310,140.166,158.112922,60,...,1,0,0,0,0,0,0,0,0,0
9997,N[C@@H]1CCO[C@@H]1O,8.650463,-0.708333,8.650463,0.152778,0.415441,103.121,94.049,103.063329,42,...,0,0,0,0,0,0,0,0,0,0
9998,C[C@@H]1CCC(=O)OC1,10.386389,-0.038796,10.386389,0.038796,0.438294,114.144,104.064,114.068080,46,...,0,0,0,0,0,0,0,0,0,0


In [92]:
parents.rename(columns={'smiles':'molecule'}, inplace=True)

In [160]:
parents = parents.merge(dirads, on='molecule', how='left')

In [161]:
parents.to_csv('rdf_sampled_with_desc.csv', index=False)

### Now it's time to make a random forest 

In [192]:
rdf_desc = pd.read_csv('rdf_sampled_with_desc.csv')
rdf_desc

  rdf_desc = pd.read_csv('rdf_sampled_with_desc.csv')


Unnamed: 0.1,molecule,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,Unnamed: 0,rid,bond_index,fragment1,fragment2,bde,bdfe,bdscfe,cart,conj
0,CC(C)N1CCOC1=O,10.719444,-0.175926,10.719444,0.175926,0.524873,129.159,118.071,129.078979,52,...,319292,,3,[CH2]COC(=O)[N]C(C)C,,85.259020,80.915403,89.557518,4.593091,9999
1,CC(C)[C@@H](C)SC1COC1,5.094650,0.786019,5.094650,0.786019,0.625460,160.282,144.154,160.092186,60,...,267257,,7,[CH2]C(C[O])S[C@H](C)C(C)C,,58.543452,57.581481,62.370026,3.701196,9999
2,CCC1=CNCCC1,3.223704,1.170139,3.223704,1.170139,0.542358,111.188,98.084,111.104799,46,...,253493,,7,[CH2]CCNC=[C]CC,,97.037992,91.984662,100.944869,5.107230,9999
3,CCC1=CNCCC1,3.223704,1.170139,3.223704,1.170139,0.542358,111.188,98.084,111.104799,46,...,213889,,4,[CH2]CCC(=C[NH])CC,,66.383550,61.883683,70.311311,4.428669,9999
4,CCC1=CNCCC1,3.223704,1.170139,3.223704,1.170139,0.542358,111.188,98.084,111.104799,46,...,393113,,3,[CH]=C(CC)CCC[NH],,96.741808,92.954163,100.804745,5.073219,9999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14419,CSC[C@H]1CCC[C@@H](C)C1,2.395324,1.005463,2.395324,1.005463,0.594564,158.310,140.166,158.112922,60,...,276237,,8,[CH2][C@H](CCC[CH]C)CSC,,86.507136,80.937993,91.039056,6.338257,9999
14420,N[C@@H]1CCO[C@@H]1O,8.650463,-0.708333,8.650463,0.152778,0.415441,103.121,94.049,103.063329,42,...,317679,,1,[CH2]CO[C@H](O)[CH]N,,75.934864,72.090116,79.830818,4.676913,9999
14421,C[C@@H]1CCC(=O)OC1,10.386389,-0.038796,10.386389,0.038796,0.438294,114.144,104.064,114.068080,46,...,276752,,6,[CH2][C@H](C)CCC([O])=O,,85.413388,80.867085,89.604846,6.055404,9999
14422,C[C@@H]1CCC(=O)OC1,10.386389,-0.038796,10.386389,0.038796,0.438294,114.144,104.064,114.068080,46,...,360951,,7,[CH2]OC(=O)CC[CH]C,,77.233180,72.222521,81.584260,5.156724,9999


In [193]:
rdf_desc.drop(columns=['molecule', 'fragment1', 'fragment2', 'rid'], axis = 1, inplace=False)
rdf_desc

Unnamed: 0.1,molecule,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,Unnamed: 0,rid,bond_index,fragment1,fragment2,bde,bdfe,bdscfe,cart,conj
0,CC(C)N1CCOC1=O,10.719444,-0.175926,10.719444,0.175926,0.524873,129.159,118.071,129.078979,52,...,319292,,3,[CH2]COC(=O)[N]C(C)C,,85.259020,80.915403,89.557518,4.593091,9999
1,CC(C)[C@@H](C)SC1COC1,5.094650,0.786019,5.094650,0.786019,0.625460,160.282,144.154,160.092186,60,...,267257,,7,[CH2]C(C[O])S[C@H](C)C(C)C,,58.543452,57.581481,62.370026,3.701196,9999
2,CCC1=CNCCC1,3.223704,1.170139,3.223704,1.170139,0.542358,111.188,98.084,111.104799,46,...,253493,,7,[CH2]CCNC=[C]CC,,97.037992,91.984662,100.944869,5.107230,9999
3,CCC1=CNCCC1,3.223704,1.170139,3.223704,1.170139,0.542358,111.188,98.084,111.104799,46,...,213889,,4,[CH2]CCC(=C[NH])CC,,66.383550,61.883683,70.311311,4.428669,9999
4,CCC1=CNCCC1,3.223704,1.170139,3.223704,1.170139,0.542358,111.188,98.084,111.104799,46,...,393113,,3,[CH]=C(CC)CCC[NH],,96.741808,92.954163,100.804745,5.073219,9999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14419,CSC[C@H]1CCC[C@@H](C)C1,2.395324,1.005463,2.395324,1.005463,0.594564,158.310,140.166,158.112922,60,...,276237,,8,[CH2][C@H](CCC[CH]C)CSC,,86.507136,80.937993,91.039056,6.338257,9999
14420,N[C@@H]1CCO[C@@H]1O,8.650463,-0.708333,8.650463,0.152778,0.415441,103.121,94.049,103.063329,42,...,317679,,1,[CH2]CO[C@H](O)[CH]N,,75.934864,72.090116,79.830818,4.676913,9999
14421,C[C@@H]1CCC(=O)OC1,10.386389,-0.038796,10.386389,0.038796,0.438294,114.144,104.064,114.068080,46,...,276752,,6,[CH2][C@H](C)CCC([O])=O,,85.413388,80.867085,89.604846,6.055404,9999
14422,C[C@@H]1CCC(=O)OC1,10.386389,-0.038796,10.386389,0.038796,0.438294,114.144,104.064,114.068080,46,...,360951,,7,[CH2]OC(=O)CC[CH]C,,77.233180,72.222521,81.584260,5.156724,9999


In [204]:
# W
rdf_desc.set_index(keys, inplace = True)

NameError: name 'keys' is not defined

In [194]:
# Labels are the values we want to predict
labels = np.array(dirads['bde'])

# Remove the labels from the features
# axis 1 refers to the columns
features = dirads.drop('bde', axis = 1)

# Saving the features name for later
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

In [195]:
# Use Skicit-learn to split the data into training and test data
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [196]:
# Look at the shape of the data to make sure everything split correctly
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (7500, 10)
Training Labels Shape: (7500,)
Testing Features Shape: (2500, 10)
Testing Labels Shape: (2500,)


In [207]:
# Baseline predictions
baseline_preds = test_features[:, feature_list.index('molecule')] # Need to figure out what to call the index
                                                                  # so that it can be used in the math below

# Baseline errors
baseline_errors = abs(baseline_preds - test_labels)     # Error :( do I need to float the variables?

print('Average baseline error: ', round(np.mean(baseline_errors), 2))

AttributeError: 'numpy.ndarray' object has no attribute 'set_index'

In [174]:
# Import the model being used
from sklearn.ensemble import RandomForestRegressor

# Instantiate the model with n decision trees
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)

# Train the model from the training data
rf.fit(train_features, train_labels);

ValueError: could not convert string to float: 'CC(=O)/C=C(/C)C1CC1'