# read in a file with data on 193 hydrocarbons

In [None]:
# Adapted from the ESCIP + MolSSI Python workshop

In [None]:
import pandas as pd

# Load the hydrocarbon table from the CSV file that ships in the data/ folder.
df = pd.read_csv(filepath_or_buffer="data/hydrocarbons.csv")

# Leave the frame as the cell's last expression so the notebook renders it.
df

In [None]:
from rdkit.Chem import Descriptors, PandasTools

# Build an RDKit molecule for every row from its "smiles" string and store it
# in a new "Molecule" column.
PandasTools.AddMoleculeColumnToFrame(df, "smiles", "Molecule", includeFingerprints=True)

# Remove rows that have no molecule *before* computing descriptors: the
# descriptor functions below cannot be applied to a missing value, so a single
# bad row would abort the cell before the final clean-up is ever reached.
# NOTE(review): unparseable SMILES are expected to yield a missing molecule -
# confirm against the RDKit PandasTools documentation.
df.dropna(axis=0, subset=["Molecule"], inplace=True)

# Derive one numeric descriptor column per molecular property of interest.
df["NumAromaticRings"] = df["Molecule"].apply(Descriptors.NumAromaticRings)
df["NumValenceElectrons"] = df["Molecule"].apply(Descriptors.NumValenceElectrons)
df["MolWt"] = df["Molecule"].apply(Descriptors.MolWt)

# Discard every row that still has a missing value in any column.
df.dropna(axis=0, how="any", inplace=True)

# let's see if any of the properties of a molecule are correlated

In [None]:
import seaborn as sns

# Correlate the numeric columns only. The frame also holds the "smiles" text
# column and the "Molecule" object column, and recent pandas releases raise
# instead of silently skipping non-numeric columns in DataFrame.corr().
numeric_correlations = df.select_dtypes(include="number").corr()

# Annotated heatmap of the pairwise correlation coefficients.
sns.heatmap(numeric_correlations, cmap="rocket_r", annot=True)

# plot two properties vs each other to see if they look correlated

In [None]:
# Plot boiling point against valence-electron count to eyeball the relationship.
sns.lmplot(data=df, x="NumValenceElectrons", y="Boiling point")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed the forest so that re-running the notebook reproduces the same model;
# 42 matches the random_state used for the train/test split further down.
randomforest = RandomForestRegressor(random_state=42)

# train the model to predict one property based on another

In [None]:
# Choose any combination of columns you think are correlated:
# NumValenceElectrons, NumAromaticRings, Boiling point, Melting point, MolWt
X = df[["NumValenceElectrons"]]  # double brackets: scikit-learn wants a 2-D feature table
Y = df["Boiling point"]  # single brackets: a 1-D target avoids the column-vector warning

# Hold back a third of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Fit the random forest on the training rows only.
randomforest.fit(X_train, Y_train)

# Test the model: does your X value predict the Y value?

In [None]:
# Store the model's prediction for every row (kept on df for later inspection).
df["predicted"] = randomforest.predict(X)

# Judge the model only on rows it never saw during fitting: including the
# training rows would make the predictions look better than they really are.
held_out = df.loc[X_test.index]

# Predicted vs. measured boiling point for the held-out rows.
sns.lmplot(data=held_out, x="predicted", y="Boiling point")