# ML full codes
**Congratulations! You're almost at the finish line.**
A typical machine learning task in chemistry consists of three steps:
- Import and explore data
- Split data, select inputs and labels, train and evaluate
- Plot and Analyze data 

## Import and Explore data

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from matplotlib import pyplot as plt 

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error as rmse
from sklearn.metrics import mean_absolute_error as mae
from scipy.stats import pearsonr


### Read sahu datasets

In [2]:
# Load the original Sahu dataset.
# on_bad_lines='skip' silently drops rows pandas cannot parse, so the row count
# seen here can be smaller than the number of lines in the CSV.
sahu = pd.read_csv(
    "Sahu_original_dataset.csv",
    encoding="cp1252",
    on_bad_lines="skip",
)
# Quick checks: sahu.head(), len(sahu)

**First, we get the additional descriptors we need from mordred.web, using the sahu.smi file as input. The selected descriptor groups are:**

[hydrogen bond, Lipinski, polarizability (apol and bpol), van der Waals, molecular weight (atomic molecular wt.)]

In [3]:
# Load the new descriptor table and discard the columns we do not need.
new_file = pd.read_csv(
    "New_descriptor.csv",
    encoding="cp1252",
    on_bad_lines="skip",
).drop(columns=["name", "AMW", "GhoseFilter"])

# Add a 0-based running number 'sn' as the first column; it is the key used
# later to merge this table with the Sahu data.
new_file.insert(loc=0, column="sn", value=np.arange(len(new_file)))

# Quick checks: new_file.head(), len(new_file)
# (the original notebook reports 261 molecules here vs. 281 in sahu)

**The Sahu dataset has 281 molecules. However, mordred.web does not read all 281 molecules from sahu.smi; it only reads 261 of them. Therefore, we locate the molecules that were not read and delete them from the original Sahu dataset.**


In [4]:
# Row labels in `sahu` of the molecules that mordred.web did not read.
# NOTE(review): this list has 19 entries; confirm that
# len(sahu_new_descriptor) == len(new_file), because the later merge on 'sn'
# pairs the two tables purely by row position.
row_drop = [6,17,18,22,28,40,58,66,116,156,172,174,184,269,270,271,273,277,278]

# Remove those rows and the original serial-number column in a single chain.
sahu_new_descriptor = sahu.drop(index=row_drop).drop(columns=['#Sno.'])

# Add a fresh 0-based 'sn' column at the front so both tables share a merge key.
sahu_new_descriptor.insert(loc=0, column='sn', value=np.arange(len(sahu_new_descriptor)))

# Quick checks: len(sahu_new_descriptor), sahu_new_descriptor.head()

**At this point, we have downloaded the new descriptors and made a table that lines up row-for-row with the original Sahu dataset. We can now merge the two tables, using the running index as the common key.**

In [19]:
# Merge the two dataframes side by side on the shared 'sn' key.
# (With no `how` argument, merge performs an inner join: only 'sn' values
# present in both tables are kept.)
new_merged_df = sahu_new_descriptor.merge(new_file, on='sn')

# Quick checks: new_merged_df.columns, len(new_merged_df), new_merged_df.head()

**Data are all cleaned and ready to be processed now!**

* link to the descriptor: https://mordred-descriptor.github.io/documentation/master/descriptors.html

In [6]:
# np.percentile(sahu['PCE'], 25)

In [7]:
def mlonemodel(model, df, target):
    """Split `df`, then train and evaluate a single model on the result.

    Parameters
    ----------
    model : estimator passed straight through to `trainevaluate`.
    df : dataframe handed to `split`.
    target : name of the target column, handed to `split`.

    Returns
    -------
    (train_labels, test_labels, pred_train, pred_test, evaluation), where the
    labels come from `split` and the remaining three values from
    `trainevaluate`.
    """
    # split() returns (train_inputs, train_labels, test_inputs, test_labels),
    # which is exactly the argument order trainevaluate() expects after `model`.
    parts = split(df, target)
    train_labels, test_labels = parts[1], parts[3]

    pred_train, pred_test, evaluation = trainevaluate(model, *parts)
    return train_labels, test_labels, pred_train, pred_test, evaluation

def split(df, target, test_size=0.20, random_state=22):
    """Stratified train/test split of `df`, stratified on quartile bins of `target`.

    The target column is cut into four bins at its 25th, 50th and 75th
    percentiles, and a single StratifiedShuffleSplit is drawn on those bins so
    that the train and test sets keep the same share of each bin. A histogram
    of the bin labels is drawn as a side effect.

    Every column located to the right of `target` in `df` is used as an input.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset. It is not modified.
    target : str
        Name of the column to predict.
    test_size : float, default 0.20
        Fraction of rows placed in the test set.
    random_state : int, default 22
        Seed passed to StratifiedShuffleSplit.

    Returns
    -------
    train_inputs, train_labels, test_inputs, test_labels
    """
    quartiles = np.percentile(df[target], [25, 50, 75])
    bins = [-np.inf, *quartiles, np.inf]
    labels = [1, 2, 3, 4]

    # Keep the bin labels in their own Series instead of writing a 'cat'
    # column into `df`, so the caller's dataframe is left untouched.
    strata = pd.cut(df[target], bins=bins, labels=labels)
    strata.hist(bins=20)

    # Preparing testing set and training set using StratifiedShuffleSplit
    # --> preserve the distribution!
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_index, test_index = next(sss.split(df, strata))

    # The splitter yields row *positions*, so they must be used with .iloc.
    # .loc would treat them as index labels and fail (or pick the wrong rows)
    # whenever df's index is not 0..n-1, e.g. after rows have been dropped.
    strat_train_set = df.iloc[train_index]
    strat_test_set = df.iloc[test_index]

    # Get the location of the target column; inputs start right after it
    target_loc = df.columns.get_loc(target)
    input_loc = target_loc + 1

    # Get train_inputs, train_labels, test_inputs, test_labels
    train_inputs = strat_train_set.iloc[:, input_loc:]  # inputs
    train_labels = strat_train_set.iloc[:, target_loc]  # property that we want to predict

    test_inputs = strat_test_set.iloc[:, input_loc:]  # inputs
    test_labels = strat_test_set.iloc[:, target_loc]  # property that we want to predict
    return train_inputs, train_labels, test_inputs, test_labels

def trainevaluate(model, train_inputs, train_labels, test_inputs, test_labels):
    """Fit `model` on the training data and score it on both train and test sets.

    Parameters
    ----------
    model : object with `fit(X, y)` and `predict(X)` methods.
    train_inputs, train_labels : training features and target values.
    test_inputs, test_labels : test features and target values.

    Returns
    -------
    pred_train, pred_test : predictions for the training and test inputs.
    evaluation : list
        [pearson_train, rmse_train, mae_train, pearson_test, rmse_test, mae_test],
        each rounded to 2 decimals.
    """
    model.fit(train_inputs, train_labels)
    pred_train = model.predict(train_inputs)
    pred_test = model.predict(test_inputs)

    def _scores(labels, preds):
        """Return [pearson r, RMSE, MAE] for one set, each rounded to 2 decimals."""
        pearson = round(pearsonr(labels, preds)[0], 2)
        # `rmse` is this notebook's import alias for sklearn's
        # mean_squared_error. Take the square root explicitly instead of
        # passing squared=False, which scikit-learn deprecated in 1.4 and
        # removed in 1.6.
        root_mse = round(np.sqrt(rmse(labels, preds)), 2)
        mean_abs = round(mae(labels, preds), 2)
        return [pearson, root_mse, mean_abs]

    # Pay attention to the order of each parameter in this evaluation list:
    # the three training scores come first, then the three test scores.
    evaluation = _scores(train_labels, pred_train) + _scores(test_labels, pred_test)

    return pred_train, pred_test, evaluation

## Bonus: predictions with multiple models at once
I'll let you figure out how this function works.

In [8]:
def mlmultimodel(model, df, target):
    """Train and evaluate several models on one shared train/test split.

    Parameters
    ----------
    model : list of estimators; each one is passed to `trainevaluate` in order.
    df : dataframe handed to `split`.
    target : name of the target column, handed to `split`.

    Returns
    -------
    (train_labels, test_labels, pred_train, pred_test, evaluation), where
    pred_train, pred_test and evaluation are lists holding one entry per
    model, in the same order as `model`.
    """
    # The data are split once so that every model sees the same rows.
    train_inputs, train_labels, test_inputs, test_labels = split(df, target)

    outcomes = [
        trainevaluate(estimator, train_inputs, train_labels, test_inputs, test_labels)
        for estimator in model
    ]

    # Each outcome is (pred_train, pred_test, evaluation); regroup per kind.
    pred_train = [outcome[0] for outcome in outcomes]
    pred_test = [outcome[1] for outcome in outcomes]
    evaluation = [outcome[2] for outcome in outcomes]
    return train_labels, test_labels, pred_train, pred_test, evaluation

## Plotting
I made a function to plot pred_train vs train_labels and pred_test vs test_labels. I briefly explain the code so that you can modify it to write other functions that plot what you want.

In [9]:
def plotonemodel(prop, pred_train, train_labels, pearson_train, pred_test, test_labels, pearson_test):
    """Draw a parity plot (ML predicted vs. experimental) for train and test sets.

    Parameters
    ----------
    prop : str
        Property name appended to both axis labels.
    pred_train, pred_test : predicted values for the training / test set.
    train_labels, test_labels : reference values for the training / test set.
    pearson_train, pearson_test : values shown in the legend entries.

    Returns
    -------
    None. The plot is drawn on a newly created figure.
    """
    _, ax = plt.subplots(figsize=(6, 4), dpi=200)

    # Reference values go on x and predictions on y, so that the points agree
    # with the 'Experimental' (x) and 'ML predicted' (y) axis labels below.
    trainlabel = 'Training set, Pearson r = ' + str(pearson_train)
    ax.scatter(train_labels, pred_train, color='red', label=trainlabel)

    testlabel = 'Testing set, Pearson r = ' + str(pearson_test)
    ax.scatter(test_labels, pred_test, color='blue', label=testlabel)

    # Finding a common minimum and maximum for both axes. floor/ceil (rather
    # than round) guarantees the limits never cut off the extreme points.
    series = (pred_train, train_labels, pred_test, test_labels)
    xymin = int(np.floor(min(np.min(values) for values in series)))
    xymax = int(np.ceil(max(np.max(values) for values in series)))
    if xymin == xymax:
        # All values are the same integer: widen the range so the axes and
        # the diagonal line are still well defined.
        xymax = xymin + 1

    # Same limits on x and y so the diagonal is the line of perfect prediction
    ax.set_xlim([xymin, xymax])
    ax.set_ylim([xymin, xymax])

    # Drawing the dashed line y=x on the plot
    ax.axline([xymin, xymin], [xymax, xymax], color='black', linestyle='--')

    # Customize your plot
    ax.set_xlabel('Experimental ' + prop)
    ax.set_ylabel('ML predicted ' + prop)
    ax.legend()

    return

# Checking if these functions work

## Checking if the mlonemodel works

In [10]:
# lr
# plotonemodel('PCE', pred_lr_train, train_labels, evaluation_lr[0], pred_lr_test, test_labels, evaluation_lr[3])


## Checking if the mlmultimodel works

In [11]:
# lr = LinearRegression()
# rf = RandomForestRegressor()
# ann = MLPRegressor()
# # dt = DecisionTreeRegressor()
# model = [lr, rf, ann]

# train_labels, test_labels, pred_train, pred_test, evaluation = mlmultimodel(model, sahu, 'PCE')

In [12]:
# columns = ['pearson_train', 'rmse_train', 'mae_train', 'pearson_test', 'rmse_test', 'mae_test']
# index = ['lr', 'rf', 'ann']

# eval_df = pd.DataFrame(evaluation, index=index, columns=columns)
# eval_df
# sahu.feature_importance_

In [13]:
# evaluation

In [14]:
# evaluation[0][0]

In [15]:
# plot for linear regresstion model
# plotonemodel('PCE', pred_train[0], train_labels, evaluation[0][0], pred_test[0], test_labels, evaluation[0][3])


In [16]:
#plot for ann model
# plotonemodel('PCE', pred_train[2], train_labels, evaluation[2][0], pred_test[2], test_labels, evaluation[2][3])


In [17]:
# rf
# plotonemodel('PCE', pred_train[1], train_labels, evaluation[1][0], pred_test[1], test_labels, evaluation[1][3])


In [18]:
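# dt (only valid if `dt` is added as a fourth entry of `model` above; with [lr, rf, ann] index 3 is out of range)
# plotonemodel('PCE', pred_train[3], train_labels, evaluation[3][0], pred_test[3], test_labels, evaluation[3][3])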
