In [1]:
# importing modules and packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing

In [2]:
# Load the dataset from its CSV file into a DataFrame.
import pandas as pd

dataset_csv = 'reduced_dim_space_ddG.csv'
df = pd.read_csv(dataset_csv)

In [3]:
# Many rows have a 'ddG (% ee)' value of exactly 0 (no enantiomeric excess).
# Build a boolean mask for the non-zero rows and keep only those.
has_nonzero_ee = df['ddG (% ee)'] != 0
df_cleaned = df[has_nonzero_ee]
df_cleaned

Unnamed: 0,Catalyst,x,y,z,ddG (% ee)
0,1_1_1,5.100125,-27.742489,-17.922393,1.226289
1,1_1_2,5.153813,-27.571266,-17.950507,0.719002
10,1_11_1,24.913739,-0.494436,-8.842801,0.631877
16,1_2_1,12.039033,-10.467777,-6.094131,0.216792
17,1_2_2,11.859191,-10.604692,-6.049362,0.111677
...,...,...,...,...,...
1810,9_3_4,36.077375,-7.875347,-14.939623,0.128432
1813,9_4_1,25.574518,-17.376362,4.663637,0.912247
1831,9_7_1,22.777582,-19.792283,2.750864,0.056902
1834,9_7_4,22.721846,-19.736566,2.713267,0.408758


In [4]:
# Separate the cleaned frame into the feature matrix (columns x, y, z)
# and the regression target (the 'ddG (% ee)' column).
feature_columns = ['x', 'y', 'z']
X = df_cleaned[feature_columns]
y = df_cleaned['ddG (% ee)']

In [5]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101, test_size=0.3)

In [6]:
# Baseline model: scikit-learn LinearRegression fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split.
predictions = model.predict(X_test)

# Evaluate: compute both error metrics first, then report them.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
print('mean_squared_error : ', mse)
print('mean_absolute_error : ', mae)

mean_squared_error :  0.10686574075223941
mean_absolute_error :  0.275571383948475


In [7]:
# Report the fitted parameters of the current model: intercept first,
# then one coefficient per feature column.
fitted_params = (
    ('Intercept: \n', model.intercept_),
    ('Coefficients: \n', model.coef_),
)
for label, value in fitted_params:
    print(label, value)

Intercept: 
 0.4323468984662589
Coefficients: 
 [-0.00122995 -0.00340784 -0.00199706]


In [8]:
# Predict the target for a single hand-picked point in (x, y, z) space.
# `sample_predict` stays a plain 1-D array so it can be reused in later cells.
sample_predict = np.array([-48.68710019,27.57547822,53.74597739])

# The model was fitted on a DataFrame, so it remembers the column names.
# Passing a bare ndarray makes scikit-learn (>= 1.0) emit
# "X does not have valid feature names"; wrapping the sample in a one-row
# DataFrame with the training columns avoids the warning and guarantees the
# values are matched to the right features. The predicted value is unchanged.
sample_frame = pd.DataFrame(sample_predict.reshape(1, -1), columns=X_train.columns)
model.predict(sample_frame)



array([0.29092283])

In [9]:
# Manual cross-check of the linear model: recompute the prediction for
# `sample_predict` by hand as dot(sample, coef) + intercept, using the
# parameter values printed by the earlier cell.
coef =  [-0.00122995, -0.00340784, -0.00199706]
intercept = 0.4323468984662589

# Use the hard-coded `intercept` rather than `model.intercept_`: the check
# must not depend on whatever estimator `model` currently refers to (it is
# rebound to a different model later in the notebook), and previously the
# local `intercept` was assigned but never used.
np.dot(sample_predict, coef) + intercept

0.2909228380412312

In [10]:
# Second model: partial least squares regression with two components,
# trained and scored on the same train/test split as the linear model.
from sklearn.cross_decomposition import PLSRegression

# NOTE: this rebinds `model`, replacing the LinearRegression fitted earlier.
# fit() returns the estimator itself, so construction and fitting are chained.
model = PLSRegression(n_components=2).fit(X_train, y_train)

# Predict on the held-out test split.
predictions = model.predict(X_test)

# Evaluate: compute both error metrics first, then report them.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
print('mean_squared_error : ', mse)
print('mean_absolute_error : ', mae)

mean_squared_error :  0.1063480698362802
mean_absolute_error :  0.27497913508936606


In [11]:
# Persist the current model with joblib, load it back, and run a one-row
# prediction with the reloaded copy as a smoke test of the round trip.
from joblib import dump, load

model_file = 'pls.joblib'
dump(model, model_file)
clf2 = load(model_file)

# First row of the feature matrix, selected explicitly by position.
clf2.predict(X.iloc[:1])

array([[0.55539496]])