In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics

import sys
sys.path.insert(1, '../src/stproject')
from utils import *

In [2]:
# Load the feature-engineered dataset from ../data/df_fe0.csv, using the
# first CSV column as the row index.
# NOTE(review): this step was previously labelled "v1 (df_fe1.csv)" while the
# code reads df_fe0.csv - confirm which file is the intended input.
fe_csv_path = '../data/df_fe0.csv'
df_fe0 = pd.read_csv(fe_csv_path, index_col=0)

# Derived feature: molar volume, mv = M / density (appended as the last column)
df_fe0 = df_fe0.assign(mv=df_fe0['M'] / df_fe0['density'])
print(len(df_fe0))
df_fe0.head()

268


Unnamed: 0,C,H,C=C,C#C,Ar,O-alc,O-eth,O-ald,O-ket,O-acid,...,R4,R5,R6,R7,R8,M,measured_st,molecule,density,mv
1,5.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,72.151,15.5,Pentane,0.62638,115.187267
2,6.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,86.178,18.0,Hexane,0.6594,130.691538
3,7.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,100.205,19.8,Heptane,0.6842,146.455715
4,8.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,114.232,21.1,Octane,0.7031,162.469066
5,9.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,128.259,22.4,Nonane,0.7176,178.733278


## Linear model for mv

In [3]:
# Drop every sample that contains any of the motifs below: they have too few
# datapoints and/or are not relevant to polyester.
feature_exclude = ['C#C', 'R3', 'R4', 'R7', 'R8', 'O-alc', 'O-ald', 'O-acid']

# A row is flagged when the counts of the excluded motifs do not sum to zero.
has_excluded_motif = df_fe0[feature_exclude].sum(axis=1) != 0
df_fe0 = df_fe0.loc[~has_excluded_motif]
len(df_fe0)

196

In [4]:
# Response columns used by the models below:
#   st - the 'measured_st' column, mv - the molar volume column computed earlier
st, mv = df_fe0['measured_st'], df_fe0['mv']

In [5]:
# Design matrix for the molar-volume model: every column of df_fe0 except the
# non-feature columns (M, measured_st, molecule, density, mv) and the excluded
# motif columns listed in feature_exclude.
X_mv = df_fe0.drop(['M', 'measured_st', 'molecule', 'density', 'mv']+feature_exclude, axis=1)

# Ordinary least-squares fit of mv on the remaining columns.
lm_mv = LinearRegression()
lm_mv.fit(X_mv, mv)

# In-sample predictions (same rows used for fitting). Computed once; the
# prediction was previously recomputed a second time with identical inputs.
mv_hat = lm_mv.predict(X_mv)

# Reports the fit metrics (explained variance, r2, MAE, MSE, RMSE, ...).
# NOTE(review): regression_results is presumably provided by `from utils import *`.
regression_results(mv, mv_hat)

# Fitted intercept followed by one coefficient per column of X_mv.
coef_mv = (pd.DataFrame({'var': ['intercept']+X_mv.columns.tolist(), 'coef': [lm_mv.intercept_]+list(lm_mv.coef_)}))

print(coef_mv)

explained_variance:  0.999
mean_squared_log_error:  0.0003
r2:  0.999
MAE:  1.7968
MSE:  6.8842
RMSE:  2.6238
         var       coef
0  intercept  18.406905
1          C   6.270087
2          H   5.156882
3        C=C   3.036543
4         Ar   1.872124
5      O-eth   6.053857
6      O-ket   4.300755
7    O-ester   7.121618
8         R5  -6.019664
9         R6  -8.675293


### Linear model 0 - parachor model, $\gamma = (P/V)^4$, with $V$ = molar volume (`mv`)

In [34]:
# Exponent of the parachor relation, gamma = (P/V)^n
n = 4

# Same design matrix as the mv model: drop non-feature and excluded-motif columns.
non_feature_cols = ['M', 'measured_st', 'molecule', 'density', 'mv'] + feature_exclude
X0 = df_fe0.drop(columns=non_feature_cols)

# Regression target: st^(1/n) * mv
y0 = mv * st**(1/n)

In [35]:
# Fit the linear model for the target y0 and predict on the training rows.
lm0 = LinearRegression().fit(X0, y0)
y0_hat = lm0.predict(X0)

# Convert the predicted target back to surface tension before scoring
# against the measured values: st_hat = (y0_hat / mv)^n
st_hat0 = (y0_hat/mv)**n
regression_results(st, st_hat0)

# Fitted intercept followed by one coefficient per column of X0.
coef0 = pd.DataFrame({
    'var': ['intercept', *X0.columns],
    'coef': [lm0.intercept_, *lm0.coef_],
})

print(coef0)

explained_variance:  0.8486
mean_squared_log_error:  0.0032
r2:  0.8486
MAE:  0.9344
MSE:  2.7794
RMSE:  1.6671
         var       coef
0  intercept  19.059386
1          C  22.526971
2          H   8.205150
3        C=C   5.904035
4         Ar   1.046578
5      O-eth  20.153792
6      O-ket  20.927924
7    O-ester  19.737947
8         R5  -9.378928
9         R6 -13.083921


### Linear model 1 - with features (molecular fragments) normalized by M

In [41]:
# Fragment-count columns only (non-feature and excluded-motif columns removed).
fragment_counts = df_fe0.drop(['M', 'measured_st', 'molecule', 'density', 'mv']+feature_exclude, axis=1)

# Divide each row of counts by that row's M value.
X1 = fragment_counts.divide(df_fe0['M'], axis=0)
X1.head()

Unnamed: 0,C,H,C=C,Ar,O-eth,O-ket,O-ester,R5,R6
1,0.069299,0.166318,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.069623,0.162454,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.069857,0.159673,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.070033,0.157574,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.070171,0.155934,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Fit surface tension directly on the M-normalized features and predict
# on the same rows.
lm1 = LinearRegression().fit(X1, st)
y1_hat = lm1.predict(X1)

regression_results(st, y1_hat)

# Fitted intercept followed by one coefficient per column of X1.
coef_names = ['intercept', *X1.columns]
coef_values = [lm1.intercept_, *lm1.coef_]
coef1 = pd.DataFrame({'var': coef_names, 'coef': coef_values})

print(coef1)

explained_variance:  0.8615
mean_squared_log_error:  0.0035
r2:  0.8615
MAE:  1.2321
MSE:  2.5429
RMSE:  1.5947
         var         coef
0  intercept    36.590011
1          C  1101.762174
2          H  -597.759235
3        C=C -1017.403525
4         Ar -3618.967383
5      O-eth   132.321574
6      O-ket  -530.387910
7    O-ester  -391.403266
8         R5  -633.699916
9         R6  -544.393156
