In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics

In [2]:
# Load the feature-engineered dataset v1 (df_fe1) and the 'main' dataset
# (df_stliq_clean); both use their first CSV column as the index.
df_fe1 = pd.read_csv('../data/df_fe1.csv', index_col=0)
df_stliq_clean = pd.read_csv('../data/df_stliq_clean.csv', index_col=0)

# Append density from the main dataset (aligned on the index) and derive the
# new feature molar volume: mv = M / density.
df_fe1 = df_fe1.assign(
    density=df_stliq_clean['density'],
    mv=lambda frame: frame['M'] / frame['density'],
)
print(len(df_fe1))
df_fe1.head()

269


Unnamed: 0,C,CH,CH2,CH3,C-ring,CH-ring,CH2-ring,C=C,C#C,Ar,...,C-O-C,CHO,CO,COOR,M,COOH,measured_st,molecule,density,mv
1,0,0,3,2,0,0,0,0,0,0,...,0,0,0,0,72.151,0,15.5,Pentane,0.62638,115.187267
2,0,0,4,2,0,0,0,0,0,0,...,0,0,0,0,86.178,0,18.0,Hexane,0.6594,130.691538
3,0,0,5,2,0,0,0,0,0,0,...,0,0,0,0,100.205,0,19.8,Heptane,0.6842,146.455715
4,0,0,6,2,0,0,0,0,0,0,...,0,0,0,0,114.232,0,21.1,Octane,0.7031,162.469066
5,0,0,7,2,0,0,0,0,0,0,...,0,0,0,0,128.259,0,22.4,Nonane,0.7176,178.733278


In [3]:
# Exclude samples containing certain motifs, due to lack of datapoints and/or
# irrelevance to polyester: keep only rows where the excluded feature counts
# sum to zero.

feature_exclude = ['C#C', 'C-ring', 'OH', 'CHO', 'COOH']
df_fe1 = df_fe1.loc[lambda frame: frame[feature_exclude].sum(axis=1).eq(0)]
len(df_fe1)

199

In [4]:
def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    Prints explained variance, mean squared log error (MSLE), R^2, MAE, MSE
    and RMSE, each rounded to 4 decimal places.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.

    Returns
    -------
    None
        The metrics are printed; nothing is returned.
    """
    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    # MSLE is undefined when targets/predictions are negative and sklearn
    # raises ValueError in that case. A linear model can predict negative
    # values, so report MSLE as unavailable instead of aborting the whole
    # summary. Shape/consistency problems have already been raised by the
    # metric calls above.
    try:
        msle = round(metrics.mean_squared_log_error(y_true, y_pred), 4)
    except ValueError:
        msle = 'n/a (negative values)'

    print('explained_variance: ', round(explained_variance,4))
    print('mean_squared_log_error: ', msle)
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [5]:
# # X and y for linear model

# X = df_fe1.drop(['M', 'measured_st', 'molecule', 'density', 'mv', 'C-ring', 'C#C'], axis=1)
# y = df_fe1['measured_st']

In [6]:
# # Linear model 1
# lm1 =  LinearRegression()
# lm1.fit(X, y)

In [7]:
# # evaluation of linear model 1
# y_hat1 = lm1.predict(X)
# regression_results(y, y_hat1)

In [8]:
# # linear model 2
# # using parachor concept, scaling y down to the power of 1/4 and multiplying by mv
# y_para = df_fe1['measured_st'] * df_fe1['mv']
# lm2 = LinearRegression()
# lm2.fit(X, y_para)

In [9]:
# # evaluation of linear model 2
# y_hat2 = lm2.predict(X)/df_fe1['mv']
# regression_results(y, y_hat2)

In [10]:
# i_neg = y_hat2.index[y_hat2<0]
# i_neg

The features of 'furan' (index=128) were not constructed successfully: the values are 0 for all features.
The regression predicts a negative value.

In [11]:
# Exclude furan from the following models: drop every row whose molecule name
# contains 'Furan', then keep handles on the surface tension and molar volume
# columns of the remaining samples.
df_fe1 = df_fe1.loc[lambda frame: ~frame['molecule'].str.contains('Furan')]
st, mv = df_fe1['measured_st'], df_fe1['mv']

### LINEAR MODEL MV - for molar volume

In [12]:
# Design matrix: every column except the targets/metadata and the excluded motifs.
X_mv = df_fe1.drop(
    columns=['M', 'measured_st', 'molecule', 'density', 'mv'] + feature_exclude
)

# Fit molar volume against the group counts (fit() returns the estimator itself).
lm_mv = LinearRegression().fit(X_mv, mv)

# In-sample evaluation and coefficient table (intercept first).
mv_hat = lm_mv.predict(X_mv)
regression_results(y_true=mv, y_pred=mv_hat)
coef_mv = pd.DataFrame({
    'var': ['intercept', *X_mv.columns],
    'coef': [lm_mv.intercept_, *lm_mv.coef_],
})
print(coef_mv)

explained_variance:  0.9989
mean_squared_log_error:  0.0004
r2:  0.9989
MAE:  1.9497
MSE:  7.3507
RMSE:  2.7112
          var       coef
0   intercept  26.890752
1           C  13.268068
2          CH  15.800499
3         CH2  16.532250
4         CH3  17.769664
5     CH-ring  13.139330
6    CH2-ring  13.221631
7         C=C  24.599463
8          Ar  60.442562
9       C-O-C   5.524766
10         CO   9.543142
11       COOR  20.443835


### LINEAR MODEL 3 - no furan, scaling of surface tension by molar volume

In [13]:
# Linear model 3 - no furan, parachor concept.
# Parachor concept: the target is surface tension scaled down to the power of
# 1/n and multiplied by molar volume, i.e. y = st**(1/n) * mv.
# NOTE: `n` is a notebook-level variable that is reassigned further down (ester
# model), so re-run this cell before re-evaluating model 3.

n = 1.66
X3 = df_fe1.drop(
    columns=['M', 'measured_st', 'molecule', 'density', 'mv'] + feature_exclude
)
y3 = st.pow(1 / n).mul(mv)

In [14]:
# Fit linear model 3; fit() returns the estimator, which is displayed below.
lm3 = LinearRegression().fit(X3, y3)
lm3

LinearRegression()

In [15]:
# Evaluation of linear model 3: undo the parachor scaling of the prediction
# (st_hat = (y_hat / mv)**n) and compare with the measured surface tension.
y_hat3 = lm3.predict(X3)
regression_results(y_true=st, y_pred=(y_hat3 / mv).pow(n))

explained_variance:  0.9237
mean_squared_log_error:  0.0019
r2:  0.9236
MAE:  0.8496
MSE:  1.3868
RMSE:  1.1776


In [16]:
# Coefficient table for model 3, intercept first
# (y = a0 + a1*x1 + ...    with y = st^(1/n)*mv)
coef3 = pd.DataFrame({
    'var': ['intercept', *X3.columns],
    'coef': [lm3.intercept_, *lm3.coef_],
})
print(coef3)

          var        coef
0   intercept   -0.454670
1           C   94.486458
2          CH  115.196042
3         CH2  131.976561
4         CH3  124.568476
5     CH-ring  107.525784
6    CH2-ring  125.049083
7         C=C  240.578175
8          Ar  675.133737
9       C-O-C   89.271399
10         CO  222.607545
11       COOR  260.042527


### LINEAR MODEL 4 - linear model 3, ester only

In [17]:
# Keep only molecules with at least one COOR group.
df_fe1_ester = df_fe1.loc[df_fe1['COOR'] > 0]

# Drop underrepresented features (fewer than 2 samples with a nonzero value).

features = df_fe1_ester.drop(
    columns=['measured_st', 'molecule', 'density', 'mv', 'M']
).columns.tolist()
features_ester = [f for f in features if (df_fe1_ester[f] > 0).sum() > 1]
features_not_in_ester = list(set(features) - set(features_ester))
print(f"features included = {features_ester}")

# Keep only the rows that contain none of the dropped features.
rows_ester = df_fe1_ester.index[
    df_fe1_ester[features_not_in_ester].sum(axis=1).eq(0)
].tolist()
st_ester, mv_ester = (
    df_fe1_ester.loc[rows_ester, 'measured_st'],
    df_fe1_ester.loc[rows_ester, 'mv'],
)

# Parachor-style target for the ester-only model: y = st**(1/n) * mv.
# NOTE: this overwrites the notebook-level `n` used by the earlier model 3 cells.
n = 1.35
X4 = df_fe1_ester.loc[rows_ester, features_ester]
y4 = st_ester.pow(1 / n).mul(mv_ester)

features included = ['C', 'CH', 'CH2', 'CH3', 'C=C', 'Ar', 'CO', 'COOR']


In [18]:
# Fit linear model 4 (ester only); fit() returns the estimator, displayed below.
lm4 = LinearRegression().fit(X4, y4)
lm4

LinearRegression()

In [19]:
# Evaluation of linear model 4: undo the parachor scaling of the prediction
# (st_hat = (y_hat / mv)**n) and compare with the measured ester surface tension.
y_hat4 = lm4.predict(X4)
regression_results(y_true=st_ester, y_pred=(y_hat4 / mv_ester).pow(n))

explained_variance:  0.893
mean_squared_log_error:  0.0016
r2:  0.893
MAE:  0.8503
MSE:  1.3405
RMSE:  1.1578


### Grid search for the best n