In [None]:
import pandas as pd
import numpy as np

# Read both sheets inside a context manager so the workbook's file handle is
# closed as soon as the two frames have been loaded (the original left the
# ExcelFile open for the lifetime of the kernel).
with pd.ExcelFile('cleandataset.xlsx') as xls:
    df1 = pd.read_excel(xls, sheet_name='nonxtb')
    df2 = pd.read_excel(xls, sheet_name='xtb')

# Show the available column names of each sheet.
print(df1.columns)
print(df2.columns)

In [None]:
from sklearn import linear_model
from sklearn import metrics
from matplotlib import pyplot

def magnify():
    """Build the CSS rules handed to ``Styler.set_table_styles``.

    Returns:
        A new list of ``{"selector": ..., "props": [...]}`` dicts, one per
        CSS selector, where ``props`` is a list of ``(property, value)``
        pairs. A fresh list is built on every call.
    """
    # (selector, properties) pairs, in the order they are emitted.
    rules = (
        ("th", [("font-size", "12pt")]),
        ("td", [("padding", "0em 0em")]),
        ("th:hover", [("font-size", "12pt")]),
        ("tr:hover td:hover", [("max-width", "400px"),
                               ("font-size", "12pt")]),
    )
    return [
        {"selector": css_selector, "props": css_props}
        for css_selector, css_props in rules
    ]


def get_lr_model(featlist, label, df):
    """Fit a linear regression on ``df`` and score it on the same rows.

    Args:
        featlist: Column names of ``df`` used as regressors.
        label: Column name of ``df`` used as the regression target.
        df: DataFrame holding both the feature and the label columns.

    Returns:
        A 5-tuple ``(y, y_pred, regr, mse, r2)``: the target values, the
        model's predictions for the training rows, the fitted
        ``LinearRegression`` object, and the mean squared error and R2
        score of those predictions against the targets.
    """
    features = df[featlist].values
    target = df[label].values

    model = linear_model.LinearRegression()
    model.fit(features, target)

    # Predictions are made on the rows the model was fitted on, so the
    # scores below are in-sample.
    fitted = model.predict(features)

    return (
        target,
        fitted,
        model,
        metrics.mean_squared_error(target, fitted),
        metrics.r2_score(target, fitted),
    )

def print_and_plot(featlist, y, y_pred, regr, mse, r2):
    """Print the intercept, per-feature coefficients and scores of a model.

    Args:
        featlist: Feature names, in the same order as ``regr.coef_``.
        y: Target values (unused here; no plot is drawn by this function).
        y_pred: Predicted values (unused here, same reason).
        regr: Fitted model exposing ``intercept_`` and ``coef_``.
        mse: Mean squared error to report.
        r2: R2 score to report.
    """
    print('Intercept: \n', regr.intercept_)

    # Pair each feature name with the coefficient at the same position.
    coef_rows = (
        (name, regr.coef_[position])
        for position, name in enumerate(featlist)
    )
    for name, weight in coef_rows:
        print("Coeff %10s %10.4f " % (name, weight))

    print("MSE: %10.5f R2: %10.5f" % (mse, r2))

In [None]:
import seaborn as sns
%matplotlib inline

# Pairwise correlation matrix of the columns of df1.
corr = df1.corr()

# Diverging colour map used to shade the table cells.
cmap = sns.diverging_palette(5, 250, as_cmap=True)

# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is the supported way to round the displayed
# values. The styled table is left as the cell's last expression so the
# notebook renders it.
(
    corr.style
    .background_gradient(cmap=cmap, axis=1)
    .set_properties(**{'max-width': '100px', 'font-size': '12pt'})
    .set_caption("Correlation")
    .format(precision=2)
    .set_table_styles(magnify())
)

In [None]:

# Reference fit: every candidate descriptor in a single model.
featlist = ['DEmFMO2', 'F2LE', 'PIEmFMO2', 'FE2', 'PIEmFMO3',
            'FE3', 'mTDS', 'numbtors', 'HIEmligand', 'HIEligandE', 'logP', 'dEpligand']
label = 'Gexp'
y, ypred, regr, mse, r2 = get_lr_model(featlist, label, df1)
print_and_plot(featlist, y, ypred, regr, mse, r2)

# Scores of every reduced model, keyed by its comma-separated feature list.
msedict = {}
r2dict = {}

# Each reduced model takes one descriptor from each of the three sets; f4 is
# appended for every set1 choice except 'DEmFMO2' and 'F2LE'.
set1 = ['DEmFMO2', 'F2LE', 'PIEmFMO2', 'FE2', 'PIEmFMO3', 'FE3']
set2 = ['mTDS', 'numbtors']
set3 = ['HIEmligand', 'HIEligandE', 'logP']
f4 = 'dEpligand'
f5 = 'logP'  # defined but not used by the search in this cell

# Best model by R2 over all combinations.
bestr2 = -1000.0
bestformula = None
bestmodel = None
bestmse = None
bestypred = None
besty = None

# Best model by R2 among those whose first two coefficients are positive.
sbestr2 = -1000.0
sbestformula = []
sbestmodel = None
sbestmse = None
sbestypred = None
sbesty = None

for f1 in set1:
    for f2 in set2:
        for f3 in set3:
            if f1 in ('DEmFMO2', 'F2LE'):
                featlist = [f1, f2, f3]
            else:
                featlist = [f1, f2, f3, f4]

            y, ypred, regr, mse, r2 = get_lr_model(featlist, label, df1)
            xlabel = ", ".join(featlist)

            # Coefficients are in featlist order, so indices 0 and 1 belong
            # to f1 and f2.
            # NOTE(review): presumably a positive sign is the physically
            # expected one for these two terms -- confirm.
            f1sign = regr.coef_[0] > 0.0
            f2sign = regr.coef_[1] > 0.0

            msedict[xlabel] = mse
            r2dict[xlabel] = r2

            if f1sign and f2sign and r2 > sbestr2:
                sbestr2 = r2
                sbestformula = featlist
                sbestmodel = regr
                sbestmse = mse
                sbestypred = ypred
                sbesty = y

            if r2 > bestr2:
                bestr2 = r2
                bestformula = featlist
                bestmodel = regr
                bestmse = mse
                bestypred = ypred
                besty = y

print()
print_and_plot(bestformula, besty, bestypred, bestmodel, bestmse, bestr2)
print()

# sbestmodel stays None when no combination passed the sign filter.
if sbestmodel is not None:
    print()
    print_and_plot(sbestformula, sbesty, sbestypred, sbestmodel, sbestmse, sbestr2)
    print()

# Optional bar chart of R2 per model (disabled).
#fig, ax = pyplot.subplots(figsize=(20,12))
#ax.bar(r2dict.keys(), r2dict.values(), color='g')
#pyplot.xticks(rotation=30, ha='right')

In [None]:
# Reference fit: every candidate descriptor in a single model.
featlist = ['DEmFMO2', 'F2LE', 'PIEmFMO2', 'FE2', 'PIEmFMO3',
            'FE3', 'mTDS', 'numbtors', 'HIEmligand', 'HIEligandE', 'logP', 'dEpligand']
label = 'Gexp'
y, ypred, regr, mse, r2 = get_lr_model(featlist, label, df1)
print_and_plot(featlist, y, ypred, regr, mse, r2)

# Scores of every reduced model, keyed by its comma-separated feature list.
msedict = {}
r2dict = {}

# Each reduced model takes one descriptor from each of the three sets and
# always includes f5; f4 is added for every set1 choice except 'DEmFMO2'
# and 'F2LE'.
set1 = ['DEmFMO2', 'F2LE', 'PIEmFMO2', 'FE2', 'PIEmFMO3', 'FE3']
set2 = ['mTDS', 'numbtors']
set3 = ['HIEmligand', 'HIEligandE', 'logP']
f4 = 'dEpligand'
f5 = 'logP'

# Best model by R2 over all combinations.
bestr2 = -1000.0
bestformula = None
bestmodel = None
bestmse = None
bestypred = None
besty = None

# Best model by R2 among those whose first two coefficients are positive.
sbestr2 = -1000.0
sbestformula = []
sbestmodel = None
sbestmse = None
sbestypred = None
sbesty = None

for f1 in set1:
    for f2 in set2:
        for f3 in set3:
            if f1 in ('DEmFMO2', 'F2LE'):
                candidate = [f1, f2, f3, f5]
            else:
                candidate = [f1, f2, f3, f4, f5]

            # set3 also contains f5 ('logP'). Drop the repeat (keeping
            # order) so the same column is not fed to the regression twice,
            # which would split its weight across two collinear
            # coefficients.
            featlist = list(dict.fromkeys(candidate))

            y, ypred, regr, mse, r2 = get_lr_model(featlist, label, df1)
            xlabel = ", ".join(featlist)

            # Coefficients are in featlist order, so indices 0 and 1 belong
            # to f1 and f2.
            # NOTE(review): presumably a positive sign is the physically
            # expected one for these two terms -- confirm.
            f1sign = regr.coef_[0] > 0.0
            f2sign = regr.coef_[1] > 0.0

            msedict[xlabel] = mse
            r2dict[xlabel] = r2

            if f1sign and f2sign and r2 > sbestr2:
                sbestr2 = r2
                sbestformula = featlist
                sbestmodel = regr
                sbestmse = mse
                sbestypred = ypred
                sbesty = y

            if r2 > bestr2:
                bestr2 = r2
                bestformula = featlist
                bestmodel = regr
                bestmse = mse
                bestypred = ypred
                besty = y

print()
print_and_plot(bestformula, besty, bestypred, bestmodel, bestmse, bestr2)
print()

# sbestmodel stays None when no combination passed the sign filter.
if sbestmodel is not None:
    print()
    print_and_plot(sbestformula, sbesty, sbestypred, sbestmodel, sbestmse, sbestr2)
    print()

# Optional bar chart of R2 per model (disabled).
#fig, ax = pyplot.subplots(figsize=(20,12))
#ax.bar(r2dict.keys(), r2dict.values(), color='g')
#pyplot.xticks(rotation=30, ha='right')

In [None]:
# Single hand-picked five-descriptor model, fitted on df1.
label = 'Gexp'
featlist = ['FE2', 'dEpligand', 'numbtors', 'HIEmligand', 'logP']

y, ypred, regr, mse, r2 = get_lr_model(featlist, label, df1)
print_and_plot(featlist, y, ypred, regr, mse, r2)

# Dump the target values followed by the model's fitted values.
print(y, ypred)

In [None]:
# Pairwise correlation matrix of the columns of df2.
corr = df2.corr()

# Diverging colour map used to shade the table cells.
cmap = sns.diverging_palette(5, 250, as_cmap=True)

# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is the supported way to round the displayed
# values. The styled table is left as the cell's last expression so the
# notebook renders it.
(
    corr.style
    .background_gradient(cmap=cmap, axis=1)
    .set_properties(**{'max-width': '100px', 'font-size': '12pt'})
    .set_caption("Correlation")
    .format(precision=2)
    .set_table_styles(magnify())
)

In [None]:
# Reference fit on df2 with all four descriptors.
featlist = ['DG', 'LE', 'HIEmligand', 'logP']
label = 'Gexp'
y, ypred, regr, mse, r2 = get_lr_model(featlist, label, df2)
# print_and_plot takes the feature list as its first argument; the original
# call omitted it and raised a TypeError.
print_and_plot(featlist, y, ypred, regr, mse, r2)

# Scores of every reduced model, keyed by its comma-separated feature list.
msedict = {}
r2dict = {}

set1 = ['DG', 'LE']
set2 = ['HIEmligand', 'logP']

# Two-descriptor models: one feature from each set.
for f1 in set1:
    for f2 in set2:
        featlist = [f1, f2]
        y, ypred, regr, mse, r2 = get_lr_model(featlist, label, df2)
        xlabel = ", ".join(featlist)
        msedict[xlabel] = mse
        r2dict[xlabel] = r2

# Three-descriptor models: one set1 feature plus both set2 features.
fullfeatlis = [['DG', 'HIEmligand', 'logP'],
               ['LE', 'HIEmligand', 'logP']]

for featlist in fullfeatlis:
    y, ypred, regr, mse, r2 = get_lr_model(featlist, label, df2)
    xlabel = ", ".join(featlist)
    msedict[xlabel] = mse
    r2dict[xlabel] = r2

# Bar chart of R2 per model; the dict views are materialised as lists so the
# call does not depend on matplotlib's handling of view objects.
fig, ax = pyplot.subplots(figsize=(20,12))
ax.bar(list(r2dict.keys()), list(r2dict.values()), color='g')
ax.set_ylabel("R2")
pyplot.xticks(rotation=30, ha='right')