In [None]:
import pandas as pd
import numpy as np

# Read both worksheets through a single workbook handle. Using ExcelFile as a
# context manager closes the underlying file as soon as the sheets are loaded
# (the original left the handle open for the lifetime of the kernel).
with pd.ExcelFile('cleandataset.xlsx') as xls:
    df1 = pd.read_excel(xls, sheet_name='nonxtb')
    df2 = pd.read_excel(xls, sheet_name='xtb')

# Show which columns each sheet provides; later cells select features by name.
print(df1.columns)
print(df2.columns)

In [None]:
from sklearn import linear_model
from sklearn import metrics
from matplotlib import pyplot

def magnify():
    """Return the CSS rules used to style the rendered correlation tables.

    Returns
    -------
    list of dict
        One ``{"selector": ..., "props": [...]}`` entry per CSS rule, in the
        form accepted by ``Styler.set_table_styles``. ``props`` is a list of
        ``(property, value)`` tuples.
    """
    # (CSS selector, properties) pairs, in the order they are emitted.
    rules = (
        ("th", [("font-size", "12pt")]),
        ("td", [("padding", "0em 0em")]),
        ("th:hover", [("font-size", "12pt")]),
        ("tr:hover td:hover", [("max-width", "400px"),
                               ("font-size", "12pt")]),
    )
    return [{"selector": selector, "props": props} for selector, props in rules]


def get_lr_model(featlist, label, df):
    """Fit a linear regression on ``df`` and score it on the same rows.

    Parameters
    ----------
    featlist : list
        Column labels of ``df`` used as predictors.
    label : str
        Column label of ``df`` used as the regression target.
    df : pandas.DataFrame
        Table holding both the predictor columns and the target column.

    Returns
    -------
    tuple
        ``(y, y_pred, regr, mse, r2)``: the target values, the model's
        predictions for the rows it was fitted on, the fitted
        ``LinearRegression`` estimator, and the mean squared error and R2
        score of those predictions against the target.
    """
    features = df[featlist].values
    target = df[label].values

    model = linear_model.LinearRegression()
    model.fit(features, target)

    # Predictions are made on the training rows, so both scores below are
    # in-sample measures of fit.
    fitted = model.predict(features)

    error = metrics.mean_squared_error(target, fitted)
    score = metrics.r2_score(target, fitted)

    return target, fitted, model, error, score

def print_and_plot(y, y_pred, regr, mse, r2, feature_names=None):
    """Print a fitted model's parameters and scores, then plot predictions.

    Parameters
    ----------
    y : array-like
        True target values (x axis of the scatter plot).
    y_pred : array-like
        Predicted target values (y axis of the scatter plot).
    regr : object
        Fitted estimator exposing ``intercept_`` and ``coef_``.
    mse : float
        Mean squared error to report.
    r2 : float
        R2 score to report.
    feature_names : sequence of str, optional
        Names used to label the entries of ``regr.coef_``, in order. When
        omitted, the notebook-level ``featlist`` variable is used, which is
        what this function always read before the parameter existed.

    Raises
    ------
    ValueError
        If the number of names differs from the number of coefficients, which
        would otherwise print coefficients under the wrong labels.
    """
    if feature_names is None:
        # Backward-compatible fallback: existing callers rely on the global
        # `featlist` still describing the model that is being reported.
        feature_names = featlist
    feature_names = list(feature_names)

    coefficients = regr.coef_
    if len(feature_names) != len(coefficients):
        raise ValueError(
            "got %d feature names for %d coefficients"
            % (len(feature_names), len(coefficients)))

    print('Intercept: \n', regr.intercept_)
    for name, coefficient in zip(feature_names, coefficients):
        print("Coeff %10s %10.4f "%(name, coefficient))

    print("MSE: %10.5f R2: %10.5f"%(mse, r2))

    pyplot.scatter(y, y_pred)
    pyplot.xlabel("True Values")
    pyplot.ylabel("Predicted Values")
    pyplot.show()

In [None]:
import seaborn as sns
%matplotlib inline

# Correlation matrix of df1. Only numeric/bool columns are passed to corr():
# older pandas dropped the other columns implicitly, while pandas >= 2.0
# raises on them instead.
corr = df1.select_dtypes(include=["number", "bool"]).corr()

cmap = sns.diverging_palette(5, 250, as_cmap=True)

# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its replacement.
(
    corr.style.background_gradient(cmap, axis=1)
    .set_properties(**{'max-width': '100px', 'font-size': '12pt'})
    .set_caption("Correlation")
    .format(precision=2)
    .set_table_styles(magnify())
)

In [None]:

# Baseline: fit Gexp on every candidate descriptor of df1 and report the model.
featlist = ['DEmFMO2', 'F2LE', 'PIEmFMO2', 'FE2', 'PIEmFMO3',
            'FE3', 'mTDS', 'numbtors', 'HIEmligand', 'HIEligandE', 'logP']
label = 'Gexp'
y, ypred, regr, mse, r2 = get_lr_model (featlist, label, df1)
print_and_plot(y, ypred, regr, mse, r2)

# Scores of every feature subset tried below, keyed by "f1, f2, ..." labels.
msedict = {}
r2dict = {}

set1 = ['DEmFMO2', 'F2LE', 'PIEmFMO2', 'FE2', 'PIEmFMO3',
        'FE3']
set2 = ['mTDS', 'numbtors']
set3 = ['numbtors']
set4 = ['HIEmligand', 'HIEligandE', 'logP']

# One descriptor from each set. set2 and set3 both contain 'numbtors', so a
# plain product would list that column twice in half of the models; that makes
# the design matrix rank-deficient and the coefficients non-unique.
# dict.fromkeys() drops the repeat while keeping the original order.
candidates = []
for f1 in set1:
    for f2 in set2:
        for f3 in set3:
            for f4 in set4:
                candidates.append(list(dict.fromkeys([f1, f2, f3, f4])))

# Hand-picked subsets to compare against the generated ones.
fullfeatlis = [['F2LE', 'FE2', 'FE3', 'numbtors', 'HIEligandE', 'logP'],
               ['F2LE', 'FE2', 'FE3', 'numbtors', 'HIEligandE'],
               ['F2LE', 'FE2', 'FE3', 'numbtors', 'logP'],
               ['F2LE', 'FE3', 'numbtors', 'HIEligandE', 'logP'],
               ['FE3', 'numbtors', 'HIEligandE', 'logP'],
               ['FE2', 'numbtors', 'HIEligandE', 'logP'],
               ['DEmFMO2', 'numbtors', 'HIEligandE', 'logP'],
               ['F2LE', 'numbtors', 'HIEligandE', 'logP'],
               ['DEmFMO2', 'F2LE', 'PIEmFMO2', 'FE2', 'PIEmFMO3', 'FE3']]
candidates.extend(fullfeatlis)

# Score every subset with the same code path. `featlist` stays a notebook-level
# name because print_and_plot() reads it.
for featlist in candidates:
    y, ypred, regr, mse, r2 = get_lr_model (featlist, label, df1)
    xlabel = ", ".join(featlist)
    msedict[xlabel] = mse
    r2dict[xlabel] = r2

fig, ax = pyplot.subplots(figsize=(20,12))
ax.bar(list(r2dict.keys()), list(r2dict.values()), color='g')
ax.set_xlabel("Feature set")
ax.set_ylabel("R2")
ax.set_title("In-sample R2 per feature set (df1, target %s)" % label)
# Trailing ';' keeps the (ticks, labels) repr out of the cell output.
pyplot.xticks(rotation=30, ha='right');

In [None]:
# Three-feature models on df1: one descriptor from each set.

msedict = {}
r2dict = {}

label = 'Gexp'

set1 = ['DEmFMO2', 'F2LE', 'PIEmFMO2', 'FE2', 'PIEmFMO3',
        'FE3']
set2 = ['mTDS', 'numbtors']
set3 = ['HIEmligand', 'HIEligandE', 'logP']

# Best model seen so far. Seeding with -inf / None (instead of 0.0) means the
# first fitted model always becomes the initial best, and an unset slot can
# never be mistaken for a real intercept or coefficient.
hr2 = float("-inf")
hinter = None
hcoeff = None
hfeatlist = None

for f1 in set1:
    for f2 in set2:
        for f3 in set3:
            featlist = [f1, f2, f3]
            y, ypred, regr, mse, r2 = get_lr_model (featlist, label, df1)
            xlabel = ", ".join(featlist)
            msedict[xlabel] = mse
            r2dict[xlabel] = r2
            if r2 > hr2:
                hr2 = r2
                hinter = regr.intercept_
                hcoeff = regr.coef_
                hfeatlist = featlist

            print("%60s %10.5f %10.5f"%(xlabel, mse, r2))

# Feature set, R2, coefficients and intercept of the best-scoring model.
print(hfeatlist, hr2, hcoeff, hinter)

fig, ax = pyplot.subplots(figsize=(20,12))
ax.bar(list(r2dict.keys()), list(r2dict.values()), color='g')
ax.set_xlabel("Feature set")
ax.set_ylabel("R2")
ax.set_title("In-sample R2 per feature set (df1, target %s)" % label)
# Trailing ';' keeps the (ticks, labels) repr out of the cell output.
pyplot.xticks(rotation=30, ha='right');

In [None]:
# Three-feature models on df1: every combination of three descriptors.
from itertools import combinations

msedict = {}
r2dict = {}

label = 'Gexp'

set1 = ['DEmFMO2', 'F2LE', 'PIEmFMO2', 'FE2', 'PIEmFMO3',
        'FE3', 'mTDS', 'numbtors', 'HIEmligand', 'HIEligandE', 'logP']

# Best model seen so far. Seeding with -inf / None (instead of 0.0) means the
# first fitted model always becomes the initial best, and an unset slot can
# never be mistaken for a real intercept or coefficient.
hr2 = float("-inf")
hinter = None
hcoeff = None
hfeatlist = None

# The fit does not depend on column order, so each unordered triple is fitted
# once: 165 models for 11 descriptors, instead of the 990 ordered triples the
# nested "f1 != f2 != f3" loops produced (every model six times over).
for combo in combinations(set1, 3):
    featlist = list(combo)
    y, ypred, regr, mse, r2 = get_lr_model (featlist, label, df1)
    xlabel = ", ".join(featlist)
    msedict[xlabel] = mse
    r2dict[xlabel] = r2
    if r2 > hr2:
        hr2 = r2
        hinter = regr.intercept_
        hcoeff = regr.coef_
        hfeatlist = featlist

    print("%60s %10.5f %10.5f"%(xlabel, mse, r2))

# Feature set, R2, coefficients and intercept of the best-scoring model.
print(hfeatlist, hr2, hcoeff, hinter)

fig, ax = pyplot.subplots(figsize=(20,12))
ax.bar(list(r2dict.keys()), list(r2dict.values()), color='g')
ax.set_xlabel("Feature set")
ax.set_ylabel("R2")
ax.set_title("In-sample R2 per feature set (df1, target %s)" % label)
# Trailing ';' keeps the (ticks, labels) repr out of the cell output.
pyplot.xticks(rotation=30, ha='right');

In [None]:
# Correlation matrix of df2. Only numeric/bool columns are passed to corr():
# older pandas dropped the other columns implicitly, while pandas >= 2.0
# raises on them instead.
corr = df2.select_dtypes(include=["number", "bool"]).corr()

cmap = sns.diverging_palette(5, 250, as_cmap=True)

# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its replacement.
(
    corr.style.background_gradient(cmap, axis=1)
    .set_properties(**{'max-width': '100px', 'font-size': '12pt'})
    .set_caption("Correlation")
    .format(precision=2)
    .set_table_styles(magnify())
)

In [None]:
# Baseline: fit Gexp on all four descriptors of df2 and report the model.
featlist = ['DG', 'LE', 'HIEmligand', 'logP']
label = 'Gexp'
y, ypred, regr, mse, r2 = get_lr_model (featlist, label, df2)
print_and_plot(y, ypred, regr, mse, r2)

# Scores of every feature subset tried below, keyed by "f1, f2, ..." labels.
msedict = {}
r2dict = {}

set1 = ['DG', 'LE']
set2 = ['HIEmligand', 'logP']

# Two-feature models: one descriptor from each set.
candidates = [[f1, f2] for f1 in set1 for f2 in set2]

# Hand-picked three-feature subsets to compare against the pairs.
fullfeatlis = [['DG', 'HIEmligand', 'logP'],
               ['LE', 'HIEmligand', 'logP']]
candidates.extend(fullfeatlis)

# Score every subset with the same code path. `featlist` stays a notebook-level
# name because print_and_plot() reads it.
for featlist in candidates:
    y, ypred, regr, mse, r2 = get_lr_model (featlist, label, df2)
    xlabel = ", ".join(featlist)
    msedict[xlabel] = mse
    r2dict[xlabel] = r2

fig, ax = pyplot.subplots(figsize=(20,12))
ax.bar(list(r2dict.keys()), list(r2dict.values()), color='g')
ax.set_xlabel("Feature set")
ax.set_ylabel("R2")
ax.set_title("In-sample R2 per feature set (df2, target %s)" % label)
# Trailing ';' keeps the (ticks, labels) repr out of the cell output.
pyplot.xticks(rotation=30, ha='right');

In [None]:
# Three-feature models on df2: one descriptor from each set.

msedict = {}
r2dict = {}

label = 'Gexp'

set1 = ['DG', 'LE']
set2 = ['HIEmligand']
set3 = ['logP']

# Best model seen so far. Seeding with -inf / None (instead of 0.0) means the
# first fitted model always becomes the initial best, and an unset slot can
# never be mistaken for a real intercept or coefficient.
hr2 = float("-inf")
hinter = None
hcoeff = None
hfeatlist = None

for f1 in set1:
    for f2 in set2:
        for f3 in set3:
            featlist = [f1, f2, f3]
            y, ypred, regr, mse, r2 = get_lr_model (featlist, label, df2)
            xlabel = ", ".join(featlist)
            msedict[xlabel] = mse
            r2dict[xlabel] = r2
            if r2 > hr2:
                hr2 = r2
                hinter = regr.intercept_
                hcoeff = regr.coef_
                hfeatlist = featlist

            print("%60s %10.5f %10.5f"%(xlabel, mse, r2))

# Feature set, R2, coefficients and intercept of the best-scoring model.
print(hfeatlist, hr2, hcoeff, hinter)

fig, ax = pyplot.subplots(figsize=(20,12))
ax.bar(list(r2dict.keys()), list(r2dict.values()), color='g')
ax.set_xlabel("Feature set")
ax.set_ylabel("R2")
ax.set_title("In-sample R2 per feature set (df2, target %s)" % label)
# Trailing ';' keeps the (ticks, labels) repr out of the cell output.
pyplot.xticks(rotation=30, ha='right');