# Drug-target interaction prediction using direct & inderect correlation features

In [1]:
import pandas as pd
import numpy as np
import scipy
import itertools
from support_functions import log_progress
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

In [39]:
# load the data
X_std_df = pd.DataFrame.from_csv('features/X_std.csv')
y_df = pd.DataFrame.from_csv('features/y.csv')
metadata_df = pd.DataFrame.from_csv('features/metadata.csv')

In [36]:
# format for sklearn
X_std = X_std_df.values
y = y_df.label.values

___
## Logistic regression

In [121]:
LogisticRegression?

In [154]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=10.0, random_state=1, class_weight="balanced", 
                        tol=1e-7, solver='newton-cg', max_iter=1000)
lr.fit(X_std, y)

LogisticRegression(C=10.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
          solver='newton-cg', tol=1e-07, verbose=0, warm_start=False)

In [155]:
lr.n_iter_

array([19], dtype=int32)

In [156]:
metadata_df['prediction'] = lr.predict(X_std)

false_interactions = metadata_df[metadata_df.label==-1]
false_accuracy = (false_interactions[false_interactions.prediction==-1].shape[0] /
                  false_interactions.shape[0])

true_interactions = metadata_df[metadata_df.label==1]
true_accuracy = (true_interactions[true_interactions.prediction==1].shape[0] /
                  true_interactions.shape[0])


print('Accuracy on False interactions: {:.2f}'.format(false_accuracy))
print('Accuracy on True interactions: {:.2f}'.format(true_accuracy))

Accuracy on False interactions: 0.69
Accuracy on True interactions: 0.70


In [157]:
# evaluate top-100 accuracy
metadata_df['prob'] = lr.predict_proba(X_std)[:,1]

In [158]:
unique_cpds = metadata_df.cpd.unique()
top_100_results = pd.Series(index=unique_cpds)

for c in unique_cpds:
    cpd_pairs = metadata_df.query('cpd == "{}"'.format(c)).copy()
    sorted_cpd_pairs = cpd_pairs.sort_values(by=['prob'], ascending=False).reset_index(drop=True)
    true_interactions = cpd_pairs.query('label == 1')
    best_rank = true_interactions.prob.idxmax()
    top_100_results[c] = best_rank

In [159]:
top_100_results

BRD-K21680192     29252.0
BRD-K81418486      2770.0
BRD-K94441233     29041.0
BRD-K09638361    208175.0
BRD-K08547377     25777.0
BRD-K92093830    101748.0
BRD-K92428153     15058.0
BRD-K54233340     13213.0
BRD-K78126613     31390.0
BRD-A45889380    158197.0
BRD-K27305650    141300.0
BRD-K31342827     32458.0
BRD-K55127134     21454.0
BRD-K56429665     26922.0
BRD-K50836978     56530.0
BRD-K12994359     36367.0
BRD-K15108141     52982.0
BRD-K19295594     24228.0
BRD-K52075040     32678.0
BRD-K56343971     44252.0
BRD-K85606544       659.0
BRD-K88510285     25652.0
BRD-K92241597    117953.0
BRD-A56592690     56752.0
BRD-K68756823     19904.0
BRD-K93034159     52991.0
BRD-K55696337     25799.0
BRD-K60038276     13801.0
BRD-K87909389       667.0
BRD-K68202742      5922.0
                   ...   
BRD-K64052750       680.0
BRD-K17953061     32489.0
BRD-K63828191     25177.0
BRD-K07572174     12327.0
BRD-K75295174      7287.0
BRD-A82371568     53012.0
BRD-A81772229     29086.0
BRD-K1250228