**Machine learning models for pi3k**

*These models are based on empirical potentials obtained by docking with the Smina docking software.*

# Setup

Ensure Matplotlib plots figures inline and prepare a function to save the figures:

In [317]:
# Clear all variables
%reset -f
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals


import numpy as np
import pandas as pd
import os


np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes shared by every figure: axis labels slightly larger than ticks.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures
PROJECT_ROOT_DIR = "."

def save_fig(fig_id, tight_layout=True):
    """Save the current Matplotlib figure as ``images/<fig_id>.png``.

    The file is written below ``PROJECT_ROOT_DIR/images`` at 300 dpi.  The
    target directory is created when it does not exist yet, so the first
    call in a fresh working directory does not fail inside ``plt.savefig``.

    Parameters
    ----------
    fig_id : str
        Base name of the output file, without the ``.png`` extension.
    tight_layout : bool, optional
        When true (the default), call ``plt.tight_layout()`` before saving.
    """
    images_dir = os.path.join(PROJECT_ROOT_DIR, "images")
    # Explicit check instead of ``exist_ok=True``: the notebook is written to
    # run on Python 2 as well, where that keyword is not available.
    if not os.path.isdir(images_dir):
        os.makedirs(images_dir)
    path = os.path.join(images_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# Get the data

In [318]:
import pandas as pd

# Default directory that the tab-separated data files are read from.
HOUSING_PATH = os.path.join('.', 'datasets')

def load_data(filename, housing_path=HOUSING_PATH):
    """Read a header-less, tab-separated file into a DataFrame.

    The first column becomes the index; the remaining columns keep their
    integer positions (1, 2, ...) as labels.

    ``pd.DataFrame.from_csv`` was deprecated and later removed from pandas.
    ``pd.read_csv`` with ``index_col=0, parse_dates=True`` is the documented
    equivalent and works on both old and current pandas releases.

    Parameters
    ----------
    filename : str
        Name of the file inside ``housing_path``.
    housing_path : str, optional
        Directory containing the file; defaults to ``HOUSING_PATH``.

    Returns
    -------
    pandas.DataFrame
    """
    csv_path = os.path.join(housing_path, filename)
    # index_col / parse_dates reproduce the defaults from_csv used to apply.
    return pd.read_csv(csv_path, sep='\t', header=None, index_col=0,
                       parse_dates=True)

In [319]:
# Training data: one file per class, each tagged with its class in a new
# "Active" column (1 for the actives file, 0 for the decoys file).
train_actives = load_data('actives_scores_25_3.txt').assign(Active=1)
train_decoys = load_data('decoys_25_3.txt').assign(Active=0)

In [320]:
# Test data: keep every row of the actives file and draw a random subset of
# the decoys file with the same number of rows, so both classes are balanced.
test_actives = load_data('scores_active.txt')
test_decoys = load_data('scores_decoys.txt').sample(n=len(test_actives))
# Tag each frame with its class label in a new "Active" column.
test_actives = test_actives.assign(Active=1)
test_decoys = test_decoys.assign(Active=0)

In [321]:
def merge_data(dat1, dat2):
    """Stack two frames row-wise and return their rows in shuffled order.

    ``sample(frac=1)`` draws every row exactly once, so the result is a
    random permutation of the concatenated rows; index labels are kept.
    """
    stacked = pd.concat([dat1, dat2])
    return stacked.sample(frac=1)

In [322]:
# Combine the two classes of each split into one shuffled frame.
# merge_data shuffles with the global NumPy random state, so the order of
# these two calls determines which permutation each set receives.
train_set = merge_data(train_actives, train_decoys)
test_set = merge_data(test_actives, test_decoys)

# Prepare the data for Machine Learning algorithms

In [323]:
def split_data_to_labels(dat):
    """Separate the ``"Active"`` label column from the feature columns.

    ``Series.as_matrix()`` was removed from pandas; ``.values`` is available
    in every pandas release and returns the same NumPy array.

    Parameters
    ----------
    dat : pandas.DataFrame
        Frame containing an ``"Active"`` column; it is not modified.

    Returns
    -------
    labels : numpy.ndarray
        Independent copy of the ``"Active"`` column.
    new_dat : pandas.DataFrame
        ``dat`` without the ``"Active"`` column.
    """
    # Copy the array so later edits to the labels never touch ``dat``.
    labels = dat["Active"].values.copy()
    new_dat = dat.drop("Active", axis=1)
    return labels, new_dat

In [324]:
# Split each shuffled set into its label vector and its feature frame.
train_labels, train_data = split_data_to_labels(train_set)
test_labels, test_data = split_data_to_labels(test_set)

### Transform the data

In [325]:
def get_transformer(train):
    """Build the preprocessing pipeline and fit it on ``train``.

    The pipeline has two steps, applied in order:

    1. ``VarianceThreshold(threshold=0.0)`` feature selection.
    2. ``StandardScaler`` standardisation.

    Returns the fitted ``Pipeline`` so the same transformation can be
    applied to other data afterwards.
    """
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    steps = [
        ('VarianceThreshold', VarianceThreshold(threshold=0.0)),
        ('std_scaler', StandardScaler()),
    ]
    fitted_pipeline = Pipeline(steps)
    fitted_pipeline.fit(train)
    return fitted_pipeline

In [326]:
def transform_data(transformer, dat):
    """Return ``dat`` passed through ``transformer.transform``."""
    return transformer.transform(dat)

In [327]:
# Fit the preprocessing on the training features only, then apply the same
# fitted transformation to both the training and the test features.
transformer = get_transformer(train_data)
train_set_tr = transformer.transform(train_data)
test_set_tr = transformer.transform(test_data)

# Deep neural network model

In [328]:
def get_dnn_clf(train, train_labels):
    """Train a two-class DNN classifier and return it wrapped in ``SKCompat``.

    The network has two hidden layers (300 and 100 units) with dropout 0.45
    and is fitted for 100000 steps with a batch size of 50.

    NOTE(review): this relies on ``tf.contrib.learn``, which is only
    available in TensorFlow 1.x.
    """
    import tensorflow as tf

    learn = tf.contrib.learn

    run_config = learn.RunConfig()
    # Feature columns are inferred from the training matrix itself.
    columns = learn.infer_real_valued_columns_from_input(train)
    estimator = learn.DNNClassifier(
        hidden_units=[300, 100],
        n_classes=2,
        feature_columns=columns,
        dropout=0.45,
        config=run_config,
    )

    # SKCompat provides the fit/predict interface used by the rest of the file.
    wrapped = learn.SKCompat(estimator)
    wrapped.fit(train, train_labels, batch_size=50, steps=100000)
    return wrapped

In [329]:
def get_auc(clf, test, test_labels):
    """Print accuracy and F1 for ``clf`` on ``test`` and return the ROC AUC.

    ``clf.predict(test)`` must return a mapping with a ``'classes'`` entry
    (predicted labels) and a ``'probabilities'`` entry whose column 1 is
    used as the score for the positive class.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict``.
    test : array-like
        Feature matrix to evaluate on.
    test_labels : array-like
        True labels for ``test``.

    Returns
    -------
    float
        Area under the ROC curve.
    """
    from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

    y_pred = clf.predict(test)
    predicted_classes = y_pred['classes']
    positive_scores = y_pred['probabilities'][:, 1]

    roc_auc = roc_auc_score(test_labels, positive_scores)
    # Printed in this order: accuracy first, then F1.
    print(accuracy_score(test_labels, predicted_classes))
    print(f1_score(test_labels, predicted_classes))
    return roc_auc

In [None]:
# Train the classifier on the transformed training features.
clf = get_dnn_clf(train=train_set_tr, train_labels=train_labels)

In [None]:
# Each call prints accuracy and F1, and the returned ROC AUC is printed after
# them: first for the test set, then for the training set.
print(get_auc(clf, test=test_set_tr, test_labels=test_labels))
print(get_auc(clf, test=train_set_tr, test_labels=train_labels))