In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.cross_validation import KFold, StratifiedKFold

In [4]:
from scipy.stats import pearsonr, spearmanr, kendalltau

In [5]:
import oddt
from oddt.scoring.functions import rfscore
from oddt.scoring import ensemble_model, scorer
from oddt.scoring.models.regressors import randomforest, svm, neuralnetwork
from oddt.metrics import enrichment_factor, roc_auc, roc_log_auc
from sklearn.metrics import accuracy_score, precision_score, mean_squared_error, recall_score
from sklearn.utils import compute_sample_weight

In [6]:
# from cluster_helper import cluster
# cluster.dill = dill

In [7]:
# from cluster_helper.cluster import cluster_view

In [8]:
# Target identifiers to process, in alphabetical order. Each id is used as a
# directory name when the descriptor files are loaded.
dude_ids = [
    'aa2ar', 'abl1', 'ace', 'aces', 'ada', 'ada17', 'adrb1', 'adrb2',
    'akt1', 'akt2', 'aldr', 'ampc', 'andr', 'aofb',
    'bace1', 'braf',
    'cah2', 'casp3', 'cdk2', 'comt', 'cp2c9', 'cp3a4', 'csf1r', 'cxcr4',
    'def', 'dhi1', 'dpp4', 'drd3', 'dyr',
    'egfr', 'esr1', 'esr2',
    'fa10', 'fa7', 'fabp4', 'fak1', 'fgfr1', 'fkb1a', 'fnta', 'fpps',
    'gcr', 'glcm', 'gria2', 'grik1',
    'hdac2', 'hdac8', 'hivint', 'hivpr', 'hivrt', 'hmdh', 'hs90a', 'hxk4',
    'igf1r', 'inha', 'ital',
    'jak2',
    'kif11', 'kit', 'kith', 'kpcb',
    'lck', 'lkha4',
    'mapk2', 'mcr', 'met', 'mk01', 'mk10', 'mk14', 'mmp13', 'mp2k1',
    'nos1', 'nram',
    'pa2ga', 'parp1', 'pde5a', 'pgh1', 'pgh2', 'plk1', 'pnph',
    'ppara', 'ppard', 'pparg', 'prgr', 'ptn1', 'pur2', 'pygm', 'pyrd',
    'reni', 'rock1', 'rxra',
    'sahh', 'src',
    'tgfr1', 'thb', 'thrb', 'try1', 'tryb1', 'tysy',
    'urok',
    'vgfr2',
    'wee1',
    'xiap',
]
#dude_ids = ['lck', 'src', 'ada17', 'hivpr', 'mk14', 'mmp13', 'aa2ar', 'bace1', 'pparg', 'parp1', 'ace', 'thrb', 'cdk2', 'esr1', 'esr2', 'vgfr2', 'fnta', 'drd3', 'csf1r', 'dhi1', 'casp3', 'gria2', 'kit', 'dyr', 'braf', 'tryb1', 'hdac8', 'aldr', 'akt1', 'ital', 'kpcb', 'tysy', 'ppard', 'hivint', 'ppara', 'urok', 'wee1', 'reni', 'grik1', 'aces', 'fa10', 'dpp4', 'adrb2', 'jak2', 'hivrt', 'fkb1a', 'cah2', 'kif11', 'try1', 'adrb1', 'akt2', 'rock1', 'pa2ga', 'pygm', 'mapk2', 'fa7', 'tgfr1', 'mk10', 'fak1', 'gcr', 'hdac2', 'prgr', 'ptn1', 'nram', 'abl1', 'hs90a', 'egfr', 'hxk4', 'mk01', 'cxcr4', 'lkha4', 'ada', 'pur2', 'pnph', 'andr', 'rxra', 'fpps', 'cp3a4', 'met', 'ampc', 'mp2k1', 'pyrd', 'pgh1', 'kith', 'thb', 'comt', 'cp2c9', 'aofb', 'fabp4', 'mcr', 'inha', 'pgh2', 'def', 'xiap', 'glcm', 'pde5a', 'nos1', 'sahh', 'hmdh', 'igf1r', 'plk1', 'fgfr1']

In [9]:
# Display the version of NumPy loaded in this kernel.
np.__version__

'1.10.4'

In [None]:
%%time
# Pooled ("horizontal") cross-validation over all targets in `dude_ids`.
# For every (engine, descriptor version) pair:
#   1. read the per-target actives/decoys descriptor tables,
#   2. split each target into stratified folds and pool fold k of every target,
#   3. fit one random forest per fold on the pooled training part,
#   4. predict the pooled held-out part and append one metrics dict to `out`.
# Per-compound predictions go to dude_pred_horizontal_v<v>_e<engine>.csv and
# the per-fold metrics to dude_horizontal.csv.
out = []

n_folds = 5

# Threshold separating actives from inactives, and the activity value given
# to decoys. With act_cutoff <= 10 the 'act' column is converted with
# -log10(act * 1e-9) (presumably nM -> pK; verify against the input files),
# so decoys get 10**(9 - inactive_cutoff), which maps back to inactive_cutoff.
act_cutoff = 6.
inactive_cutoff = 5.95

# A cutoff above 10 means 'act' is kept in raw units instead of the log scale.
# The original tested this with '>' in three places and '>=' in one; a single
# flag keeps every branch consistent.
raw_units = act_cutoff > 10

# Per descriptor version: the integer column keys holding the features (used
# both for read_csv's dtype map and to select model inputs) and their dtype.
descriptor_layout = {
    1: (list(range(1, 37)), np.uint16),
    2: (list(range(1, 217)), np.uint16),
    3: (list(range(1, 43)), np.float16),
}

for engine, v in [(e, v) for e in ['dude', 'dock', 'vina'] for v in [1,2,3]]:
    col_range, np_type = descriptor_layout[v]
    dtypes = dict.fromkeys(col_range, np_type)

    # Fold-wise lists of per-target frames; concatenated once after the loop
    # instead of re-copying the growing table for every target.
    train_parts = [[] for _ in range(n_folds)]
    test_parts = [[] for _ in range(n_folds)]

    for dude_id in dude_ids:
        # Best-effort load: a target whose files are missing or unreadable is
        # skipped, but the reason is reported and Ctrl-C still interrupts.
        try:
            actives_full = pd.read_csv('head1_full/%s/%s/%s_desc_v%i.csv.gz' % (dude_id, engine, 'actives', v), dtype=dtypes)
            decoys_full = pd.read_csv('head1_full/%s/%s/%s_desc_v%i.csv.gz' % (dude_id, engine, 'decoys', v), dtype=dtypes)
        except Exception as err:
            print('skipping %s (engine %s, v%i): %s' % (dude_id, engine, v, err))
            continue

        # Skip targets with no actives or no decoys.
        if len(actives_full) == 0 or len(decoys_full) == 0:
            continue

        decoys_full['act'] = inactive_cutoff if raw_units else 10**(9-inactive_cutoff)

        # One table per target: actives followed by decoys.
        data = pd.concat((actives_full, decoys_full))

        # Normalise the activity column.
        if raw_units:
            data['act'] = np.clip(data['act'], 1e-9, inactive_cutoff)
        else:
            data['act'] = np.clip(-np.log10(np.clip(data['act'], 1e-9, 1e9) * 1e-9), 0 , 15)

        # Binary active/inactive label.
        data['act_bin'] = data['act'] < act_cutoff if raw_units else data['act'] > act_cutoff

        # Manual CV; stratified so every fold keeps the active/inactive ratio.
        oddt.random_seed(0)
        cv_split = StratifiedKFold(data['act_bin'], n_folds=n_folds, shuffle=True)
        for n, (cv_train, cv_test) in enumerate(cv_split):
            train_parts[n].append(data.iloc[cv_train])
            test_parts[n].append(data.iloc[cv_test])

    # Nothing could be loaded for this combination: report it and move on
    # rather than failing later when indexing an empty list.
    if not train_parts[0]:
        print('no data for engine %s, v%i - skipping' % (engine, v))
        continue

    train = [pd.concat(parts) for parts in train_parts]
    test = [pd.concat(parts) for parts in test_parts]
    del train_parts, test_parts

    # Number of features tried per split.
    mtry = 100 if v == 2 else 15

    rfs = []
    for n in range(n_folds):
        oddt.random_seed(0)
        rf = randomforest(n_estimators=500, n_jobs=-1, verbose=1,
                          max_features=mtry,
                          oob_score=True,
                          bootstrap=True,
                          random_state=0)
        # 'balanced' sample weights are derived from the binary labels.
        rf.fit(train[n][col_range],
               train[n]['act'],
               sample_weight=compute_sample_weight('balanced', train[n]['act_bin']))
        rfs.append(rf)

    test_dfs = []
    for n in range(n_folds):
        rfs[n].verbose = 0
        test[n]['pred'] = rfs[n].predict(test[n][col_range])
        test[n]['pred_bin'] = test[n]['pred'] < act_cutoff if raw_units else test[n]['pred'] > act_cutoff

        # Rank compounds by predicted activity, highest first (log scale).
        test_df = test[n].sort_values('pred', ascending=False)
        test_dfs.append(test_df)

        y_true = test_df['act_bin']
        y_pred_bin = test_df['pred_bin']
        act = test_df['act']
        pred = test_df['pred']

        n_total = len(y_true)
        n_top = n_total // 100            # size of the top 1% of the ranking
        a_top = y_true.iloc[:n_top].sum()
        a_total = y_true.sum()

        # NOTE(review): roc_auc, roc_log_auc and enrichment_factor are fed the
        # thresholded predictions (pred_bin), not the continuous score (pred).
        # Kept as in the original so results stay comparable; confirm that this
        # is intended.
        d = {'engine': engine,
             'v': v,
             'roc_auc': roc_auc(y_true, y_pred_bin, ascending_score=False),

             'n_1perc': n_top,
             'n_100perc': n_total,
             'a_1perc': a_top,
             'a_100perc': a_total,
             # Folds with fewer than 100 rows have an empty top 1%.
             'hitrate_1perc': float(a_top)/float(n_top) if n_top else float('nan'),
             'hitrate_100perc': float(a_total)/float(n_total),

             'ef1_perc': enrichment_factor(y_true, y_pred_bin, kind='percentage'),
             'ef0.1': enrichment_factor(y_true, y_pred_bin, percentage=0.1),
             'ef1': enrichment_factor(y_true, y_pred_bin),
             'ef2': enrichment_factor(y_true, y_pred_bin, percentage=2),
             'ef5': enrichment_factor(y_true, y_pred_bin, percentage=5),
             'ef10': enrichment_factor(y_true, y_pred_bin, percentage=10),

             'rp': pearsonr(act, pred)[0],
             'rs': spearmanr(act, pred)[0],
             'rk': kendalltau(act, pred)[0],

             'rp_active': pearsonr(act[y_true], pred[y_true])[0],
             'rs_active': spearmanr(act[y_true], pred[y_true])[0],
             'rk_active': kendalltau(act[y_true], pred[y_true])[0],

             'rp_inactive': pearsonr(act[~y_true], pred[~y_true])[0],
             'rs_inactive': spearmanr(act[~y_true], pred[~y_true])[0],
             'rk_inactive': kendalltau(act[~y_true], pred[~y_true])[0],

             'mse': mean_squared_error(act, pred),
             'roc_log_auc': roc_log_auc(y_true, y_pred_bin, ascending_score=False),
             'precision': precision_score(y_true, y_pred_bin),
             'accuracy': accuracy_score(y_true, y_pred_bin),
             'recall': recall_score(y_true, y_pred_bin),
            }
        print(d)
        out.append(d)

    # Assumes the descriptor CSVs provide a 'dude_id' column - TODO confirm.
    pd.concat(test_dfs)[['act', 'act_bin', 'pred', 'pred_bin', 'dude_id']].to_csv('dude_pred_horizontal_v%i_e%s.csv' % (v,engine), index=False)

pd.DataFrame(out).to_csv('dude_horizontal.csv')

[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.8min finished
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.9min finished
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.9min finished
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.9min finished
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.8min finished


{'rs_inactive': 0.0091554495961674275, 'ef10': 6.3915929203539825, 'rk_inactive': 0.0074772079991085073, 'ef1_perc': 0.4772808586762075, 'a_100perc': 4520, 'rk_active': 0.17389351956967933, 'n_100perc': 279460, 'rp': 0.45843816511978136, 'rs': 0.15940183370490155, 'rp_active': 0.28115104446562134, 'mse': 0.053148846793100751, 'n_1perc': 2794, 'rk': 0.12979719052056218, 'engine': 'dude', 'rs_active': 0.25547424280087039, 'a_1perc': 1334, 'precision': 0.078103377115249215, 'roc_auc': 0.78680776529843033, 'roc_log_auc': 0.24528300026119443, 'ef1': 29.509050611870126, 'ef2': 19.610025013060618, 'ef5': 10.504424778761063, 'hitrate_100perc': 0.016174049953481713, 'recall': 0.71172566371681412, 'rp_inactive': 0.014782723134440179, 'hitrate_1perc': 0.47745168217609163, 'v': 1, 'ef0.1': 58.073625158027816, 'accuracy': 0.85946110355685967}
{'rs_inactive': 0.008091784104414225, 'ef10': 6.684217305447214, 'rk_inactive': 0.006607323254506592, 'ef1_perc': 0.4930232558139535, 'a_100perc': 4503, 'rk_a

[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 14.0min finished
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 14.0min finished
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 13.9min finished
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 13.8min finished
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 13.8min finished


{'rs_inactive': 0.0065242341164906473, 'ef10': 7.303097345132745, 'rk_inactive': 0.0053273513764506494, 'ef1_perc': 0.6110912343470483, 'a_100perc': 4520, 'rk_active': 0.1892252300622734, 'n_100perc': 279460, 'rp': 0.53647519268890853, 'rs': 0.17469654462459136, 'rp_active': 0.31446850193051895, 'mse': 0.050484789980424369, 'n_1perc': 2794, 'rk': 0.14226074932094074, 'engine': 'dude', 'rs_active': 0.27783650314260588, 'a_1perc': 1708, 'precision': 0.084347620175230884, 'roc_auc': 0.82370575140770863, 'roc_log_auc': 0.25386963190265771, 'ef1': 37.78220273243941, 'ef2': 24.808395997910303, 'ef5': 12.650442477876107, 'hitrate_100perc': 0.016174049953481713, 'recall': 0.78805309734513274, 'rp_inactive': 0.018395105697108491, 'hitrate_1perc': 0.6113099498926271, 'v': 2, 'ef0.1': 61.60662136536031, 'accuracy': 0.85820510985471987}
{'rs_inactive': 0.0087286989840496872, 'ef10': 7.479217237457214, 'rk_inactive': 0.0071270619407165512, 'ef1_perc': 0.6268336314847943, 'a_100perc': 4503, 'rk_acti

[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.6min finished
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.5min finished


In [None]:
# Average each per-fold metric for every (engine, descriptor version) pair.
fold_metrics = pd.DataFrame(out)
fold_metrics.groupby(['engine', 'v']).mean()