In [39]:
from refined_model import *
from modules import *
import sys

# 1) Do model benchmarking: 
## Benchmark model choices:
10-fold cross-validation and evaluation on the befree/svensson sets with the following model choices:

    1) Features: gene embeddings vs module scores vs module scores with feature selection
    2) Sampling: SMOTE vs 1:2 undersampling
    3) Model type: linear svc
   


In [4]:
# Feature sets to compare: gene embeddings, module scores, and module scores
# with feature selection.
features = ['embeddings', 'mod_scores', 'fs_mod_scores']

# (label, sampler) pairs. The label is reused when building result paths and keys.
# sampling_strategy=.5 corresponds to the 1:2 undersampling described above.
under_sampler = RandomUnderSampler(sampling_strategy=.5, random_state=0)
smote_sampler = SMOTE(random_state=0)
sampling = [('under', under_sampler), ('over', smote_sampler)]

# Only the linear SVC is benchmarked.
model_type = ['SVC']

In [23]:
path = '../results/benchmarking'

results_summary = {}
for mod in model_type:
    for feat in features:
        for samp_label, sample in sampling:
            # One output directory and one summary key per configuration.
            new_path = '{}/{}/{}/{}'.format(path, feat, samp_label, mod)
            config_name = '{}_{}_{}'.format(mod, feat, samp_label)

            # The feature-set name drives the flags: 'mod' turns on module
            # scores and 'fs' turns on feature selection.
            wants_modules = 'mod' in feat
            wants_feat_sel = 'fs' in feat

            # Benchmark this configuration using all 70 genes.
            final_model = Nash_Model(
                save_path=new_path,
                sample=sample,
                use_modules=wants_modules,
                feat_sel=wants_feat_sel,
                model_type=mod,
            )
            final_model.train_all_curated(bench=True)

            # Read the benchmarking table found under new_path (presumably
            # written by the call above). header=1 takes the column names
            # from the file's second line.
            bench_csv = '{}/{}'.format(new_path, 'benchmarking output.csv')
            results_summary[config_name] = pd.read_csv(bench_csv, index_col=0, header=1)
            

In [28]:
# Stack the per-configuration benchmarking tables into a single frame and
# label every row with the configuration it came from.
summary_tables = list(results_summary.values())

# Repeat each configuration name once per row of its own table instead of a
# hard-coded 2, so the labels stay aligned with the stacked rows even if a
# table does not have exactly two metric rows.
rows_per_table = [len(table) for table in summary_tables]
labels = np.repeat(list(results_summary.keys()), rows_per_table)

# reset_index moves each table's original index into a regular column
# before the configuration labels take over as the index.
full_summary = pd.concat(summary_tables).reset_index()
full_summary.index = labels

In [31]:
# Layout of full_summary:
#   index  = model configuration
#   label  = AUROC or AP score
#   befree = score from training on all 70 and testing on befree
#   sven   = score from training on all 70 and testing on svensson set
#   1-10   = results from model cross validation
summary_csv = '../results/benchmarking/full_summary.csv'
full_summary.to_csv(summary_csv)
full_summary

Unnamed: 0,label,befree,sven,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0
SVC_embeddings_under,roc,0.753896,0.778771,0.823925,0.662319,0.840114,0.758438,0.938003,0.832976,0.922606,0.879882,0.958601,0.92903
SVC_embeddings_under,ap,0.833426,0.733746,0.034257,0.029465,0.154617,0.115101,0.042375,0.038463,0.192395,0.060435,0.117,0.032291
SVC_embeddings_over,roc,0.727776,0.738729,0.907989,0.833503,0.7824,0.946773,0.843479,0.899256,0.953809,0.838177,0.946671,0.844389
SVC_embeddings_over,ap,0.831942,0.672706,0.035793,0.047778,0.089668,0.267157,0.0794,0.062623,0.174568,0.10601,0.155969,0.02456
SVC_mod_scores_under,roc,0.815049,0.825085,0.91074,0.833299,0.909147,0.93678,0.866626,0.914857,0.963903,0.890384,0.939431,0.928554
SVC_mod_scores_under,ap,0.859114,0.72454,0.074809,0.025108,0.155655,0.420924,0.029657,0.050438,0.186497,0.033222,0.117562,0.074548
SVC_mod_scores_over,roc,0.79543,0.795275,0.922458,0.866619,0.867544,0.928215,0.902825,0.92954,0.967166,0.851942,0.938105,0.890276
SVC_mod_scores_over,ap,0.862097,0.697105,0.192239,0.028279,0.275928,0.185671,0.041472,0.048988,0.467563,0.037677,0.120872,0.037936
SVC_fs_mod_scores_under,roc,0.815747,0.833856,0.885674,0.860404,0.908025,0.94973,0.864994,0.930356,0.96319,0.885184,0.920465,0.945316
SVC_fs_mod_scores_under,ap,0.872733,0.739191,0.050245,0.022703,0.222032,0.493133,0.031949,0.058809,0.17594,0.032013,0.120278,0.063434


## Benchmark Gene cutoffs

Benchmark different cutoffs of the befree and svensson genes using the final model (linear SVC, undersampling, module scores with feature selection).

In [40]:
# The scripts directory has to be importable for unpickling the saved model.
# The membership check keeps re-runs of this cell from appending duplicates.
scripts_dir = r'../../scripts'
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# Open the pickle in a context manager so the file handle is closed promptly.
# NOTE(review): `pkl` is presumably the pickle module; unpickling can execute
# arbitrary code, so only load model files produced by this project.
with open('../results/final_model_svc/nash_model_trained.pkl', 'rb') as model_file:
    nash_svc = pkl.load(model_file)

# Handles to the fitted pieces stored on the trained model object.
feature_selector = nash_svc.skb
model = nash_svc.clf
# Bare attribute access kept from the original; nothing is displayed because
# it is not the cell's last expression.
nash_svc.neg_test_genes
mods = Modules()


### Different logfc scores from svensson genes

In [48]:
# logFC cutoffs used to select the svensson positive genes.
svensson_cutoffs = [0, 1.5, 2, 2.5, 3]

# Evaluate the trained model at each cutoff against the same negative test genes.
results = {}
for threshold in svensson_cutoffs:
    pos_genes = mods.load_svensson_genes(threshold)
    test_X, test_y = nash_svc.format_input(pos_genes, nash_svc.neg_test_genes)
    roc, ap = nash_svc.test(test_X, test_y)
    results[threshold] = [roc, ap]

# One column per cutoff, rows 'roc' and 'ap'; save the table, then display it.
svensson_scores = pd.DataFrame(results, index=['roc', 'ap'])
svensson_scores.to_csv('../results/benchmarking/svensson_cutoffs.csv')
svensson_scores

Unnamed: 0,0.0,1.5,2.0,2.5,3.0
roc,0.833856,0.837202,0.877785,0.897818,0.858542
ap,0.739191,0.733836,0.730715,0.693528,0.468934


### Different scores for Befree genes 
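Note: the befree score cutoff has little effect on AUROC (0.79 to 0.84 across cutoffs), but AP falls steadily as the cutoff increases (0.87 down to 0.44).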

In [47]:
# Score cutoffs used to select the befree positive genes
# (-1 presumably keeps every gene; confirm in load_befree_genes).
befree_cutoffs = [-1, .01, .02, .03, .04]

# Evaluate the trained model at each cutoff against the same negative test genes.
results = {}
for threshold in befree_cutoffs:
    pos_genes = mods.load_befree_genes(threshold)
    test_X, test_y = nash_svc.format_input(pos_genes, nash_svc.neg_test_genes)
    roc, ap = nash_svc.test(test_X, test_y)
    results[threshold] = [roc, ap]

# One column per cutoff, rows 'roc' and 'ap'; save the table, then display it.
befree_scores = pd.DataFrame(results, index=['roc', 'ap'])
befree_scores.to_csv('../results/benchmarking/befree_cutoffs.csv')
befree_scores

Unnamed: 0,-1.00,0.01,0.02,0.03,0.04
roc,0.815747,0.81487,0.844722,0.800588,0.787083
ap,0.872733,0.701529,0.611414,0.495746,0.435577


# 2) Train and save final model

Run all cells below to train the model as described and score all 14,707 genes.

## Final model details:
- Linear SVC

- Train using all 70 curated genes as positives, with a random set of 200 genes held out as the negative set

- Random undersampling to achieve 1:2 positive to negative ratio

- Gene-module cosine similarity scores as features

- Feature selection to pick top 64 modules based on ANOVA F-value

In [2]:
# Random undersampling to a 1:2 positive-to-negative ratio, with a fixed seed.
undersampler = RandomUnderSampler(sampling_strategy=.5, random_state=0)

# Final configuration: linear SVC on module scores with feature selection.
final_model = Nash_Model(
    save_path='../results/final_model_svc',
    sample=undersampler,
    use_modules=True,
    feat_sel=True,
    model_type='SVC',
)

# Fit on all 70 curated genes as positives, then score every gene and save.
final_model.train_all_curated()
final_model.score_all_genes()

In [5]:
# Read the all-gene score table from the final model's results directory and display it.
gene_scores = pd.read_csv('../results/final_model_svc/all_gene_scores.csv')
gene_scores

Unnamed: 0,0,score,known
0,PTGS2,0.981269,0
1,HMOX1,0.976353,0
2,NFE2L2,0.968064,1
3,NOS2,0.965863,0
4,POR,0.965423,0
...,...,...,...
14702,CAND2,0.002541,0
14703,RSPH9,0.002440,0
14704,LRRC6,0.002370,0
14705,RSPH6A,0.002367,0
