# Train and save final model

Run all cells to train model as described below and score all 14,707 genes

## Final model details:
- Train using all 70 curated genes as positives, random set of 200 held out as negative set

- Random undersampling to achieve 1:2 positive to negative ratio

- Gene-module cosine similarity scores as features

- Feature selection to pick top 64 modules based on ANOVA F-value

In [1]:
from refined_model import *
# M = Modules()

In [2]:
# train and save model
final_model = Nash_Model(save_path='../results/final_model_svc', 
                    sample=RandomUnderSampler(sampling_strategy=.5, random_state=0),
                    use_modules=True,
                    feat_sel=True)
final_model.train_all_curated()

# score all genes and save
final_model.score_all_genes()

## Drug scores

In [86]:
scores = pd.read_csv('../results/all_gene_scores.csv', index_col=0)

In [87]:
drug_modules = pd.read_csv('../data/gene_maps/drug.csv')
drugs = set(drug_modules['module'])
drug_list = drug_modules.set_index('gene')

In [88]:
drug_targets_scored = drug_list.join(scores, how='left').dropna()
drug_targets_scored.to_csv('../results/drug_targets_scored.csv')

In [89]:
# load svensson 909
svensson_909 = pd.read_csv('../data/svensson_909.csv').dropna()
svensson_909.index = svensson_909['Gene'].str.upper()


In [90]:

scores_fc = scores.join(svensson_909,  how='inner') # 607 overlap
scores_fc = scores_fc[scores_fc.iloc[:,5] < 10]
np.corrcoef(scores_fc['1.00'], np.log(scores_fc.iloc[:,5]))

KeyError: '1.00'

In [91]:
scores['In Svensson 909'] = [int(g in list(svensson_909['Gene'])) for g in scores.index]
# scores.sort_values(1, ascending=False).to_csv('gene_scores.csv')

In [92]:
np.sum(sven_data_filt['high Log2 Fold Change'] > 2)

NameError: name 'sven_data_filt' is not defined

In [96]:
scores

Unnamed: 0_level_0,1,known,In Svensson 909
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PTGS2,0.981269,0,0
HMOX1,0.976353,0,0
NFE2L2,0.968064,1,0
NOS2,0.965863,0,0
POR,0.965423,0,0
...,...,...,...
CAND2,0.002541,0,0
RSPH9,0.002440,0,0
LRRC6,0.002370,0,0
RSPH6A,0.002367,0,0


In [97]:
# get roc thresholds based on svensson genes
all_genes = list(set(scores.index) - set(nash_svc.M.curated_genes) - set(nash_svc.M.befree_genes))
y_true = [int(g in nash_svc.M.sven_genes) for g in all_genes]
y_pred = list(scores.loc[all_genes, '1'])
fpr, tpr, thresholds = roc_curve(y_true, y_pred)
roc_sven = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold':thresholds})
roc_sven.to_csv('roc_thresholds_118sven.csv', index=None)

In [98]:
roc_sven

Unnamed: 0,fpr,tpr,threshold
0,0.000000,0.000000,1.981269
1,0.000070,0.000000,0.981269
2,0.001689,0.000000,0.938283
3,0.001689,0.008475,0.938040
4,0.001759,0.008475,0.936882
...,...,...,...
228,0.906347,0.983051,0.020328
229,0.906347,0.991525,0.020323
230,0.910639,0.991525,0.019851
231,0.910639,1.000000,0.019831


Unnamed: 0.1,Unnamed: 0,0
0,0,CTNNBL1
1,1,SAMHD1
2,2,PTAR1
3,3,DNTTIP1
4,4,UBIAD1
...,...,...
195,195,PRR9
196,196,DNAJB8
197,197,GSTA2
198,198,RHPN1


In [73]:
set(pd.read_csv('tester1/neg_genes.csv')['0']) - set(pd.read_csv('tester2/neg_genes.csv')['0'])

set()

In [75]:
set(pd.read_csv('tester1/pos_genes')['0']) - set(pd.read_csv('tester2/pos_genes')['0'])

set()

In [80]:
pg = set(pd.read_csv('tester1/pos_genes')['0'])
ng = set(pd.read_csv('tester1/neg_genes.csv')['0'])

In [76]:
f = pd.read_csv('tester1/initial.csv', index_col=0)
g = pd.read_csv('tester2/initial.csv', index_col=0)

a = pd.read_csv('tester1/train.csv', index_col=0)
b = pd.read_csv('tester2/train.csv', index_col=0)

c = pd.read_csv('tester1/fs.csv',index_col=0)
d = pd.read_csv('tester2/fs.csv', index_col=0)

In [77]:
f.equals(g)

True

In [78]:
set(a.index) - set(b.index)

set()

In [79]:
a.equals(b)

False

In [83]:
a.loc[list(pg) + list(ng)]

Unnamed: 0_level_0,ADIPOGENESIS,ALLOGRAFT_REJECTION,ANDROGEN_RESPONSE,ANGIOGENESIS,APICAL_JUNCTION,APICAL_SURFACE,APOPTOSIS,BILE_ACID_METABOLISM,CHOLESTEROL_HOMEOSTASIS,COAGULATION,...,urea_cycle,valine_leucine_and_isoleucine_metabolism,vitamin_a_metabolism,vitamin_b12_metabolism,vitamin_b2_metabolism,vitamin_b6_metabolism,vitamin_c_metabolism,vitamin_d_metabolism,vitamin_e_metabolism,xenobiotics_metabolism
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT1A,0.638124,0.074032,0.546917,0.225074,-0.018202,0.293098,0.312357,0.769378,0.762819,0.167644,...,0.327098,0.603101,0.360086,0.343108,0.119265,0.361949,0.330484,0.616571,0.701247,0.415088
TRIB3,0.496950,0.204305,0.628626,0.252077,0.245790,0.343529,0.371258,0.488844,0.703505,0.201095,...,0.194629,0.404846,0.259249,0.166188,0.020519,0.260841,0.214467,0.530068,0.457082,0.478351
CYP17A1,0.327990,-0.015306,0.394322,-0.007150,-0.026606,0.048858,0.078436,0.487914,0.551646,0.040801,...,0.113179,0.449445,0.596715,0.292640,-0.060755,0.483407,0.311143,0.644602,0.500899,0.829585
ADH1A,0.547281,0.006771,0.364215,0.061141,0.015668,0.209684,-0.039076,0.673422,0.573842,0.080028,...,0.353447,0.787281,0.889081,0.481717,0.166528,0.745825,0.872030,0.625506,0.715842,0.689519
RAG2,0.576241,0.209355,0.513710,0.184173,0.248358,0.263374,0.558442,0.142628,0.369979,0.158196,...,0.307467,0.348865,-0.003475,0.245437,0.216907,0.169881,0.211526,0.120868,0.173437,0.182740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FPR2,0.098990,0.343000,-0.142509,0.411302,0.222248,0.443735,-0.025832,0.008350,0.017403,0.387177,...,-0.090840,-0.076739,0.074752,-0.221555,0.051254,0.054817,0.049466,-0.051573,0.045523,0.031101
ZNF232,0.412187,0.230848,0.427740,0.141305,0.272414,0.222914,0.217249,0.305139,0.420851,0.150995,...,0.256959,0.478449,0.363587,0.512810,0.231945,0.379719,0.463833,0.311713,0.259372,0.223250
RHOB,-0.022909,0.057007,-0.087626,0.136955,0.399799,0.080115,0.124519,-0.261896,-0.199724,0.089538,...,-0.040571,-0.163455,-0.127206,-0.195141,-0.061198,0.016276,-0.054342,-0.245947,-0.140639,0.011176
SRSF10,-0.047244,-0.407425,-0.049352,-0.196077,-0.244607,-0.155858,-0.420031,0.009215,0.101278,-0.144853,...,0.226392,-0.030101,0.105742,0.003630,0.054283,-0.020474,-0.051344,0.137331,-0.012461,0.132764


In [72]:
b

Unnamed: 0_level_0,ADIPOGENESIS,ALLOGRAFT_REJECTION,ANDROGEN_RESPONSE,ANGIOGENESIS,APICAL_JUNCTION,APICAL_SURFACE,APOPTOSIS,BILE_ACID_METABOLISM,CHOLESTEROL_HOMEOSTASIS,COAGULATION,...,urea_cycle,valine_leucine_and_isoleucine_metabolism,vitamin_a_metabolism,vitamin_b12_metabolism,vitamin_b2_metabolism,vitamin_b6_metabolism,vitamin_c_metabolism,vitamin_d_metabolism,vitamin_e_metabolism,xenobiotics_metabolism
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABCB4,0.731952,0.033840,0.727006,0.294341,0.147038,0.437815,0.259818,0.767907,0.854127,0.267178,...,0.302068,0.686966,0.594484,0.395372,0.212043,0.499684,0.480408,0.707322,0.746391,0.617182
ABCC2,0.249302,0.013554,0.380976,0.171583,0.078824,0.153126,0.083886,0.380277,0.517364,0.221974,...,0.056145,0.337535,0.483824,0.234132,0.039070,0.265698,0.227934,0.615818,0.319204,0.470687
ACE,0.472890,0.274766,0.355407,0.551605,0.327033,0.415009,0.278939,0.340750,0.515704,0.601514,...,0.218363,0.254618,0.132055,0.065969,0.039273,0.135991,0.161949,0.522494,0.254532,0.221785
ADH1A,0.547281,0.006771,0.364215,0.061141,0.015668,0.209684,-0.039076,0.673422,0.573842,0.080028,...,0.353447,0.787281,0.889081,0.481717,0.166528,0.745825,0.872030,0.625506,0.715842,0.689519
ADH1B,0.484780,0.008208,0.342685,0.024369,-0.011918,0.150956,-0.024183,0.660618,0.546537,0.036280,...,0.271853,0.751962,0.915437,0.448882,0.095612,0.755215,0.810511,0.639659,0.718835,0.764149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ESRRG,0.590054,0.142703,0.677057,0.190793,0.161355,0.305638,0.309037,0.503149,0.667857,0.163430,...,0.367516,0.447393,0.265891,0.345262,0.122304,0.183450,0.254814,0.500382,0.419042,0.342648
TWF1,0.033712,-0.226652,0.152331,-0.382752,-0.216514,-0.299574,-0.093160,0.053864,0.002067,-0.313053,...,0.447833,0.175173,0.000868,0.342795,0.061704,0.121173,0.201307,-0.060668,0.082636,0.035557
BCAS4,0.135454,-0.106705,0.017283,-0.149982,-0.044902,0.082617,-0.251419,0.123907,0.082530,-0.051507,...,0.261993,0.080939,0.060541,-0.052670,0.039904,0.091257,0.180724,0.094706,0.062182,-0.150438
TAAR6,0.023055,0.002157,-0.095980,-0.102393,0.014711,-0.035022,-0.240271,0.108648,0.031535,-0.086986,...,0.114534,0.100913,0.168307,0.182354,-0.140444,0.028049,0.235402,0.148321,0.076207,0.086718
