In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd
import os
import numpy as np
from sklearn.metrics import log_loss,roc_curve,auc
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pickle

In [98]:
input_filename="training_formatted_11_18_even_smaller.txt"

np.random.seed(0)
# 10 folds
k=10

cs=pd.read_csv(input_filename,sep='\t')
features=["mean_exp","damaging_ratio","non_damaging_ratio","n_male","n_female","n_COSMIC","n_TCGA"]
#features=["mean_exp","damaging_ratio","n_TCGA","n_female","n_COSMIC"]

#shuffle training and holdout data
cs=cs.sample(frac=1,random_state=1)
cs.reset_index(inplace=True)

cs_holdout=cs.sample(frac=0.1,random_state=1)

cs_prep=cs.loc[~cs.index.isin(cs_holdout.index),:]
cs_prep.reset_index(inplace=True)
cs_holdout.reset_index(inplace=True)

cs_k_fold=np.array_split(cs_prep,k)
cs_valid=pd.DataFrame()

for fold in range(k):
    cs_train=pd.DataFrame()
    TP=0
    FP=0
    FN=0
    TN=0
    
    #iterate through fold
    for number,group in enumerate(cs_k_fold):
        if fold==number:
            cs_valid=group
        else:
            cs_train=cs_train.append(group)
        
    cs_train=cs_train.reset_index(drop=True)
    cs_valid=cs_valid.reset_index(drop=True)

    clf = RandomForestClassifier(n_jobs=2, random_state=0)

    clf.fit(cs_train[features], cs_train['sig_dep'])
    print(clf.feature_importances_)
    
    base_X=clf.predict(cs_valid[features])
    
    for i in np.arange(0,len(base_X)):
        if base_X[i]==cs_valid.loc[i,'sig_dep'] and cs_valid.loc[i,'sig_dep']==1:
            TP+=1
        elif base_X[i]==cs_valid.loc[i,'sig_dep'] and cs_valid.loc[i,'sig_dep']==0:
            TN+=1
        elif base_X[i]!=cs_valid.loc[i,'sig_dep'] and cs_valid.loc[i,'sig_dep']==1:
            FN+=1
        elif base_X[i]!=cs_valid.loc[i,'sig_dep'] and cs_valid.loc[i,'sig_dep']==0:
            FP+=1
            
    acc=(TP+TN)/len(base_X)
    
    tpr=TP/(TP+FN)
    fpr=FP/(FP+TN)
    specificity=TN/(TN+FP)
    print(["Accuracy: "+str(acc),"sensitivity: "+str(tpr),
           "specificity: "+str(specificity),"false positive rate: "+str(fpr)])
    
    #loss=log_loss(base_X,cs_valid['sig_dep'])
    #print(loss)



[0.19869312 0.11131072 0.13024673 0.1164839  0.06596101 0.16758383
 0.20972069]
['Accuracy: 0.9024390243902439', 'sensitivity: 0.2', 'specificity: 1.0', 'false positive rate: 0.0']
[0.20686588 0.12350651 0.11940814 0.09847597 0.0820642  0.1422364
 0.2274429 ]
['Accuracy: 0.9024390243902439', 'sensitivity: 0.42857142857142855', 'specificity: 1.0', 'false positive rate: 0.0']
[0.28017035 0.08562715 0.12052016 0.10554907 0.08824024 0.13990886
 0.17998418]
['Accuracy: 0.975', 'sensitivity: 0.0', 'specificity: 1.0', 'false positive rate: 0.0']
[0.23792979 0.10497551 0.12471576 0.08797842 0.07381684 0.16429711
 0.20628657]
['Accuracy: 0.85', 'sensitivity: 0.25', 'specificity: 1.0', 'false positive rate: 0.0']
[0.27681769 0.11123435 0.13751422 0.12807483 0.08191548 0.09215662
 0.17228681]
['Accuracy: 0.95', 'sensitivity: 0.8', 'specificity: 0.9714285714285714', 'false positive rate: 0.02857142857142857']
[0.26428796 0.10813498 0.10137136 0.08738724 0.07711569 0.09754034
 0.26416244]
['Accurac

In [23]:
input_filename="training_formatted_11_18.txt"

np.random.seed(0)
# 10 folds
k=10

cs=pd.read_csv(input_filename,sep='\t')
features=["mean_exp","damaging_ratio","non_damaging_ratio","n_male","n_female","n_COSMIC","n_TCGA"]

#shuffle training and holdout data
cs=cs.sample(frac=1,random_state=1)
cs.reset_index(inplace=True)

cs_holdout=cs.sample(frac=0.1,random_state=1)

cs_prep=cs.loc[~cs.index.isin(cs_holdout.index),:]
cs_prep.reset_index(inplace=True)
cs_holdout.reset_index(inplace=True)

cs_k_fold=np.array_split(cs_prep,k)
cs_valid=pd.DataFrame()

for fold in range(k):
    cs_train=pd.DataFrame()
    TP=0
    FP=0
    FN=0
    TN=0
    
    #iterate through fold
    for number,group in enumerate(cs_k_fold):
        if fold==number:
            cs_valid=group
        else:
            cs_train=cs_train.append(group)
        
    cs_train=cs_train.reset_index(drop=True)
    cs_valid=cs_valid.reset_index(drop=True)

    clf = SVC(kernel='linear')
    y=pd.factorize(cs_train['sig_dep'])[0]

    clf.fit(cs_train[features], y)
    #print(clf.feature_importances_)
    
    base_X=clf.predict(cs_valid[features])
    
    for i in np.arange(0,len(base_X)):
        if base_X[i]==cs_valid.loc[i,'sig_dep'] and cs_valid.loc[i,'sig_dep']==1:
            TP+=1
        elif base_X[i]==cs_valid.loc[i,'sig_dep'] and cs_valid.loc[i,'sig_dep']==0:
            TN+=1
        elif base_X[i]!=cs_valid.loc[i,'sig_dep'] and cs_valid.loc[i,'sig_dep']==1:
            FN+=1
        elif base_X[i]!=cs_valid.loc[i,'sig_dep'] and cs_valid.loc[i,'sig_dep']==0:
            FP+=1
            
    acc=n_match/len(base_X)
    
    tpr=TP/(TP+FN)
    fpr=FP/(FP+TN)
    specificity=TN/(TN+FP)
    print([acc,tpr,specificity,fpr])
    
    #loss=log_loss(base_X,cs_valid['sig_dep'])
    #print(loss)



[0.9972565157750343, 0.0, 1.0, 0.0]
[0.9986263736263736, 0.0, 1.0, 0.0]
[0.9986263736263736, 0.0, 1.0, 0.0]
[0.9986263736263736, 0.0, 1.0, 0.0]
[0.9986263736263736, 0.0, 1.0, 0.0]
[0.9986263736263736, 0.0, 1.0, 0.0]
[0.9986263736263736, 0.0, 1.0, 0.0]
[0.9986263736263736, 0.0, 1.0, 0.0]
[0.9986263736263736, 0.0, 1.0, 0.0]
[0.9986263736263736, 0.0, 1.0, 0.0]


In [102]:
class NNet(nn.Module):
    # use prior architectures. 
    def __init__(self): #self,n_input_features,hidden_size,n_layers
        super(NNet, self).__init__()
        self.fc1 = nn.Linear(6, 72)
        self.dropout = nn.Dropout(0.1) 
        self.fc2 = nn.Linear(72, 72)
        self.fc3 = nn.Linear(72,1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.dropout(self.relu(self.fc1(x))) 
        x = self.dropout(self.relu(self.fc2(x)))
        x = self.sigmoid(self.fc3(x))
        return x

In [100]:
class csDataset(Dataset):
    """call stat dataset."""

    def __init__(self, df, transform=None):
        """
        Args:
            tsv_file (string): Path to the tsv file with annotations.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.cs_frame = df
        self.transform = transform

    def __len__(self):
        return len(self.cs_frame)

    def __getitem__(self, idx):
        items=self.cs_frame.loc[idx,["damaging_ratio","non_damaging_ratio","n_male","n_female","n_COSMIC","n_TCGA"]]
        #items=self.cs_frame.loc[idx,["mean_exp","n_TCGA"]]
        features = torch.FloatTensor(items) 
        target = torch.FloatTensor(pd.Series(self.cs_frame.loc[idx,'sig_dep'],dtype=np.float32).values)
        sample = {'target':target,
            'features': features}

        if self.transform:
            sample = self.transform(sample)

        return sample
        

In [103]:
input_filename="training_formatted_11_18_even_smaller.txt"
k=10
error_tol=1e-4

cs=pd.read_csv(input_filename,sep='\t')
cs=cs.sample(frac=1,random_state=1)
cs.reset_index(inplace=True)

#cs_holdout=cs.sample(frac=0.1,random_state=1)

#cs_prep=cs.loc[~cs.index.isin(cs_holdout.index),:]
#cs_prep.reset_index(inplace=True)
#cs_holdout.reset_index(inplace=True)

cs_k_fold=np.array_split(cs,k)
cs_valid=pd.DataFrame()

performance_k=[]
acc_k=[]
spec_k=[]
sens_k=[]

for fold in range(k):
    
    net = NNet()

    criterion = nn.BCELoss() 
    plt.gcf().clear()
    
    optimizer = optim.Adam(net.parameters(), lr=0.01) # maybe replace with Adam
    cs_train=pd.DataFrame()

    
    for number,group in enumerate(cs_k_fold):
        if fold==number:
            cs_valid=group
        else:
            cs_train=cs_train.append(group)
        
    cs_train=cs_train.reset_index(drop=True)
    cs_valid=cs_valid.reset_index(drop=True)
    transformed_dataset = csDataset(df=cs_train)

    cs = DataLoader(transformed_dataset, batch_size=10,
                        shuffle=True, num_workers=10)

    acc=0
    tp=0
    fp=0
    fn=0
    tn=0
    for epoch in range(20):
        running_loss = 0.0
        net.train()

        for i, data in enumerate(cs):
            targets, inputs = data['target'],data['features']

            optimizer.zero_grad()
            outputs = net(inputs)

            loss = criterion(outputs,targets)

            running_loss += loss.item()
#             if 1 in targets:
#                 print([targets,outputs,epoch,i,loss.item()])
            
            loss.backward()

            optimizer.step()
            
            for j in range(len(outputs)):
                if targets[j]==0 and outputs[j]<0.5:
                    tn+=1
                    acc+=1
                elif targets[j]==0 and outputs[j]>=0.5:
                    fp+=1
                elif targets[j]==1 and outputs[j]<=0.5:
                    fn+=1
                elif targets[j]==1 and outputs[j]>0.5:
                    tp+=1
                    acc+=1
                
            accuracy=acc/(len(outputs)*(i+1)+epoch*len(cs_train.index))
            try:
                tpr=tp/(tp+fn)
                fpr=fp/(fp+tn)
                specificity=tn/(tn+fp)
            except ZeroDivisionError:
                pass
            
            if i%5==0:
                print([epoch,i])
                print("training accuracy: "+str(accuracy))
                print("training sensitivity: "+str(tpr))
                print("training specificity: "+str(specificity))

            if loss<error_tol and 1 in targets:
                break
        else:
            continue
        break
            
            
#             plt.scatter(i+(120*epoch),loss.item(),
#                 label='training' if i == 0 else "")
#             plt.xlabel('batch')
#             plt.ylabel('loss')
#         plt.savefig("loss_"+str(fold)+".png")
        
    #torch.save(net, "fold_dev")
            
    #eval_df=pd.DataFrame(columns=["pred","target"])        
    with torch.no_grad():
        
        transformed_dataset = csDataset(df=cs_valid)

        val = DataLoader(transformed_dataset, batch_size=10,
                        shuffle=True, num_workers=10)
        net.eval()
        shifts=[]
        baseline_shifts=[]
        valid_acc=[]
        valid_spec=[]
        valid_sens=[]
        plt.gcf().clear()
        
        acc=0
        tp=0
        fp=0
        fn=0
        tn=0
        for i, data in enumerate(val,0):
            
            targets,inputs=data['target'],data['features']
            
            outputs=net(inputs)

            loss = criterion(outputs,targets)

            shifts.append(loss.item())
            
#             print([outputs,targets])
#             plt.scatter(outputs,targets)
#             plt.xlabel('predicted_af')
#             plt.ylabel('target_af')

            running_loss += loss.item()
    
            for j in range(len(outputs)):
                if targets[j]==0 and outputs[j]<0.5:
                    tn+=1
                    acc+=1
                elif targets[j]==0 and outputs[j]>0.5:
                    fp+=1
                elif targets[j]==1 and outputs[j]<0.5:
                    fn+=1
                elif targets[j]==1 and outputs[j]>0.5:
                    tp+=1
                    acc+=1
            try:        
                tpr=tp/(tp+fn)
                fpr=fp/(fp+tn)
                specificity=tn/(tn+fp)
            except ZeroDivisionError:
                pass
            accuracy=acc/(len(outputs)*(i+1))
               
            valid_acc.append(accuracy)
            valid_sens.append(tpr)
            valid_spec.append(specificity)
#         plt.scatter(np.arange(0,len(shifts)),shifts,c='b',
#                     label='validation' if i == 0 else "")
#         plt.xlabel('batch')
#         plt.ylabel('loss') 
#         plt.savefig("loss_"+str(fold)+".png")
#         plt.gcf().clear()

        performance=np.median(shifts)
        print(np.median(shifts))
        
        perform_acc=np.median(valid_acc)
        perform_sens=np.median(valid_sens)
        perform_spec=np.median(valid_spec)

        performance_k.append(performance)
        acc_k.append(perform_acc)
        sens_k.append(perform_sens)
        spec_k.append(perform_spec)
        
        print(acc_k)
        print([np.median(performance_k),np.median(acc_k),np.median(sens_k),np.median(spec_k)])
        

[0, 0]
training accuracy: 0.3
training sensitivity: 0.30606860158311344
training specificity: 0.9856817442238854
[0, 5]
training accuracy: 0.7666666666666667
training sensitivity: 0.0
training specificity: 0.8679245283018868
[0, 10]
training accuracy: 0.8090909090909091
training sensitivity: 0.0
training specificity: 0.9270833333333334
[0, 15]
training accuracy: 0.8375
training sensitivity: 0.0
training specificity: 0.950354609929078
[0, 20]
training accuracy: 0.8523809523809524
training sensitivity: 0.1111111111111111
training specificity: 0.9617486338797814
[0, 25]
training accuracy: 0.8615384615384616
training sensitivity: 0.0967741935483871
training specificity: 0.9650655021834061
[0, 30]
training accuracy: 0.867741935483871
training sensitivity: 0.08333333333333333
training specificity: 0.9708029197080292
[0, 35]
training accuracy: 0.875
training sensitivity: 0.12195121951219512
training specificity: 0.9717868338557993
[0, 40]
training accuracy: 4.341463414634147
training sensitiv

[7, 35]
training accuracy: 0.9240705734089477
training sensitivity: 0.3701492537313433
training specificity: 0.9894328989080662
[7, 40]
training accuracy: 1.025207182320442
training sensitivity: 0.3691860465116279
training specificity: 0.9895543175487466
[8, 0]
training accuracy: 0.9234345939243646
training sensitivity: 0.3691860465116279
training specificity: 0.9895905621096461
[8, 5]
training accuracy: 0.9236874236874237
training sensitivity: 0.37142857142857144
training specificity: 0.9897470950102529
[8, 10]
training accuracy: 0.9230306674684305
training sensitivity: 0.3672316384180791
training specificity: 0.9892328398384926
[8, 15]
training accuracy: 0.9235781990521327
training sensitivity: 0.3697478991596639
training specificity: 0.9890692282212653
[8, 20]
training accuracy: 0.9235259778166959
training sensitivity: 0.3674033149171271
training specificity: 0.9892297650130548
[8, 25]
training accuracy: 0.9237629459148446
training sensitivity: 0.3712737127371274
training specificit

[15, 0]
training accuracy: 0.927317880794702
training sensitivity: 0.3829457364341085
training specificity: 0.9924003707136237
[15, 5]
training accuracy: 0.9275862068965517
training sensitivity: 0.3861538461538462
training specificity: 0.9922794117647059
[15, 10]
training accuracy: 0.9271986970684039
training sensitivity: 0.38543247344461307
training specificity: 0.9923371647509579
[15, 15]
training accuracy: 0.9276252019386106
training sensitivity: 0.38484848484848483
training specificity: 0.9924050632911392
[15, 20]
training accuracy: 0.9274038461538462
training sensitivity: 0.38288288288288286
training specificity: 0.9924650161463939
[15, 25]
training accuracy: 0.9279809220985692
training sensitivity: 0.3847305389221557
training specificity: 0.9925293489861259
[15, 30]
training accuracy: 0.9280757097791799
training sensitivity: 0.3848439821693908
training specificity: 0.9925886712546321
[15, 35]
training accuracy: 0.9275430359937402
training sensitivity: 0.383601756954612
training s

[2, 30]
training accuracy: 0.9192100538599641
training sensitivity: 0.28440366972477066
training specificity: 0.9880597014925373
[2, 35]
training accuracy: 0.9209621993127147
training sensitivity: 0.30434782608695654
training specificity: 0.988560533841754
[2, 40]
training accuracy: 1.2562076749435667
training sensitivity: 0.3076923076923077
training specificity: 0.9889807162534435
[3, 0]
training accuracy: 0.9226973684210527
training sensitivity: 0.3050847457627119
training specificity: 0.9890710382513661
[3, 5]
training accuracy: 0.9249605055292259
training sensitivity: 0.3140495867768595
training specificity: 0.9895196506550218
[3, 10]
training accuracy: 0.925531914893617
training sensitivity: 0.3253968253968254
training specificity: 0.9890756302521009
[3, 15]
training accuracy: 0.9267935578330894
training sensitivity: 0.33587786259541985
training specificity: 0.9894736842105263
[3, 20]
training accuracy: 0.9272598870056498
training sensitivity: 0.34306569343065696
training specific

[10, 0]
training accuracy: 0.9305210918114144
training sensitivity: 0.391304347826087
training specificity: 0.9884583676834295
[10, 5]
training accuracy: 0.9306372549019608
training sensitivity: 0.389873417721519
training specificity: 0.9886024423337856
[10, 10]
training accuracy: 0.9300242130750606
training sensitivity: 0.38957816377171217
training specificity: 0.9884625704319828
[10, 15]
training accuracy: 0.9294258373205742
training sensitivity: 0.38875305623471884
training specificity: 0.9880668257756563
[10, 20]
training accuracy: 0.9300236406619385
training sensitivity: 0.3932038834951456
training specificity: 0.9879518072289156
[10, 25]
training accuracy: 0.930841121495327
training sensitivity: 0.3932038834951456
training specificity: 0.9881075491209927
[10, 30]
training accuracy: 0.9302540415704388
training sensitivity: 0.3919239904988123
training specificity: 0.988232284471732
[10, 35]
training accuracy: 0.9305936073059361
training sensitivity: 0.39622641509433965
training spe

[17, 40]
training accuracy: 0.9758530942741469
training sensitivity: 0.3945868945868946
training specificity: 0.9905111723293542
[18, 0]
training accuracy: 0.9326524979298924
training sensitivity: 0.3940256045519203
training specificity: 0.9905242243619135
[18, 5]
training accuracy: 0.9324287280701754
training sensitivity: 0.3929577464788732
training specificity: 0.9905860917096873
[18, 10]
training accuracy: 0.9323441328614211
training sensitivity: 0.3916083916083916
training specificity: 0.9906499773789775
[18, 15]
training accuracy: 0.9318550567874527
training sensitivity: 0.38865836791147995
training specificity: 0.990708826614716
[18, 20]
training accuracy: 0.9323126510878323
training sensitivity: 0.3903448275862069
training specificity: 0.990775182264544
[18, 25]
training accuracy: 0.9324973319103522
training sensitivity: 0.3917808219178082
training specificity: 0.9908365356192729
[18, 30]
training accuracy: 0.9325470447919427
training sensitivity: 0.3923705722070845
training spe

[5, 10]
training accuracy: 0.9117924528301887
training sensitivity: 0.29535864978902954
training specificity: 0.9893786510886883
[5, 15]
training accuracy: 0.9124423963133641
training sensitivity: 0.29583333333333334
training specificity: 0.989119170984456
[5, 20]
training accuracy: 0.913963963963964
training sensitivity: 0.2975206611570248
training specificity: 0.9893832153690597
[5, 25]
training accuracy: 0.9145374449339208
training sensitivity: 0.3132530120481928
training specificity: 0.9886194952993568
[5, 30]
training accuracy: 0.9146551724137931
training sensitivity: 0.31906614785992216
training specificity: 0.9888511875908871
[5, 35]
training accuracy: 0.9130801687763713
training sensitivity: 0.31835205992509363
training specificity: 0.9885877318116976
[5, 40]
training accuracy: 1.0535372848948374
training sensitivity: 0.31851851851851853
training specificity: 0.988795518207283
[6, 0]
training accuracy: 0.9132947976878613
training sensitivity: 0.32116788321167883
training specif

[12, 25]
training accuracy: 0.9183713611329661
training sensitivity: 0.36300174520069806
training specificity: 0.9889159831522943
[12, 30]
training accuracy: 0.9179976626412154
training sensitivity: 0.35986159169550175
training specificity: 0.9888059701492538
[12, 35]
training accuracy: 0.9182098765432098
training sensitivity: 0.3602058319039451
training specificity: 0.988915453162356
[12, 40]
training accuracy: 0.9785976355483081
training sensitivity: 0.3606837606837607
training specificity: 0.989010989010989
[13, 0]
training accuracy: 0.9188311688311688
training sensitivity: 0.3606837606837607
training specificity: 0.9890346162115674
[13, 5]
training accuracy: 0.9190314037079077
training sensitivity: 0.3610169491525424
training specificity: 0.9891396933560477
[13, 10]
training accuracy: 0.9194152923538231
training sensitivity: 0.3630252100840336
training specificity: 0.9892427757856992
[13, 15]
training accuracy: 0.9192350538432974
training sensitivity: 0.36318407960199006
training s

[19, 35]
training accuracy: 0.9208552138034508
training sensitivity: 0.3668903803131991
training specificity: 0.9905686936936937
[19, 40]
training accuracy: 0.9589378238341969
training sensitivity: 0.36777777777777776
training specificity: 0.9904761904761905
0.07111874222755432
[0.875, 0.8, 0.9666666666666667]
[0.1410168707370758, 0.875, 0.25, 1.0]
[0, 0]
training accuracy: 0.9
training sensitivity: 0.0
training specificity: 1.0
[0, 5]
training accuracy: 0.8833333333333333
training sensitivity: 0.0
training specificity: 0.9814814814814815
[0, 10]
training accuracy: 0.8909090909090909
training sensitivity: 0.0
training specificity: 0.98989898989899
[0, 15]
training accuracy: 0.9125
training sensitivity: 0.0
training specificity: 0.9931972789115646
[0, 20]
training accuracy: 0.9
training sensitivity: 0.0
training specificity: 0.9947368421052631
[0, 25]
training accuracy: 0.9
training sensitivity: 0.0
training specificity: 0.9957446808510638
[0, 30]
training accuracy: 0.9
training sensiti

[7, 15]
training accuracy: 0.9314055144586416
training sensitivity: 0.3741258741258741
training specificity: 0.9906994047619048
[7, 20]
training accuracy: 0.9315476190476191
training sensitivity: 0.3758620689655172
training specificity: 0.9904901243599122
[7, 25]
training accuracy: 0.9313597918022121
training sensitivity: 0.375
training specificity: 0.9906407487401008
[7, 30]
training accuracy: 0.9308578745198464
training sensitivity: 0.37293729372937295
training specificity: 0.9907834101382489
[7, 35]
training accuracy: 0.9306868304977945
training sensitivity: 0.37540453074433655
training specificity: 0.9905759162303664
[7, 40]
training accuracy: 1.0341850828729282
training sensitivity: 0.3782051282051282
training specificity: 0.9907024793388429
[8, 0]
training accuracy: 0.9314941103533788
training sensitivity: 0.3821656050955414
training specificity: 0.990728021978022
[8, 5]
training accuracy: 0.931929181929182
training sensitivity: 0.3836477987421384
training specificity: 0.99087221

KeyboardInterrupt: 

<Figure size 432x288 with 0 Axes>