In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold,StratifiedKFold
import numpy as np
from pathlib import Path
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
import random
import os

In [2]:
SEED =42
np.random.seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

In [3]:
log_dir ='datasets'
BATCH_ID = 2
with_anxiety=""

In [4]:
data=pd.read_csv(log_dir+f'/mdd_data{BATCH_ID}{with_anxiety}.csv')
data =data.sample(frac=1).reset_index(drop=True)

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/mdd_data2.csv'

In [None]:
if with_anxiety:
    X =data.iloc[:,:-4]
    y= data['Anxiety']
else:    
    X =data.iloc[:,:-2]
    y= data['diagnosis']

In [None]:
encoder =LabelEncoder()
y=encoder.fit_transform(y)

In [5]:
def importance_features(columns, coef, count):
    imp_features =sorted(list(zip(columns,coef)), key =lambda x: np.abs(x[-1]), reverse=True)
    return imp_features[:count]                     

In [8]:
class TopFeatures:
    
    def __init__(self):
        self.top_features =dict()
        
    def top_cross_val_features(self,important_features):

        for elem in important_features:
            if elem[0] in self.top_features:
                self.top_features[elem[0]]+=1
            else:
                self.top_features[elem[0]]=1

In [9]:
predicted_values,true_values,prob_values = [], [],[]
acc_scores,f1s, rocs =[],[],[]
tp =TopFeatures()
for i in range(10):
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    kf.get_n_splits(X)
    for train_index, test_index in kf.split(X,y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        if BATCH_ID ==1:
            model =LogisticRegression(penalty='l1',solver='liblinear',C =1, random_state=42)
        else:    
            model =LogisticRegression(penalty='l2',solver='liblinear',C =1,random_state=42)
        
        model.fit(X_train, y_train)
        y_predicted =model.predict_proba(X_test)
        y_predicted_binary =model.predict(X_test)
        
        acc_score =accuracy_score(y_test, y_predicted_binary)
        
        auc =roc_auc_score(y_test, y_predicted[:, 1])
        f1 =f1_score(y_test, y_predicted_binary, average='macro')
        acc_scores.append(acc_score)
        rocs.append(auc)
        f1s.append(f1)
        predicted_values.append(y_predicted_binary)
        true_values.append(y_test)
        prob_values.append(y_predicted)
        
        imp_features_top10 =importance_features(X.columns.to_list(), model.coef_[0], 10)
        tp.top_cross_val_features(imp_features_top10)
        
true_values = np.concatenate(true_values)
predicted_values = np.concatenate(predicted_values)
prob_values = np.concatenate(prob_values) 

accuracy = accuracy_score(true_values, predicted_values)
mean_auc = roc_auc_score(true_values, prob_values[:, 1])
f1 =f1_score(true_values, predicted_values, average='macro')
print(f'Accuracy:  {acc_scores.mean()} +- {acc_scores.std()}')
print(f'ROC AUC:  {rocs.mean()}+- {rocs.std()}')
print(f'F1:  {f1s.mean()}+-{f1s.std()}')        

Accuracy:  0.7272727272727273
ROC AUC:  0.7423076923076923
F1:  0.6612193588937776


In [10]:
top_features ={k: v for k, v in sorted(tp.top_features.items(), key=lambda item: item[1], reverse=True)}

In [11]:
top_features

{'CLEC12B': 100,
 'ERAP2': 90,
 'CYP4F35P': 80,
 'CHI3L1': 70,
 'SLC26A8': 70,
 'PROS1': 70,
 'PAX8-AS1': 60,
 'PF4V1': 50,
 'LGALS2': 50,
 'PSPH': 50,
 'ARG1': 40,
 'SRSF6': 40,
 'PPAT': 20,
 'OLFM4': 20,
 'SLC8A1-AS1': 20,
 'XIST': 10,
 'SYNM': 10,
 'HLA-DQB1': 10,
 'B3GLCT': 10,
 'NRG1': 10,
 'MXRA7': 10,
 'ZCCHC2': 10,
 'CCL3': 10,
 'DEFA4': 10,
 'DOCK4': 10,
 'PLEKHG1': 10,
 'PDK4': 10,
 'BTNL8': 10,
 'ENSG00000188002': 10,
 'TREML4': 10,
 'PIGC': 10,
 'COLGALT2': 10}

In [17]:
pd.Series(top_features).to_excel(f'results/importance_features_{BATCH_ID}{with_anxiety}.xls')

  pd.Series(top_features).to_excel(f'results/importance_features_{BATCH_ID}{with_anxiety}.xls')
