In [2]:
import pandas as pd
import numpy as np
import os

from collections import Counter
import copy

from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore",category=UserWarning, append=True)

import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline

# input data

## clinical data

In [2]:
# Name of the outcome column; the counts printed below show it takes the values 0 and 1.
outcome = 'died'

# Read the clinical table, then report its size and the class balance of the outcome.
path_in = '/data/clinic_europ_804_sim.csv'
clinic = pd.read_csv(path_in)
print(f"dimension of clinical data: {clinic.shape}")

outcome_counts = clinic[outcome].value_counts()
print(outcome_counts)

dimension of clinical data: (804, 6)
died
0    742
1     62
Name: count, dtype: int64


## seq data

In [3]:
# Full voom-transformed expression table: the identifier of each row is stored
# in the 'lnc' column, every other column is read as-is from the CSV.
path_in = '/data/voom_sim_disc.csv'
voom = pd.read_csv(path_in)
print(f"dimension of voom transformed dataset: {voom.shape}")

# Keep the identifiers as a plain array for later use and report how many there are.
lnc_all = voom['lnc'].values
print(f"total lncRNAs: {lnc_all.shape[0]}")

# Promote the identifiers to (unnamed) row labels and drop the now-redundant column.
voom = voom.drop(columns='lnc').set_axis(lnc_all, axis=0)


dimension of voom transformed dataset: (2906, 805)
total lncRNAs: 2906


## merge whole dataset

In [4]:
# Flip the expression table so its former column labels become rows, and expose
# those labels as an 'ID' column so it can be joined to the clinical table.
voom_t = voom.T
voom_t = voom_t.assign(ID=voom_t.index.values)

# Default (inner) join on 'ID': only IDs present in both tables are kept.
clinic_seq = pd.merge(clinic, voom_t, on='ID')
print(f"dimension of entire training dataset: {clinic_seq.shape}")

merged_outcome_counts = clinic_seq[outcome].value_counts()
print(merged_outcome_counts)

dimension of entire training dataset: (804, 2912)
died
0    742
1     62
Name: count, dtype: int64


In [5]:
# Work on a copy of the merged table, labelled by sample ID (the 'ID' column is kept too).
dat_use = clinic_seq.copy()
dat_use.index = dat_use['ID'].values

outcome = 'died'

# IDs of the samples whose outcome equals 1.
is_group_1 = dat_use[outcome] == 1
samples_1 = dat_use.loc[is_group_1, 'ID'].values
print(f'sample size group 1: {samples_1.shape}')

# IDs of the samples whose outcome equals 0 (tested explicitly, not as "not group 1").
is_group_0 = dat_use[outcome] == 0
samples_0 = dat_use.loc[is_group_0, 'ID'].values
print(f'sample size group 0: {samples_0.shape}')

sample size group 1: (62,)
sample size group 0: (742,)


# classification

## selected variables

In [7]:
# Feature-selection summary with one row per feature.
# NOTE(review): 'n' is presumably the number of resampling runs in which the
# feature was selected - confirm against the notebook that wrote this file.
df_sel = pd.read_excel('/results/features_sel_numbers_df000001_disc_died_08_100_.xlsx')

# Keep only the features whose count 'n' is strictly greater than 70.
passes_threshold = df_sel['n'] > 70
var_sel = df_sel.loc[passes_threshold, 'feature'].values
print(var_sel)

['age' 'SEQ0235']


## split index

In [10]:
# Resampling index table: per the displayed output, one row per sample ID and
# one 'train_<k>' column per resampling run.
# NOTE(review): the meaning of the codes 0/1/2 is not visible here - confirm
# against the code that produced the file.
idx_path = '/data/idx_resample.csv'
train_test = pd.read_csv(idx_path)
print(train_test.shape)
train_test

(804, 101)


Unnamed: 0,train_1,train_2,train_3,train_4,train_5,train_6,train_7,train_8,train_9,train_10,...,train_92,train_93,train_94,train_95,train_96,train_97,train_98,train_99,train_100,ID
0,0,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,discovery_001
1,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,0,2,2,discovery_002
2,2,2,2,2,1,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,discovery_003
3,2,2,1,2,1,2,2,2,2,1,...,2,2,2,2,2,0,2,2,2,discovery_004
4,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,discovery_005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799,2,2,2,2,2,2,0,0,2,2,...,2,2,1,2,2,2,2,2,2,discovery_800
800,2,2,2,2,2,1,2,2,2,2,...,2,2,2,2,1,2,2,2,2,discovery_801
801,2,2,2,2,2,2,2,2,2,1,...,2,2,2,2,2,2,2,2,2,discovery_802
802,2,2,0,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,discovery_803


## list of classifiers

In [None]:
# Execute tune_grid.ipynb from this notebook.
# NOTE(review): `lst_clf` is not defined anywhere in this notebook, so it is
# presumably created by tune_grid.ipynb - confirm there (including whether it
# is a dict or a list, which matters for the tuning loops below).
%run tune_grid.ipynb
print(f'number of classifiers: {len(lst_clf)}')
print(lst_clf)

## balanced dataset

In [None]:
# Execute cv_tune_func.ipynb from this notebook.
# NOTE(review): `cv_tune_bl_idx` and `cv_tune_imbl`, called in the cells below,
# are not defined in this notebook and are presumably defined there - confirm.
%run cv_tune_func.ipynb

In [None]:
# Tune and evaluate every classifier on the balanced (index-resampled) data and
# collect, per classifier, the mean and standard deviation of the returned scores.
n_jobs = 12
random_state = 123

# Inner cross-validation handed to the tuning helper: 5 stratified folds, repeated twice.
kf_inner = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=random_state)

# The original iterated over `d.items(lst_clf)`: `d` is not defined in this notebook and
# `items()` accepts no argument. Iterate over `lst_clf` itself, accepting either a
# {name: estimator} mapping or an iterable of (name, estimator) pairs.
# TODO confirm the structure of `lst_clf` in tune_grid.ipynb.
clf_items = lst_clf.items() if hasattr(lst_clf, 'items') else lst_clf

# Per-classifier summary rows are gathered in lists and turned into frames, instead of
# growing a DataFrame with pd.concat on every iteration.
mean_rows = []
sd_rows = []
df_mean = pd.DataFrame()
df_sd = pd.DataFrame()

for key, clf in clf_items:
    # The original call placed positional arguments after keyword arguments, which is a
    # SyntaxError; every argument is now passed by keyword.
    res = cv_tune_bl_idx(
        var_=var_sel,
        key=key,
        clf=clf,
        # TODO confirm: assumes the helper's parameter is named `split_idx` and expects
        # the resampling index table loaded above.
        split_idx=train_test,
        # TODO confirm: assumes the helper's parameter is named `df_features` and expects
        # the merged clinical + expression table indexed by sample ID.
        df_features=dat_use,
        scoring='roc_auc',
        output_prefix='cv_tune_disc_bl',
        n_jobs=n_jobs,
        random_state=random_state,
        folder_out='/results/',
        kf_inner=kf_inner,
    )

    # Column-wise mean of the returned results, tagged with the classifier name.
    mean_ = res.mean(axis=0)
    mean_['classifier'] = key
    mean_rows.append(mean_)
    # Rebuilt each iteration so partial results survive an interrupted run.
    df_mean = pd.DataFrame(mean_rows).reset_index(drop=True)
    print(df_mean)

    # Column-wise standard deviation of the returned results, tagged the same way.
    sd_ = res.std(axis=0)
    sd_['classifier'] = key
    sd_rows.append(sd_)
    df_sd = pd.DataFrame(sd_rows).reset_index(drop=True)
    print(df_sd)

## imbalanced dataset

In [None]:
# Tune and evaluate every classifier on the imbalanced data and collect, per
# classifier, the mean and standard deviation of the returned scores.
# NOTE: `df_mean` / `df_sd` are re-created here, so the summaries from the balanced
# section are overwritten unless they were saved under another name first.
n_jobs = 12
random_state = 123

# Outer cross-validation: 5 stratified folds repeated 20 times.
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=random_state)
# Inner cross-validation handed to the tuning helper: 5 stratified folds, repeated twice.
kf_inner = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=random_state)

# The original iterated over `d.items(lst_clf)`: `d` is not defined in this notebook and
# `items()` accepts no argument. Iterate over `lst_clf` itself, accepting either a
# {name: estimator} mapping or an iterable of (name, estimator) pairs.
# TODO confirm the structure of `lst_clf` in tune_grid.ipynb.
clf_items = lst_clf.items() if hasattr(lst_clf, 'items') else lst_clf

# Per-classifier summary rows are gathered in lists and turned into frames, instead of
# growing a DataFrame with pd.concat on every iteration.
mean_rows = []
sd_rows = []
df_mean = pd.DataFrame()
df_sd = pd.DataFrame()

for key, clf in clf_items:
    # The original call placed positional arguments after keyword arguments, which is a
    # SyntaxError; every argument is now passed by keyword.
    res = cv_tune_imbl(
        var_=var_sel,
        key=key,
        clf=clf,
        # TODO confirm: assumes the helper has a `split_idx` parameter expecting the
        # resampling index table; it may be unnecessary here since `kf` is also passed.
        split_idx=train_test,
        # TODO confirm: assumes the helper's parameter is named `df_features` and expects
        # the merged clinical + expression table indexed by sample ID.
        df_features=dat_use,
        scoring='roc_auc',
        output_prefix='cv_tune_disc_imbl',
        n_jobs=n_jobs,
        random_state=random_state,
        folder_out='/results/',
        kf_inner=kf_inner,
        kf=kf,
    )

    # Column-wise mean of the returned results, tagged with the classifier name.
    mean_ = res.mean(axis=0)
    mean_['classifier'] = key
    mean_rows.append(mean_)
    # Rebuilt each iteration so partial results survive an interrupted run.
    df_mean = pd.DataFrame(mean_rows).reset_index(drop=True)
    print(df_mean)

    # Column-wise standard deviation of the returned results, tagged the same way.
    sd_ = res.std(axis=0)
    sd_['classifier'] = key
    sd_rows.append(sd_)
    df_sd = pd.DataFrame(sd_rows).reset_index(drop=True)
    print(df_sd)