In [1]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook
import re
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay, f1_score
import xgboost as xgb
from tqdm.notebook import tqdm

In [2]:
# Full RNA-Seq expression table. Later cells read its 'Case_IDs' and
# 'labelsAll' columns to look up the class label of each sample.
all_rna = pd.read_csv(
    '../RNA-Seq/RNA-ExpAll-LC.csv.gz',
    compression='gzip',
)

FileNotFoundError: [Errno 2] No such file or directory: '../RNA-Seq/RNA-ExpAll-LC.csv.gz'

In [None]:
# Class names used throughout the notebook. OneHotEncoder orders its categories
# lexicographically, so the encoded column order is adeno (0), healthy (1),
# squa (2) regardless of the order listed here.
classes = np.array(['healthy', 'adeno', 'squa'])

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the current spelling first and fall back to the legacy
# one so the cell runs on both old and new installations.
try:
    ohe = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = preprocessing.OneHotEncoder(sparse=False)
ohe.fit(classes.reshape(-1,1))

In [None]:
def _evaluate_split(clf, x, y_ohe, accs, f1s, name, split_key, label):
    """Score a fitted classifier on one partition, record and print the metrics.

    Args:
        clf: Fitted estimator exposing ``predict`` and ``predict_proba``.
        x: Already-scaled feature matrix of the partition.
        y_ohe: One-hot encoded true labels for ``x``.
        accs, f1s: Nested dicts ``{split_key: {name: [...]}}``; the accuracy
            (in percent) and weighted F1 are appended in place.
        name: Model name used as dict key and as prefix of the printed lines.
        split_key: ``'train'`` or ``'test'``; selects the sub-dict to append to.
        label: Partition name as it appears in the printed lines.

    Returns:
        Tuple ``(predictions, probabilities)`` for ``x``.
    """
    y_true = y_ohe.argmax(axis=1)
    split_preds = clf.predict(x)
    acc = (np.sum(split_preds == y_true) / x.shape[0]) * 100
    f1 = f1_score(y_true, split_preds, average='weighted')
    accs[split_key][name].append(acc)
    f1s[split_key][name].append(f1)
    split_probs = clf.predict_proba(x)
    print('{} {} acc: {}'.format(name, label, acc))
    print('{} {} F1: {}'.format(name, label, f1))
    print('CM \n')
    print(confusion_matrix(y_true, split_preds))
    return split_preds, split_probs


def train(x_train, x_test, y_train_ohe, y_test_ohe, accs, f1s, name, early=False):
    """Grid-search an XGBoost classifier and evaluate it on train and test data.

    Features are standardised with a ``StandardScaler`` fitted on the data used
    for fitting. The grid search selects on accuracy, and the selected model is
    then scored on the complete training set and on the test set.

    Args:
        x_train, x_test: 2-D feature arrays.
        y_train_ohe, y_test_ohe: One-hot encoded labels; class ids are taken
            with ``argmax(axis=1)``.
        accs, f1s: Nested dicts ``{'train'|'test': {name: list}}``. They are
            mutated in place (one value appended per call) and also returned.
        name: Key under which metrics are stored, and prefix of printed lines.
        early: When True, 10% of the training data (stratified, via
            ``get_val_set``) is held out and used as ``eval_set`` for early
            stopping; the scaler and the model are then fitted on the
            remaining 90%.

    Returns:
        ``(accs, f1s, probs, preds)`` where ``probs`` and ``preds`` are dicts
        with keys ``'train'`` and ``'test'`` holding the class probabilities
        and the predicted class ids of this call.
    """
    tuned_parameters = [{'max_depth': [2, 4, 6, 8],
                         'n_estimators': [20, 30, 50, 100, 200],
                         'alpha': [0,0.1,0.2,0.3]}]

    clf = GridSearchCV(
        xgb.XGBClassifier(n_jobs=1,use_label_encoder=False,verbosity = 0, random_state=42),
        tuned_parameters,
        scoring='accuracy'
    )

    if early:
        classes_ = [0,1,2]
        x_fit, y_fit_ohe, x_val, y_val = get_val_set(x_train, y_train_ohe, classes_, percentage = 0.1)
    else:
        x_fit, y_fit_ohe = x_train, y_train_ohe

    scaler = StandardScaler()
    x_fit = scaler.fit_transform(x_fit)
    if early:
        # NOTE(review): passing early_stopping_rounds/eval_set through fit()
        # relies on the older xgboost fit API - confirm the installed version.
        eval_set=[(scaler.transform(x_val), y_val.argmax(axis=1))]
        clf.fit(x_fit, y_fit_ohe.argmax(axis=1), early_stopping_rounds=10, eval_set=eval_set, verbose=False)
    else:
        clf.fit(x_fit, y_fit_ohe.argmax(axis=1))
    print(clf.best_params_)

    preds = {}
    probs = {}
    # Training metrics are reported on the complete training set (including the
    # rows held out for early stopping), in the caller's original row order.
    preds['train'], probs['train'] = _evaluate_split(
        clf, scaler.transform(x_train), y_train_ohe, accs, f1s, name, 'train', 'Train')
    preds['test'], probs['test'] = _evaluate_split(
        clf, scaler.transform(x_test), y_test_ohe, accs, f1s, name, 'test', 'test')

    return accs, f1s, probs, preds

def get_val_set(x, y, classes, percentage = 0.1):
    """Split ``(x, y)`` into stratified training and validation parts.

    For every class id in ``classes`` the rows whose ``y.argmax(axis=1)``
    equals that id are shuffled, and the last ``int(percentage * n_rows)`` of
    them go to the validation part. Both parts are shuffled once more before
    being returned.

    The split is deterministic (fixed seed 42). A private ``RandomState`` is
    used instead of ``np.random.seed`` so calling this function does not reset
    the global NumPy random state; the resulting split is the same as with the
    global seed.

    Args:
        x: 2-D feature array, one row per sample.
        y: 2-D one-hot label array aligned with ``x``.
        classes: Iterable of class ids to stratify on.
        percentage: Fraction of each class sent to the validation part.

    Returns:
        ``(x_train, y_train, x_val, y_val)`` as NumPy arrays.
    """
    rng = np.random.RandomState(42)
    class_ids = y.argmax(axis=1)

    # Seed each list with an empty float array so the concatenated result has
    # the right number of columns even when no rows are selected.
    x_train_parts = [np.empty((0, x.shape[1]))]
    y_train_parts = [np.empty((0, y.shape[1]))]
    x_val_parts = [np.empty((0, x.shape[1]))]
    y_val_parts = [np.empty((0, y.shape[1]))]

    for c in classes:
        indexes = np.where(class_ids == c)[0]
        rng.shuffle(indexes)
        len_val = int(percentage * len(indexes))
        len_train = len(indexes) - len_val
        index_train = indexes[0:len_train]
        index_val = indexes[len_train:]
        x_train_parts.append(x[index_train,...])
        y_train_parts.append(y[index_train])
        x_val_parts.append(x[index_val,...])
        y_val_parts.append(y[index_val])

    x_train = np.concatenate(x_train_parts, axis=0)
    y_train = np.concatenate(y_train_parts, axis=0)
    x_val = np.concatenate(x_val_parts, axis=0)
    y_val = np.concatenate(y_val_parts, axis=0)

    # Final shuffle so the classes are interleaved rather than stored in blocks.
    order_train = list(range(x_train.shape[0]))
    order_val = list(range(x_val.shape[0]))
    rng.shuffle(order_train)
    rng.shuffle(order_val)

    return x_train[order_train,...], y_train[order_train], x_val[order_val,...], y_val[order_val]

In [None]:
# ---------------------------------------------------------------------------
# 10-fold evaluation of three XGBoost models per split: RNA-only, CNV-only and
# the early-integration model trained on the concatenated RNA + CNV features.
# ---------------------------------------------------------------------------
splits = 10
path_rna_mrmr = '../RNA-Seq/mrmrDEGs/'
path_cnv_mrmr = '../Copy-Number-Variation/mrmrDEGs/'
n_genes_rna = 6
n_genes_cnv = 12

path = '../Copy-Number-Variation/Splits_10CV/'

# Spellings found in all_rna['labelsAll'] -> class names the encoder was fitted on.
label_aliases = {'Healthy': 'healthy', 'Adenocarcinoma': 'adeno', 'Squamous': 'squa'}


def _read_case_ids(file_path):
    """Return the non-blank lines of ``file_path`` without their line endings."""
    with open(file_path, 'r') as handle:
        return [line.rstrip('\r\n') for line in handle if line.strip()]


def _rows_with_prefix(case_ids, prefixes):
    """Return positions of the ids in ``case_ids`` that start with any of ``prefixes``.

    This is the literal-text equivalent of ``re.match('|'.join(prefixes), id)``:
    ids are compared as plain strings, so regex metacharacters inside an id
    cannot change the result.
    """
    prefixes = tuple(prefixes)
    return [pos for pos, case_id in enumerate(case_ids) if case_id.startswith(prefixes)]


def _labels_for(frame):
    """Return the class label of every row of ``frame``, in ``frame`` row order.

    Labels are looked up in ``all_rna`` by ``Case_IDs`` with a left merge, so
    they stay aligned with the feature rows even when ``all_rna`` is ordered
    differently. Raises ``ValueError`` when a row has no label or more than one.
    """
    merged = frame[['Case_IDs']].merge(
        all_rna[['Case_IDs', 'labelsAll']], on='Case_IDs', how='left')
    if len(merged) != len(frame) or merged['labelsAll'].isna().any():
        raise ValueError('all_rna must provide exactly one label for every Case_IDs value in the split')
    labels = merged['labelsAll'].values
    for raw_name, class_name in label_aliases.items():
        labels = np.where(labels == raw_name, class_name, labels)
    return labels


def _split_report(case_ids, y_ohe, probs_by_model, preds_by_model, part):
    """Build the per-sample sheet (probabilities, predictions, truth) for one partition.

    ``part`` is ``'train'`` or ``'test'`` and selects the entry of each model's
    ``probs`` / ``preds`` dict as returned by ``train``.
    """
    report = pd.DataFrame()
    report['Case IDs'] = case_ids
    report['Has RNA'] = np.ones(len(case_ids))
    report['Has CNV'] = np.ones(len(case_ids))
    for model in ('RNA', 'CNV', 'Integration'):
        model_probs = probs_by_model[model][part]
        # Probability columns 0/1/2 are written as LUAD/HLT/LUSC; this presumably
        # mirrors the encoder's class order (adeno, healthy, squa) - verify.
        report[model + ' Prob LUAD'] = model_probs[:,0]
        report[model + ' Prob HLT'] = model_probs[:,1]
        report[model + ' Prob LUSC'] = model_probs[:,2]
        report[model + ' Pred'] = preds_by_model[model][part]
    report['Real'] = y_ohe.argmax(axis=1)
    return report


# Per-split metrics; `train` appends one value per call.
accs = {
    'train': {'Integration': [], 'RNA': [], 'CNV': []},
    'test': {'Integration': [], 'RNA': [], 'CNV': []}
}
f1s = {
    'train': {'Integration': [], 'RNA': [], 'CNV': []},
    'test': {'Integration': [], 'RNA': [], 'CNV': []}
}

# Predictions of all splits concatenated, per model.
preds_all = {
    'train': {'Integration': np.array([]), 'RNA': np.array([]), 'CNV': np.array([])},
    'test': {'Integration': np.array([]), 'RNA': np.array([]), 'CNV': np.array([])}
}

probs_all = {
    'train': {'Integration': np.array([[],[],[]]), 'RNA': np.array([[],[],[]]), 'CNV': np.array([[],[],[]])},
    'test': {'Integration': np.array([[],[],[]]), 'RNA': np.array([[],[],[]]), 'CNV': np.array([[],[],[]])}
}

save_xlsx = False
if save_xlsx:
    writer_train = pd.ExcelWriter('early_integration/RNA6_CNV12_train.xlsx', engine='openpyxl')
    writer_test = pd.ExcelWriter('early_integration/RNA6_CNV12_test.xlsx', engine='openpyxl')

for split in tqdm(range(splits)):
    df_rna = pd.read_csv(path_rna_mrmr+'mrmrDEGs_LC_3classes_split'+str(split)+'.csv')
    df_cnv = pd.read_csv(path_cnv_mrmr+'mrmrDEGs_LC_CNV_3classes_p0-001_m0-1_cov3_split'+str(split)+'.csv')
    # Keep the first column plus the first n_genes_* columns after it
    # (the first column is expected to be 'Case_IDs', which is used as join key).
    rna_columns_keep = df_rna.columns.values[0:n_genes_rna+1].tolist()
    cnv_columns_keep = df_cnv.columns.values[0:n_genes_cnv+1].tolist()
    df_rna = df_rna[rna_columns_keep]
    df_cnv = df_cnv[cnv_columns_keep]
    # Only samples that have both RNA and CNV values survive the dropna.
    data = df_rna.set_index('Case_IDs').join(df_cnv.set_index('Case_IDs'))
    data = data.dropna().reset_index()

    train_cids = _read_case_ids(path+'train_'+str(split)+'.txt')
    val_cids = _read_case_ids(path+'val_'+str(split)+'.txt')

    # A sample belongs to a partition when its id starts with one of the ids
    # listed in the corresponding split file.
    case_id_values = data['Case_IDs'].tolist()
    train_final = _rows_with_prefix(case_id_values, train_cids)
    val_final = _rows_with_prefix(case_id_values, val_cids)

    df_train = data.iloc[train_final,]
    df_val = data.iloc[val_final,]

    case_ids_val = df_val['Case_IDs']
    y_val = _labels_for(df_val)

    case_ids_train = df_train['Case_IDs']
    y_train = _labels_for(df_train)

    # Column 0 is Case_IDs, followed by the RNA genes and then the CNV genes.
    x_train = df_train.iloc[:,1:].values
    x_val = df_val.iloc[:,1:].values

    y_train_ohe = ohe.transform(y_train.reshape(-1,1))
    y_val_ohe = ohe.transform(y_val.reshape(-1,1))

    x_train_rna = df_train.iloc[:, 1:n_genes_rna+1].values
    x_val_rna = df_val.iloc[:, 1:n_genes_rna+1].values

    x_train_cnv = df_train.iloc[:, n_genes_rna+1:].values
    x_val_cnv = df_val.iloc[:, n_genes_rna+1:].values

    print('End data read...')

    print('RNA training...')
    name = 'RNA'
    accs, f1s, probs_rna, preds_rna = train(x_train_rna, x_val_rna, y_train_ohe, y_val_ohe, accs, f1s, name, early=True)
    preds_all['train'][name] = np.concatenate([preds_all['train'][name], preds_rna['train']], axis=0)
    preds_all['test'][name] = np.concatenate([preds_all['test'][name], preds_rna['test']], axis=0)

    print('CNV training...')
    name = 'CNV'
    accs, f1s, probs_cnv, preds_cnv = train(x_train_cnv, x_val_cnv, y_train_ohe, y_val_ohe, accs, f1s, name, early=True)
    preds_all['train'][name] = np.concatenate([preds_all['train'][name], preds_cnv['train']], axis=0)
    preds_all['test'][name] = np.concatenate([preds_all['test'][name], preds_cnv['test']], axis=0)

    print('Integration training...')
    name = 'Integration'
    accs, f1s, probs_int, preds_int = train(x_train, x_val, y_train_ohe, y_val_ohe, accs, f1s, name, early=True)
    preds_all['train'][name] = np.concatenate([preds_all['train'][name], preds_int['train']], axis=0)
    preds_all['test'][name] = np.concatenate([preds_all['test'][name], preds_int['test']], axis=0)

    if save_xlsx:
        probs_by_model = {'RNA': probs_rna, 'CNV': probs_cnv, 'Integration': probs_int}
        preds_by_model = {'RNA': preds_rna, 'CNV': preds_cnv, 'Integration': preds_int}

        # save test
        data_test = _split_report(case_ids_val, y_val_ohe, probs_by_model, preds_by_model, 'test')
        data_test.to_excel(writer_test, sheet_name='split_'+str(split), index=False)

        # save train
        data_train = _split_report(case_ids_train, y_train_ohe, probs_by_model, preds_by_model, 'train')
        data_train.to_excel(writer_train, sheet_name='split_'+str(split), index=False)

if save_xlsx:
    writer_train.close()
    writer_test.close()