In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from scipy import stats
# Seed NumPy's global random state once, up front, so that code drawing from it
# behaves the same on every Restart & Run All.
np.random.seed(12345678)

In [None]:
# Read the gzip-compressed matrix from 'gene_matrix_CNV.csv.gz' into a DataFrame.
data = pd.read_csv(
    'gene_matrix_CNV.csv.gz',
    compression='gzip',
)

In [None]:
# Drop every row that contains at least one missing value
# (axis=0 / how='any' are the pandas defaults, spelled out here for the reader).
data_nan = data.dropna(axis=0, how='any')

In [None]:
# Persist the NaN-free matrix (without the DataFrame index) as a gzip CSV.
data_nan.to_csv(
    'gene_matrix_CNV_noNA.csv.gz',
    compression='gzip',
    index=False,
)

# Differentially Expressed Genes with Copy Number

In [None]:
# Reload the NaN-free matrix written by the previous step.
data = pd.read_csv(
    'gene_matrix_CNV_noNA.csv.gz',
    compression='gzip',
)

In [None]:
import os

# Output folders for the per-split p-value tables and the selected-gene matrices.
# exist_ok=True keeps this cell re-runnable: plain os.mkdir raises FileExistsError
# as soon as the notebook is executed a second time.
os.makedirs('p-values-matrix', exist_ok=True)
os.makedirs('train_degs', exist_ok=True)

In [None]:
path = 'Splits_10CV/'

# Selection thresholds applied to every pairwise comparison.
P_VALUE_CUTOFF = 0.001 / 3   # presumably 0.001 corrected for the 3 comparisons -- TODO confirm
MEAN_DIFF_CUTOFF = 0.4       # minimum absolute difference between the two group means
REQUIRED_COMPARISONS = 2     # see NOTE(review) where it is applied


def _read_case_ids(filename):
    """Return the non-empty case IDs stored one per line in ``filename``.

    Blank lines are dropped: in the previous regex-alternation approach an empty
    ID produced an empty alternative that matched first and hid every ID listed
    after it.
    """
    with open(filename, 'r') as handle:
        stripped = (line.replace('\n', '') for line in handle)
        return [cid for cid in stripped if cid]


def _columns_for_cases(column_names, case_ids):
    """Return the positions of the columns whose name starts with one of ``case_ids``.

    IDs are compared literally (prefix match), so characters that are special in
    regular expressions cannot change which columns are selected.
    """
    prefixes = tuple(case_ids)
    return [pos for pos, name in enumerate(column_names) if name.startswith(prefixes)]


for split in range(0, 10):
    print('{}/{}'.format(split, 9))
    train_cids = _read_case_ids(path + 'train_' + str(split) + '.txt')
    val_cids = _read_case_ids(path + 'val_' + str(split) + '.txt')

    column_names = list(data.columns)
    # Column 1 is placed first in both frames; the code below reads it back as
    # 'gene_name', so it is presumably the gene-name column -- TODO confirm.
    train_final = [1] + _columns_for_cases(column_names, train_cids)
    val_final = [1] + _columns_for_cases(column_names, val_cids)
    df_train = data.iloc[:, train_final]
    df_val = data.iloc[:, val_final]

    # The last row carries the per-sample class label; both "normal" sample
    # types are merged into a single 'Healthy' class.
    labels_train = df_train.iloc[-1, :].values
    labels_train = np.where(labels_train == 'Blood Derived Normal', 'Healthy', labels_train)
    labels_train = np.where(labels_train == 'Solid Tissue Normal', 'Healthy', labels_train)

    # Column positions (within df_train) of the samples of each class.
    hlt_idx = np.where(labels_train == 'Healthy')[0]
    luad_idx = np.where(labels_train == 'LUAD')[0]
    lusc_idx = np.where(labels_train == 'LUSC')[0]

    # Every row except the label row (first column == 'X') is a gene. The same
    # mask is used for the values and for the gene names so they cannot drift apart.
    gene_rows = df_train[df_train.iloc[:, 0] != 'X']
    hlt = gene_rows.iloc[:, hlt_idx].to_numpy(dtype='float')
    luad = gene_rows.iloc[:, luad_idx].to_numpy(dtype='float')
    lusc = gene_rows.iloc[:, lusc_idx].to_numpy(dtype='float')

    # Welch t-test (unequal variances) for all genes at once: axis=1 runs one
    # test per row, replacing the per-row Python loop.
    stats_hltluad = stats.ttest_ind(hlt, luad, axis=1, equal_var=False)
    stats_hltlusc = stats.ttest_ind(hlt, lusc, axis=1, equal_var=False)
    stats_luscluad = stats.ttest_ind(lusc, luad, axis=1, equal_var=False)

    hlt_mean = hlt.mean(axis=1)
    luad_mean = luad.mean(axis=1)
    lusc_mean = lusc.mean(axis=1)

    # p-values and absolute mean differences, one row per gene.
    p_values_df = pd.DataFrame({
        'gene_name': gene_rows['gene_name'].values,
        'HltLusc': stats_hltlusc.pvalue,
        'HltLuad': stats_hltluad.pvalue,
        'LuscLuad': stats_luscluad.pvalue,
        'MeanHltLuad': np.abs(hlt_mean - luad_mean),
        'MeanHltLusc': np.abs(hlt_mean - lusc_mean),
        'MeanLuscLuad': np.abs(lusc_mean - luad_mean),
    })

    p_values_df.to_csv('p-values-matrix/p_values_train' + str(split) + '.csv', index=False)

    # A comparison counts for a gene when it is both significant and large enough.
    # NaN p-values compare as False, so such genes never count.
    passes = pd.DataFrame({
        'HltLuad': (p_values_df['HltLuad'] <= P_VALUE_CUTOFF)
                   & (p_values_df['MeanHltLuad'] >= MEAN_DIFF_CUTOFF),
        'HltLusc': (p_values_df['HltLusc'] <= P_VALUE_CUTOFF)
                   & (p_values_df['MeanHltLusc'] >= MEAN_DIFF_CUTOFF),
        'LuscLuad': (p_values_df['LuscLuad'] <= P_VALUE_CUTOFF)
                    & (p_values_df['MeanLuscLuad'] >= MEAN_DIFF_CUTOFF),
    })
    comp_counter = passes.sum(axis=1)

    # NOTE(review): the rule is "exactly 2 comparisons", which leaves out genes
    # that pass all 3. Kept as in the original analysis -- confirm whether
    # ">= 2" was intended.
    decpgs = p_values_df.loc[comp_counter == REQUIRED_COMPARISONS, 'gene_name'].tolist()

    # Keep the label row ('X') alongside the selected genes.
    decpgs.append('X')

    train_degs = df_train.loc[df_train['gene_name'].isin(decpgs)]
    train_degs.to_csv('train_degs/DEGs_CNV_train' + str(split) + '_p0-001_cov2.csv', index=False)

In [None]:
from glob import glob
for file in sorted(glob('train_degs/*.csv')):
    train_data = pd.read_csv(file)
    train_data = train_data.replace('Blood Derived Normal', 'Healthy')
    train_data = train_data.replace('Solid Tissue Normal', 'Healthy')
    train_data = train_data.replace('X', 'Label')
    name = file.split('/')[-1]
    train_data.to_csv('train_degs2/'+name, index=False)

In [None]:
os.mkdir('train_degs2')

## Machine Learning with first split

In [None]:
# Full NaN-free matrix; its last row is read further down to obtain sample labels.
data_all = pd.read_csv(
    'gene_matrix_CNV_noNA.csv.gz',
    compression='gzip',
)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay, f1_score

# The three target classes; the encoder is fitted on exactly these values.
classes = np.array(['Healthy', 'LUAD', 'LUSC'])

# Dense one-hot encoder. scikit-learn 1.2 renamed `sparse` to `sparse_output`
# and 1.4 removed the old name, so try the new keyword first and fall back to
# the old one on releases that do not know it yet.
try:
    ohe = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = preprocessing.OneHotEncoder(sparse=False)
ohe.fit(classes.reshape(-1,1))

In [None]:
path = 'Splits_10CV/'
# Per-split metrics, summarised after the loop.
train_accs = []
test_accs = []
train_F1 = []
test_F1 = []

for i in range(10):
    print(10*'-')
    print('Split {}/{}'.format(i,10))
    print(10*'-')

    data = pd.read_csv('mrmrDEGs/mrmrDEGs_LC_CNV_3classes_split'+str(i)+'.csv')

    # Case IDs of this split, one per line. Blank lines are dropped: as an empty
    # regex alternative they used to match first and hide every ID after them.
    with open(path+'train_'+str(i)+'.txt', 'r') as train_f:
        train_cids = [cid for cid in (line.replace('\n', '') for line in train_f) if cid]
    with open(path+'val_'+str(i)+'.txt', 'r') as val_f:
        val_cids = [cid for cid in (line.replace('\n', '') for line in val_f) if cid]

    # Row positions whose 'Case_IDs' value starts with one of the split's IDs.
    # Dedicated loop names are used so the split index `i` is not overwritten.
    sample_ids = list(data['Case_IDs'].values)
    train_prefixes = tuple(train_cids)
    val_prefixes = tuple(val_cids)
    train_final = [pos for pos, sample in enumerate(sample_ids)
                   if sample.startswith(train_prefixes)]
    val_final = [pos for pos, sample in enumerate(sample_ids)
                 if sample.startswith(val_prefixes)]

    df_train = data.iloc[train_final]
    df_val = data.iloc[val_final]

    # Labels come from the last row of the full matrix, looked up by sample
    # column; both "normal" sample types are merged into 'Healthy'.
    case_ids_val = df_val['Case_IDs']
    val_df_all = data_all[case_ids_val]
    y_val = val_df_all.iloc[-1,:].values
    y_val = np.where(y_val == 'Blood Derived Normal', 'Healthy', y_val)
    y_val = np.where(y_val == 'Solid Tissue Normal', 'Healthy', y_val)

    case_ids_train = df_train['Case_IDs']
    train_df_all = data_all[case_ids_train]
    y_train = train_df_all.iloc[-1,:].values
    y_train = np.where(y_train == 'Blood Derived Normal', 'Healthy', y_train)
    y_train = np.where(y_train == 'Solid Tissue Normal', 'Healthy', y_train)

    # Columns 1..6 of the split file are used as the feature matrix.
    x_train = df_train.iloc[:,1:7].values
    x_val = df_val.iloc[:,1:7].values
    y_train_ohe = ohe.transform(y_train.reshape(-1,1))
    y_val_ohe = ohe.transform(y_val.reshape(-1,1))
    # Integer class index per sample (position of the 1 in the one-hot row).
    y_train_idx = y_train_ohe.argmax(axis=1)
    y_val_idx = y_val_ohe.argmax(axis=1)

    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [2**-7, 2**-5, 2**-2, 2, 2**4, 2**7],
                        'C': [2**-7, 2**-5, 2**-2, 2, 2**4, 2**7]}]
    clf = GridSearchCV(
                    SVC(probability=True), tuned_parameters, scoring='accuracy'
                )

    # NOTE(review): the scaler is fitted on the whole training split before the
    # grid search runs its internal cross-validation, so the inner validation
    # folds have already influenced the scaling. Wrapping scaler + SVC in a
    # Pipeline would avoid that; left unchanged to keep results comparable.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    clf.fit(x_train, y_train_idx)
    print(clf.best_params_)
    best_params = clf.best_params_
    train_preds = clf.predict(x_train)
    corrects = np.sum(train_preds == y_train_idx)
    train_acc = (corrects / x_train.shape[0]) * 100
    train_f1 = f1_score(y_train_idx, train_preds, average='weighted', labels=[1, 0, 2])
    train_accs.append(train_acc)
    train_F1.append(train_f1)
    print('SVM train acc: {}'.format(train_acc))
    print('SVM train F1: {}'.format(train_f1))
    print('CM \n')
    print(confusion_matrix(y_train_idx, train_preds, labels=[1, 0, 2]))

    # Validation data is scaled with the statistics learned on the training split.
    x_val = scaler.transform(x_val)
    test_preds = clf.predict(x_val)
    corrects = np.sum(test_preds == y_val_idx)
    test_acc = (corrects / x_val.shape[0]) * 100
    test_f1 = f1_score(y_val_idx, test_preds, average='weighted', labels=[1, 0, 2])
    test_accs.append(test_acc)
    test_F1.append(test_f1)
    print('SVM test acc: {}'.format(test_acc))
    print('SVM test F1: {}'.format(test_f1))
    print('CM \n')
    print(confusion_matrix(y_val_idx, test_preds,labels=[1, 0, 2]))

In [None]:
# Mean and standard deviation over the collected per-split scores.
# Accuracies are already percentages; F1 scores are scaled by 100 for display.
acc_train_mean, acc_train_std = np.mean(train_accs), np.std(train_accs)
f1_train_mean, f1_train_std = np.mean(train_F1)*100, np.std(train_F1)*100
acc_test_mean, acc_test_std = np.mean(test_accs), np.std(test_accs)
f1_test_mean, f1_test_std = np.mean(test_F1)*100, np.std(test_F1)*100

print('Mean Acc in train: {}+-{}'.format(acc_train_mean, acc_train_std))
print('Mean F1 in train: {}+-{}'.format(f1_train_mean, f1_train_std))
print(10*'-')
print('Mean Acc in test: {}+-{}'.format(acc_test_mean, acc_test_std))
print('Mean F1 in test: {}+-{}'.format(f1_test_mean, f1_test_std))

In [None]:
# Rebuild unscaled features and one-hot labels from the df_train / df_val /
# y_train / y_val currently in the kernel.
feature_columns = slice(1, 7)  # same six columns as iloc[:, 1:7]
x_train = df_train.iloc[:, feature_columns].values
x_val = df_val.iloc[:, feature_columns].values

y_train_ohe = ohe.transform(y_train.reshape(-1, 1))
y_val_ohe = ohe.transform(y_val.reshape(-1, 1))

In [None]:
# RBF-kernel SVM hyper-parameter grid (same grid as used in the per-split loop).
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [2**-7, 2**-5, 2**-2, 2, 2**4, 2**7],
                        'C': [2**-7, 2**-5, 2**-2, 2, 2**4, 2**7]}]
clf = GridSearchCV(
                SVC(probability=True), tuned_parameters, scoring='accuracy'
            )

# NOTE: x_train is overwritten with its scaled version, so re-running this cell
# without first rebuilding x_train would scale already-scaled data.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
# Integer class index per sample (position of the 1 in the one-hot row).
y_train_idx = y_train_ohe.argmax(axis=1)
clf.fit(x_train, y_train_idx)
print(clf.best_params_)
best_params = clf.best_params_
train_preds = clf.predict(x_train)
corrects = np.sum(train_preds == y_train_idx)
train_acc = (corrects / x_train.shape[0]) * 100
# The model is an SVC; the message previously said "kNN".
print('SVM train acc: {}'.format(train_acc))
print('CM \n')
print(confusion_matrix(y_train_idx, train_preds, labels=[1, 0, 2]))

In [None]:
# Confusion matrix of the training predictions, rows/columns in the order of
# encoded labels 1, 0, 2; the display names below are listed in that same order.
label_order = [1, 0, 2]
label_names = ['LUAD', 'Healthy', 'LUSC']

cm = confusion_matrix(y_train_ohe.argmax(axis=1), train_preds, labels=label_order)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
disp.plot(cmap="Blues")

In [None]:
# Unfitted SVC configured with the best grid-search parameters. It is not used
# for the predictions below, which come from the fitted grid search `clf`.
svm_ = SVC(**best_params)
# Scale the validation features with the statistics learned on the training data.
# NOTE: x_val is overwritten, so re-running this cell alone would scale it twice.
x_val = scaler.transform(x_val)
test_preds = clf.predict(x_val)
# Integer class index per sample (position of the 1 in the one-hot row).
y_val_idx = y_val_ohe.argmax(axis=1)
corrects = np.sum(test_preds == y_val_idx)
test_acc = (corrects / x_val.shape[0]) * 100
# The model is an SVC; the message previously said "kNN".
print('SVM test acc: {}'.format(test_acc))
print('CM \n')
print(confusion_matrix(y_val_idx, test_preds,labels=[1, 0, 2]))

In [None]:
# Confusion matrix of the validation predictions, rows/columns in the order of
# encoded labels 1, 0, 2; the display names below are listed in that same order.
label_order = [1, 0, 2]
label_names = ['LUAD', 'Healthy', 'LUSC']

cm = confusion_matrix(y_val_ohe.argmax(axis=1), test_preds, labels=label_order)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
disp.plot(cmap="Blues")

## t-SNE feature visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt

# Embed x_val into two dimensions.
X_embedded = TSNE(n_components=2).fit_transform(x_val)

# (label value in y_val, marker colour, legend entry), drawn in this order.
class_styles = [
    ('Healthy', 'blue', 'HLT'),
    ('LUAD', 'green', 'LUAD'),
    ('LUSC', 'red', 'LUSC'),
]

plt.figure()
for class_name, colour, legend_name in class_styles:
    # Rows of the embedding that belong to this class.
    members = np.where(y_val == class_name)[0]
    points = X_embedded[members, :]
    plt.scatter(points[:, 0], points[:, 1], c=colour, label=legend_name)
plt.legend()
plt.show()