In [None]:
import os
import re

import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm
# Seed NumPy's legacy global random generator so code drawing from it is repeatable across runs.
np.random.seed(12345678)

In [None]:
# Load the raw matrix from CSV. Later cells address a 'CpGs' column by name, select the
# remaining columns by matching case IDs, and read class labels from the last row.
data = pd.read_csv('CpGs_450_matrix.csv')

In [None]:
# Remove one sample column and every row that contains a missing value.
# `errors='ignore'` keeps the cell re-runnable: the previous `del data[...]` raised KeyError
# as soon as the cell was executed a second time on the already-cleaned frame.
data = data.drop(columns=["TCGA-44-2656-01A"], errors='ignore').dropna()



In [None]:
# Persist the cleaned matrix; index=False means the row index is not written to the file.
data.to_csv('CpGs_450_matrix_nonans.csv', index=False)

In [None]:
# Reload the cleaned matrix from disk (the file carries no index column, so the frame gets a
# fresh default index).
data = pd.read_csv('CpGs_450_matrix_nonans.csv')

In [None]:
# Per-split screen for splits 1..9: for every CpG row, Welch t-tests and absolute mean
# differences between the three classes on the training samples, followed by a filter that
# keeps CpGs passing exactly two of the three pairwise comparisons.
# NOTE(review): split 0 is not covered here; the stand-alone cells below write *_train0 / *_val0
# files and appear to be meant for it.
path = 'Splits_10CV_miRNA/'
P_VALUE_CUTOFF = 0.001/3   # p-value threshold: 0.001 divided by the three comparisons
MIN_MEAN_DIFF = 0.4        # minimum absolute difference between the two group means

# The output folders are otherwise only created by a later cell, which made a top-to-bottom
# run fail at the first to_csv call.
os.makedirs('p-values-matrix', exist_ok=True)
os.makedirs('train_degs', exist_ok=True)

columns = list(data.columns)   # loop-invariant, built once instead of once per column
for split in range(1,10):
    print('{}/{}'.format(split,9))
    with open(path+'train_'+str(split)+'.txt', 'r') as train_f:
        train_caseids = train_f.readlines()
    with open(path+'val_'+str(split)+'.txt', 'r') as val_f:
        val_caseids = val_f.readlines()

    train_cids = [cid.replace('\n', '') for cid in train_caseids]
    val_cids = [cid.replace('\n', '') for cid in val_caseids]

    # One alternation per split: a column is selected when its name starts with one of the IDs.
    # The pattern text is unchanged; it is only compiled once instead of re-joined per column.
    train_pattern = re.compile('|'.join(train_cids))
    val_pattern = re.compile('|'.join(val_cids))

    train_final = []
    val_final = []
    for i, column in enumerate(columns):
        train_hit = train_pattern.match(column)
        # An empty match (possible when an ID line is blank) does not count as a hit.
        if train_hit and train_hit.group(0) != '':
            train_final.append(i)
        val_hit = val_pattern.match(column)
        if val_hit and val_hit.group(0) != '':
            val_final.append(i)

    # Always keep column 0 (presumably the 'CpGs' identifier column) in both slices.
    train_final.insert(0, 0)
    val_final.insert(0, 0)
    df_train = data.iloc[:,train_final]
    df_val = data.iloc[:,val_final]

    # The last row is read as the class label of each selected column.
    labels_train = df_train.iloc[-1,:].values
    hlt_idx = np.where(labels_train == 'Healthy')[0]
    luad_idx = np.where(labels_train == 'LUAD')[0]
    lusc_idx = np.where(labels_train == 'LUSC')[0]

    # p-values and means
    p_value_hltluad = []
    p_value_hltlusc = []
    p_value_luscluad = []
    mean_dif_hltluad = []
    mean_dif_hltlusc = []
    mean_dif_luscluad = []
    for _, row in tqdm(df_train.iterrows(), total=len(df_train)):
        # .iloc keeps these accesses positional; plain row[0] / row[int_array] on a
        # string-labelled Series is deprecated positional fallback in recent pandas.
        if row.iloc[0] == 'Label':
            continue

        hlt = row.iloc[hlt_idx].to_numpy(dtype='float')
        luad = row.iloc[luad_idx].to_numpy(dtype='float')
        lusc = row.iloc[lusc_idx].to_numpy(dtype='float')

        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        stats_hltluad = stats.ttest_ind(hlt,luad,equal_var=False)
        stats_hltlusc = stats.ttest_ind(hlt,lusc,equal_var=False)
        stats_luscluad = stats.ttest_ind(lusc,luad,equal_var=False)

        mean_dif_hltluad.append(np.abs(np.mean(hlt) - np.mean(luad)))
        mean_dif_hltlusc.append(np.abs(np.mean(hlt) - np.mean(lusc)))
        mean_dif_luscluad.append(np.abs(np.mean(lusc) - np.mean(luad)))

        p_value_hltluad.append(stats_hltluad.pvalue)
        p_value_hltlusc.append(stats_hltlusc.pvalue)
        p_value_luscluad.append(stats_luscluad.pvalue)

    # [:-1] drops the identifier of the last row, which the loop above skipped as the label row.
    p_values_df = pd.DataFrame({
        'CpGs': df_train['CpGs'].values[:-1],
        'HltLusc': p_value_hltlusc,
        'HltLuad': p_value_hltluad,
        'LuscLuad': p_value_luscluad,
        'MeanHltLuad': mean_dif_hltluad,
        'MeanHltLusc': mean_dif_hltlusc,
        'MeanLuscLuad': mean_dif_luscluad,
    })

    p_values_df.to_csv('p-values-matrix/p_values_train'+str(split)+'.csv', index=False)

    # Count, per CpG, how many comparisons pass both thresholds and keep those with exactly two.
    hltluad_ok = (p_values_df['HltLuad'] <= P_VALUE_CUTOFF) & (p_values_df['MeanHltLuad'] >= MIN_MEAN_DIFF)
    hltlusc_ok = (p_values_df['HltLusc'] <= P_VALUE_CUTOFF) & (p_values_df['MeanHltLusc'] >= MIN_MEAN_DIFF)
    luscluad_ok = (p_values_df['LuscLuad'] <= P_VALUE_CUTOFF) & (p_values_df['MeanLuscLuad'] >= MIN_MEAN_DIFF)
    n_passed = hltluad_ok.astype(int) + hltlusc_ok.astype(int) + luscluad_ok.astype(int)
    decpgs = p_values_df.loc[n_passed == 2, 'CpGs'].tolist()

    # Keep the label row alongside the selected CpG rows.
    decpgs.append('Label')

    train_degs = df_train.loc[df_train['CpGs'].isin(decpgs)]
    train_degs.to_csv('train_degs/CpGs_DE_train'+str(split)+'_p0-001_cov2.csv', index=False)

In [None]:
import os
# Create the output folders for the p-value tables and the filtered training matrices.
# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs('p-values-matrix', exist_ok=True)
os.makedirs('train_degs', exist_ok=True)



In [None]:
# Positions of the matrix columns whose name starts with one of the training / validation IDs.
# NOTE(review): `train_cids` and `val_cids` are not defined in this cell; it reuses whatever
# the per-split loop assigned last, while the files written below are named *_train0 / *_val0.
# Confirm which split these lists are meant to hold.
columns = list(data.columns)                        # built once instead of once per column
train_pattern = re.compile('|'.join(train_cids))    # same pattern text, compiled once
val_pattern = re.compile('|'.join(val_cids))

# Position 0 is always kept, so both lists start with it.
train_final = [0]
val_final = [0]
for i, column in enumerate(columns):
    train_hit = train_pattern.match(column)
    # An empty match (possible when an ID entry is blank) does not count as a hit.
    if train_hit and train_hit.group(0) != '':
        train_final.append(i)
    val_hit = val_pattern.match(column)
    if val_hit and val_hit.group(0) != '':
        val_final.append(i)

In [None]:
# Column-wise slices of the matrix for the training and validation samples; both position
# lists start with 0, so the first column is present in each slice.
df_train = data.iloc[:,train_final]
df_val = data.iloc[:,val_final]

In [None]:
# Values of the last row of the training slice; the next cell compares them with class names.
labels_train = df_train.iloc[-1,:].values

In [None]:
# Positions within `labels_train` of the entries equal to each class name.
hlt_idx, luad_idx, lusc_idx = (
    np.flatnonzero(labels_train == class_name)
    for class_name in ('Healthy', 'LUAD', 'LUSC')
)

In [None]:
# Welch t-test p-values and absolute mean differences between the three classes, one entry
# per row of the training slice (the row whose first value is 'Label' is skipped).
p_value_hltluad = []
p_value_hltlusc = []
p_value_luscluad = []
mean_dif_hltluad = []
mean_dif_hltlusc = []
mean_dif_luscluad = []
for _, row in tqdm(df_train.iterrows(), total=len(df_train)):
    # .iloc keeps these accesses positional; plain row[0] / row[int_array] on a
    # string-labelled Series is deprecated positional fallback in recent pandas.
    if row.iloc[0] == 'Label':
        continue

    hlt = row.iloc[hlt_idx].to_numpy(dtype='float')
    luad = row.iloc[luad_idx].to_numpy(dtype='float')
    lusc = row.iloc[lusc_idx].to_numpy(dtype='float')

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    stats_hltluad = stats.ttest_ind(hlt,luad,equal_var=False)
    stats_hltlusc = stats.ttest_ind(hlt,lusc,equal_var=False)
    stats_luscluad = stats.ttest_ind(lusc,luad,equal_var=False)

    mean_dif_hltluad.append(np.abs(np.mean(hlt) - np.mean(luad)))
    mean_dif_hltlusc.append(np.abs(np.mean(hlt) - np.mean(lusc)))
    mean_dif_luscluad.append(np.abs(np.mean(lusc) - np.mean(luad)))

    p_value_hltluad.append(stats_hltluad.pvalue)
    p_value_hltlusc.append(stats_hltlusc.pvalue)
    p_value_luscluad.append(stats_luscluad.pvalue)

In [None]:
# Collect the per-CpG statistics in one table and write it to disk.
# [:-1] leaves out the identifier of the last row of df_train.
p_values_df = pd.DataFrame({
    'CpGs': df_train['CpGs'].values[:-1],
    'HltLusc': p_value_hltlusc,
    'HltLuad': p_value_hltluad,
    'LuscLuad': p_value_luscluad,
    'MeanHltLuad': mean_dif_hltluad,
    'MeanHltLusc': mean_dif_hltlusc,
    'MeanLuscLuad': mean_dif_luscluad,
})

p_values_df.to_csv('p_values_train0.csv', index=False)

In [None]:
# Reload the p-value table from disk so the filter below can run without recomputing it.
p_values_df = pd.read_csv('p_values_train0.csv')

In [None]:
# A comparison passes when its p-value is at most 0.001/3 and the absolute mean difference
# is at least 0.4. Keep the identifiers (first column) of the rows where exactly two of the
# three comparisons pass. Vectorized; the previous row loop also used the deprecated
# positional lookup row[0].
hltluad_ok = (p_values_df['HltLuad'] <= 0.001/3) & (p_values_df['MeanHltLuad'] >= 0.4)
hltlusc_ok = (p_values_df['HltLusc'] <= 0.001/3) & (p_values_df['MeanHltLusc'] >= 0.4)
luscluad_ok = (p_values_df['LuscLuad'] <= 0.001/3) & (p_values_df['MeanLuscLuad'] >= 0.4)
n_passed = hltluad_ok.astype(int) + hltlusc_ok.astype(int) + luscluad_ok.astype(int)

decpgs = p_values_df.iloc[:, 0][n_passed == 2].tolist()

In [None]:
# Display how many entries the list of selected CpGs currently holds.
len(decpgs)

In [None]:
# Add 'Label' so the row filter below also keeps the label row. The guard keeps the list
# unchanged when the cell is executed more than once.
if 'Label' not in decpgs:
    decpgs.append('Label')

In [None]:
# Keep only the rows of the training slice whose identifier is in the selected list.
# The mask is built from df_train's own 'CpGs' column (as the per-split loop does) rather than
# from the global `data`, which later cells rebind to unrelated frames.
train_degs = df_train.loc[df_train['CpGs'].isin(decpgs)]
train_degs.to_csv('CpGs_DE_train0_p0-001_cov2.csv', index=False)

In [None]:
# Empty frame whose column labels are the values of the 'CpGs' column of train_degs.
new_train_degs = pd.DataFrame(columns = train_degs['CpGs'])

In [None]:
# Display the column labels of the empty frame as a quick check.
new_train_degs.columns

In [None]:
# Same row filter for the validation slice, using df_val's own 'CpGs' column for the mask
# instead of the global `data`, which later cells rebind to unrelated frames.
val_degs = df_val.loc[df_val['CpGs'].isin(decpgs)]
val_degs.to_csv('CpGs_DE_val0_p0-001_cov2.csv', index=False)

In [None]:
# Ordered list of CpG identifiers; later cells take a leading slice of it as the feature set.
mrmrCpG = [
    "cg03555299",
    "cg08566455",
    "cg27649037",
    "cg06188545",
    "cg17283169",
    "cg14294859",
    "cg18121066",
    "cg24597774",
    "cg03502002",
    "cg11201447",
    "cg12222244",
    "cg00074145",
    "cg17510385",
    "cg16759976",
    "cg16404371",
    "cg00415665",
    "cg14557064",
    "cg25521254",
    "cg23746497",
    "cg27071152",
    "cg25115460",
]

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# One-hot encoder fitted on the three class names used in the matrix.
classes = np.array(['Healthy', 'LUAD', 'LUSC'])

# Newer scikit-learn releases take `sparse_output`; older ones only know `sparse`.
# Try the current keyword first and fall back so the cell runs on both.
try:
    ohe = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = preprocessing.OneHotEncoder(sparse=False)
ohe.fit(classes.reshape(-1,1))

In [None]:
# Samples as rows, CpG identifiers (plus 'Label') as columns.
# Indexing by 'CpGs' before transposing turns the identifiers into the header. The previous
# version copied them into the header but left them behind as a first data row, which put the
# string 'Label' into y_train and identifier strings into x_train.
train_degs_T = train_degs.set_index('CpGs').transpose()

In [None]:
# Samples as rows, CpG identifiers (plus 'Label') as columns.
# The previous version dropped the identifier row without ever using it as the header, so the
# columns kept the original row labels and the later `.loc[:, mrmrCpG[...]]` / `.loc[:, 'Label']`
# lookups could not find their columns.
val_degs_T = val_degs.set_index('CpGs').transpose()

In [None]:
# Features: the first seven CpGs of the ranked list; target: the 'Label' column.
x_train = train_degs_T[mrmrCpG[:7]].values
y_train = train_degs_T['Label'].values

In [None]:
# Same seven CpG columns and the 'Label' column, taken from the validation frame.
x_test = val_degs_T[mrmrCpG[:7]].values
y_test = val_degs_T['Label'].values

In [None]:
# One-hot encode the label vectors; the encoder expects a single-column 2-D input.
y_train_ohe = ohe.transform(np.reshape(y_train, (-1, 1)))
y_test_ohe = ohe.transform(np.reshape(y_test, (-1, 1)))

In [None]:
# Grid search over gamma and C for an RBF-kernel SVM, scored by accuracy.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [2**-7, 2**-5, 2**-2, 2, 2**4, 2**7],
                        'C': [2**-7, 2**-5, 2**-2, 2, 2**4, 2**7]}]
clf = GridSearchCV(
                SVC(probability=True), tuned_parameters, scoring='accuracy'
            )

# The classifier is trained on integer class indices (argmax of the one-hot rows).
y_train_idx = y_train_ohe.argmax(axis=1)
clf.fit(x_train, y_train_idx)
print(clf.best_params_)
best_params = clf.best_params_
train_preds = clf.predict(x_train)
corrects = np.sum(train_preds == y_train_idx)
train_acc = (corrects / x_train.shape[0]) * 100
# The model is an SVM; the message previously said "kNN".
print('SVM train acc: {}'.format(train_acc))

In [None]:
# Unfitted SVC built from the best parameters; it is not used for the predictions below,
# which come from the fitted grid-search object.
svm_ = SVC(**best_params)
test_preds = clf.predict(x_test)
corrects = np.sum(test_preds == y_test_ohe.argmax(axis=1))
test_acc = (corrects / x_test.shape[0]) * 100
# The model is an SVM; the message previously said "kNN".
print('SVM test acc: {}'.format(test_acc))

## Machine Learning Assessment

In [None]:
# Full matrix read from a gzip-compressed CSV; later cells take its last row as the label of
# each column.
# NOTE(review): the earlier cell writes 'CpGs_450_matrix_nonans.csv' uncompressed — confirm
# where the '.csv.gz' copy comes from.
data_all = pd.read_csv('CpGs_450_matrix_nonans.csv.gz', compression='gzip')

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay, f1_score
import numpy as np
import pandas as pd

# One-hot encoder fitted on the class names used by the evaluation loops below.
classes = np.array(['adeno', 'squa', 'healthy'])

# Newer scikit-learn releases take `sparse_output`; older ones only know `sparse`.
# Try the current keyword first and fall back so the cell runs on both.
try:
    ohe = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = preprocessing.OneHotEncoder(sparse=False)
ohe.fit(classes.reshape(-1,1))

In [None]:
# Display the encoding of a single 'adeno' label to see which column it maps to.
ohe.transform(np.array(['adeno']).reshape(-1,1))

In [None]:
import os
# Folder for the Excel result files; exist_ok=True keeps the cell re-runnable
# (os.mkdir raised FileExistsError when the folder was already there).
os.makedirs('results_excels', exist_ok=True)

In [None]:
from openpyxl import load_workbook
# 10-split evaluation of an RBF SVM on the first `n_cpgs` feature columns of each split's
# table. Per-sample predictions and class probabilities are written to one Excel sheet per split.
path = 'Splits_10CV/'
train_accs = []
test_accs = []
train_F1 = []
test_F1 = []

n_cpgs = 6
# Last row of the full matrix, indexed by column name: looked up below by case ID.
all_labels = data_all.iloc[-1,:]
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [2**-7, 2**-5, 2**-2, 2, 2**4, 2**7],
                    'C': [2**-7, 2**-5, 2**-2, 2, 2**4, 2**7]}]

# Context managers close (and save) both workbooks even if a split raises part-way through.
with pd.ExcelWriter('results_excels/DNA-Methylation'+str(n_cpgs)+'CpGs_test.xlsx', engine='openpyxl') as writer_test, \
        pd.ExcelWriter('results_excels/DNA-Methylation'+str(n_cpgs)+'CpGs_train.xlsx', engine='openpyxl') as writer_train:
    for split in range(10):
        print(10*'-')
        print('Split {}/{}'.format(split,10))
        print(10*'-')

        print('Data read...')
        # Own name for the per-split table so the global `data` matrix is not overwritten.
        split_df = pd.read_csv('mrmrCpGs/mrmrCpGs_LC_DNA_3classes_split'+str(split)+'.csv')
        with open(path+'train_'+str(split)+'.txt', 'r') as train_f:
            train_caseids = train_f.readlines()
        with open(path+'val_'+str(split)+'.txt', 'r') as val_f:
            val_caseids = val_f.readlines()

        train_cids = [cid.replace('\n', '') for cid in train_caseids]
        val_cids = [cid.replace('\n', '') for cid in val_caseids]

        # A row belongs to a subset when its Case_IDs value starts with one of that subset's IDs.
        # Same pattern text as before, compiled once per split instead of re-joined per row.
        train_pattern = re.compile('|'.join(train_cids))
        val_pattern = re.compile('|'.join(val_cids))

        train_final = []
        val_final = []
        for row_pos, case_id in enumerate(split_df['Case_IDs'].values):
            train_hit = train_pattern.match(case_id)
            # An empty match (possible when an ID line is blank) does not count as a hit.
            if train_hit and train_hit.group(0) != '':
                train_final.append(row_pos)
            val_hit = val_pattern.match(case_id)
            if val_hit and val_hit.group(0) != '':
                val_final.append(row_pos)

        # The former `insert(0, 1)` calls forced row 1 into BOTH subsets, leaking one sample
        # between training and validation (the later sweep cell already has them disabled).
        df_train = split_df.iloc[train_final]
        df_val = split_df.iloc[val_final]

        case_ids_val = df_val['Case_IDs']
        y_val = all_labels[case_ids_val].values
        # Rename the matrix labels to the class names the encoder was fitted on.
        y_val = np.where(y_val == 'Healthy', 'healthy', y_val)
        y_val = np.where(y_val == 'LUAD', 'adeno', y_val)
        y_val = np.where(y_val == 'LUSC', 'squa', y_val)

        case_ids_train = df_train['Case_IDs']
        y_train = all_labels[case_ids_train].values
        y_train = np.where(y_train == 'Healthy', 'healthy', y_train)
        y_train = np.where(y_train == 'LUAD', 'adeno', y_train)
        y_train = np.where(y_train == 'LUSC', 'squa', y_train)

        # Columns 1..n_cpgs are used as features (column 0 is skipped).
        x_train = df_train.iloc[:,1:n_cpgs+1].values
        x_val = df_val.iloc[:,1:n_cpgs+1].values
        y_train_ohe = ohe.transform(y_train.reshape(-1,1))
        y_val_ohe = ohe.transform(y_val.reshape(-1,1))
        y_train_idx = y_train_ohe.argmax(axis=1)
        y_val_idx = y_val_ohe.argmax(axis=1)

        clf = GridSearchCV(
                        SVC(probability=True), tuned_parameters, scoring='accuracy'
                    )
        print('End data read...')

        #### SVM TRAINING
        print('Svm training...')
        clf.fit(x_train, y_train_idx)
        print(clf.best_params_)
        best_params = clf.best_params_
        train_preds = clf.predict(x_train)
        corrects = np.sum(train_preds == y_train_idx)
        train_acc = (corrects / x_train.shape[0]) * 100
        train_f1 = f1_score(y_train_idx, train_preds, average='weighted')
        train_accs.append(train_acc)
        train_F1.append(train_f1*100)
        train_probs = clf.predict_proba(x_train)
        print('SVM train acc: {}'.format(train_acc))
        print('SVM train F1: {}'.format(train_f1))
        print('CM \n')
        print(confusion_matrix(y_train_idx, train_preds))

        #### SVM TEST
        # Unfitted SVC with the best parameters; predictions below come from the fitted search.
        svm_ = SVC(**best_params)
        test_preds = clf.predict(x_val)
        corrects = np.sum(test_preds == y_val_idx)
        test_acc = (corrects / x_val.shape[0]) * 100
        test_f1 = f1_score(y_val_idx, test_preds, average='weighted')
        test_accs.append(test_acc)
        test_F1.append(test_f1*100)
        test_probs = clf.predict_proba(x_val)
        print('SVM test acc: {}'.format(test_acc))
        print('SVM test F1: {}'.format(test_f1))
        print('CM \n')
        print(confusion_matrix(y_val_idx, test_preds))

        #### SVM SAVE PREDS
        print("Saving SVM predictions... \n")

        sheet_name = 'split_'+str(split)

        # NOTE(review): the probability column names assume class indices 0/1/2 stand for
        # adeno/healthy/squa — confirm against the encoder's category order.
        test_results = pd.DataFrame()
        test_results['Case_Ids'] = case_ids_val
        test_results['Preds'] = test_preds
        test_results['Prob LUAD'] = test_probs[:, 0]
        test_results['Prob HLT'] = test_probs[:, 1]
        test_results['Prob LUSC'] = test_probs[:, 2]
        test_results['Real'] = y_val_idx
        test_results.to_excel(writer_test, sheet_name = sheet_name)

        train_results = pd.DataFrame()
        train_results['Case_Ids'] = case_ids_train
        train_results['Preds'] = train_preds
        train_results['Prob LUAD'] = train_probs[:, 0]
        train_results['Prob HLT'] = train_probs[:, 1]
        train_results['Prob LUSC'] = train_probs[:, 2]
        train_results['Real'] = y_train_idx
        train_results.to_excel(writer_train, sheet_name=sheet_name)

In [None]:
# Mean and standard deviation of accuracy and F1 across the splits, training block first.
summary_lines = [
    'Mean Acc in train: {}+-{}'.format(np.mean(train_accs),np.std(train_accs)),
    'Mean F1 in train: {}+-{}'.format(np.mean(train_F1),np.std(train_F1)),
    10*'-',
    'Mean Acc in test: {}+-{}'.format(np.mean(test_accs),np.std(test_accs)),
    'Mean F1 in test: {}+-{}'.format(np.mean(test_F1),np.std(test_F1)),
]
print('\n'.join(summary_lines))

In [None]:
from collections import Counter
# Class counts of `y_val` as left by the most recent split of the evaluation loop.
print(Counter(y_val))

## TSNE Visualization

In [None]:
import os
# Folder for the saved figures; exist_ok=True keeps the cell re-runnable
# (os.mkdir raised FileExistsError when the folder was already there).
os.makedirs('plots', exist_ok=True)

In [None]:
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt
# 2-D t-SNE embedding of `x_val`, i.e. the validation features left over from the last split
# processed by the evaluation loop.
X_embedded = TSNE(n_components=2).fit_transform(x_val)

# The evaluation loop renames the labels in `y_val` to 'healthy' / 'adeno' / 'squa', so the
# previous comparisons against 'Healthy' / 'LUAD' / 'LUSC' selected no points at all.
# Both spellings are accepted so the cell works whichever form `y_val` currently has.
hlt = np.where(np.isin(y_val, ['Healthy', 'healthy']))[0]
luad = np.where(np.isin(y_val, ['LUAD', 'adeno']))[0]
lusc = np.where(np.isin(y_val, ['LUSC', 'squa']))[0]

X_embedded_hlt = X_embedded[hlt,:]
X_embedded_luad = X_embedded[luad,:]
X_embedded_lusc = X_embedded[lusc,:]

# One scatter per class so each gets its own colour and legend entry.
plt.figure()
plt.scatter(X_embedded_hlt[:, 0], X_embedded_hlt[:, 1], c='blue', label='HLT')
plt.scatter(X_embedded_luad[:, 0], X_embedded_luad[:, 1], c='green', label='LUAD')
plt.scatter(X_embedded_lusc[:, 0], X_embedded_lusc[:, 1], c='red', label='LUSC')
plt.legend()
plt.savefig('plots/tsne_lastfold_cov2.png', format='png', dpi=300)
plt.show()

## Test optimal number of CpGs

In [None]:
from tqdm.notebook import trange
# All range of genes results
# Sweep over the number of feature columns: for each n_cpgs, run the 10-split SVM evaluation
# on standardized features and record mean / std of accuracy and F1.
path = 'Splits_10CV/'


range_cpgs = 17   # exclusive upper bound: n_cpgs takes the values 1 .. range_cpgs-1
# Last row of the full matrix, indexed by column name: looked up below by case ID.
all_labels = data_all.iloc[-1,:]
global_train_accs = {'mean':[],'std':[]}
global_test_accs = {'mean':[],'std':[]}
global_train_f1 = {'mean':[],'std':[]}
global_test_f1 = {'mean':[],'std':[]}
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [2**-7, 2**-5, 2**-2, 2, 2**4, 2**7],
                    'C': [2**-7, 2**-5, 2**-2, 2, 2**4, 2**7]}]
for n_cpgs in trange(1,range_cpgs):
    train_accs = []
    test_accs = []
    train_F1 = []
    test_F1 = []
    # `split` / `row_pos` replace the single name `i` that was used for both the split number
    # and, further down, the row position.
    for split in range(10):
        print(10*'-')
        print('Split {}/{}'.format(split,10))
        print(10*'-')

        print('Data read...')
        # Own name for the per-split table so the global `data` matrix is not overwritten.
        split_df = pd.read_csv('mrmrCpGs/mrmrCpGs_LC_DNA_3classes_split'+str(split)+'.csv')

        with open(path+'train_'+str(split)+'.txt', 'r') as train_f:
            train_caseids = train_f.readlines()
        with open(path+'val_'+str(split)+'.txt', 'r') as val_f:
            val_caseids = val_f.readlines()

        train_cids = [cid.replace('\n', '') for cid in train_caseids]
        val_cids = [cid.replace('\n', '') for cid in val_caseids]

        # A row belongs to a subset when its Case_IDs value starts with one of that subset's IDs.
        # Same pattern text as before, compiled once per split instead of re-joined per row.
        train_pattern = re.compile('|'.join(train_cids))
        val_pattern = re.compile('|'.join(val_cids))

        train_final = []
        val_final = []
        for row_pos, case_id in enumerate(split_df['Case_IDs'].values):
            train_hit = train_pattern.match(case_id)
            # An empty match (possible when an ID line is blank) does not count as a hit.
            if train_hit and train_hit.group(0) != '':
                train_final.append(row_pos)
            val_hit = val_pattern.match(case_id)
            if val_hit and val_hit.group(0) != '':
                val_final.append(row_pos)

        df_train = split_df.iloc[train_final]
        df_val = split_df.iloc[val_final]

        case_ids_val = df_val['Case_IDs']
        y_val = all_labels[case_ids_val].values
        # Rename the matrix labels to the class names the encoder was fitted on.
        y_val = np.where(y_val == 'Healthy', 'healthy', y_val)
        y_val = np.where(y_val == 'LUAD', 'adeno', y_val)
        y_val = np.where(y_val == 'LUSC', 'squa', y_val)

        case_ids_train = df_train['Case_IDs']
        y_train = all_labels[case_ids_train].values
        y_train = np.where(y_train == 'Healthy', 'healthy', y_train)
        y_train = np.where(y_train == 'LUAD', 'adeno', y_train)
        y_train = np.where(y_train == 'LUSC', 'squa', y_train)

        # Columns 1..n_cpgs are used as features (column 0 is skipped).
        x_train = df_train.iloc[:,1:n_cpgs+1].values
        x_val = df_val.iloc[:,1:n_cpgs+1].values
        y_train_ohe = ohe.transform(y_train.reshape(-1,1))
        y_val_ohe = ohe.transform(y_val.reshape(-1,1))
        y_train_idx = y_train_ohe.argmax(axis=1)
        y_val_idx = y_val_ohe.argmax(axis=1)
        print('End data read...')

        print('Svm training...')
        clf = GridSearchCV(
                        SVC(probability=True), tuned_parameters, scoring='accuracy'
                    )

        # The scaler is fitted on the training features only and reused for validation.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        clf.fit(x_train, y_train_idx)
        print(clf.best_params_)
        best_params = clf.best_params_
        train_preds = clf.predict(x_train)
        corrects = np.sum(train_preds == y_train_idx)
        train_acc = (corrects / x_train.shape[0]) * 100
        train_f1 = f1_score(y_train_idx, train_preds, average='weighted', labels=[1, 0, 2])
        train_accs.append(train_acc)
        train_F1.append(train_f1*100)
        print('SVM train acc: {}'.format(train_acc))
        print('SVM train F1: {}'.format(train_f1))
        print('CM \n')
        print(confusion_matrix(y_train_idx, train_preds, labels=[1, 0, 2]))

        # Unfitted SVC with the best parameters; predictions below come from the fitted search.
        svm_ = SVC(**best_params)
        x_val = scaler.transform(x_val)
        test_preds = clf.predict(x_val)
        corrects = np.sum(test_preds == y_val_idx)
        test_acc = (corrects / x_val.shape[0]) * 100
        test_f1 = f1_score(y_val_idx, test_preds, average='weighted', labels=[1, 0, 2])
        test_accs.append(test_acc)
        test_F1.append(test_f1*100)
        print('SVM test acc: {}'.format(test_acc))
        print('SVM test F1: {}'.format(test_f1))
        print('CM \n')
        print(confusion_matrix(y_val_idx, test_preds,labels=[1, 0, 2]))
    print('Mean Acc in train: {}+-{}'.format(np.mean(train_accs),np.std(train_accs)))
    print('Mean F1 in train: {}+-{}'.format(np.mean(train_F1),np.std(train_F1)))
    print(10*'-')
    print('Mean Acc in test: {}+-{}'.format(np.mean(test_accs),np.std(test_accs)))
    print('Mean F1 in test: {}+-{}'.format(np.mean(test_F1),np.std(test_F1)))

    # One entry per n_cpgs value, in sweep order.
    global_train_accs['mean'].append(np.mean(train_accs))
    global_test_accs['mean'].append(np.mean(test_accs))
    global_train_f1['mean'].append(np.mean(train_F1))
    global_test_f1['mean'].append(np.mean(test_F1))
    global_train_accs['std'].append(np.std(train_accs))
    global_test_accs['std'].append(np.std(test_accs))
    global_train_f1['std'].append(np.std(train_F1))
    global_test_f1['std'].append(np.std(test_F1))

In [None]:
import matplotlib
import matplotlib.pyplot as plt

def _plot_mean_and_std(metric_stats, title, ylabel, out_file):
    """Plot a metric's mean (left axis) and std (right axis) against the number of CpGs.

    Parameters
    ----------
    metric_stats : dict
        Dictionary with 'mean' and 'std' lists holding one entry per number of CpGs,
        in sweep order starting at 1.
    title : str
        Figure title.
    ylabel : str
        Label of the left (mean) axis.
    out_file : str
        Path of the PNG file the figure is saved to.
    """
    # The x values follow the number of recorded points. The former hard-coded range(1,16)
    # breaks with a length mismatch whenever the sweep records a different number of points.
    n_values = list(range(1, len(metric_stats['mean']) + 1))

    fig, ax1 = plt.subplots()
    ax1.set_title(title)
    ax1.set_xlabel('#CpGs')
    ax1.set_ylabel(ylabel)
    ax1.plot(n_values, metric_stats['mean'], color="blue", label='mean')
    ax1.tick_params(axis='y')

    ax2 = ax1.twinx()  # second axes sharing the same x-axis
    ax2.set_ylabel('std')  # the x-label is already set on ax1
    ax2.plot(n_values, metric_stats['std'], color='red', label='std')
    ax2.tick_params(axis='y')

    # The lines now carry labels, so the legends have entries; opposite corners avoid overlap.
    ax1.legend(loc='upper left')
    ax2.legend(loc='upper right')
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.savefig(out_file, format='png', dpi=300)
    plt.show()
    plt.close(fig)


# Test accuracy and F1 across the number of CpGs used as features.
_plot_mean_and_std(global_test_accs, 'Accuracy DNA Methy', 'Accuracy',
                   'plots/acc_range_dna_cov2.png')
_plot_mean_and_std(global_test_f1, 'F1 DNA Methy', 'F1-Score',
                   'plots/f1_range_dna_cov2.png')