# Machine Learning for disease diagnosis using omics data

This notebook and the raw data are available in the GitHub repo: https://github.com/parkyed/sepsis_ml_omics_msc


# Code:

In [20]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from collections import OrderedDict
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.model_selection import KFold, RepeatedKFold
import warnings
from joblib import dump, load
from pickle import dump, load
warnings.filterwarnings(action="ignore", message="^internal gelsd")
warnings.filterwarnings(action="ignore", message="Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.")
#np.random.seed(1)


In [21]:
# show the current working directory (the data file below is read via a relative path)
os.getcwd()

'/Users/Ed/Documents/GitHub/sepsis_ml_omics_msc'

### Data Import

In [22]:
# load the raw table from CSV and report its dimensions

raw_data = pd.read_csv('genomic_data.csv')
print(f'The number of columns is:  {raw_data.shape[1]}')
print(f'The number of row is:  {raw_data.shape[0]}')

The number of columns is:  93
The number of row is:  48804


### Checking for Duplicates

In [11]:
# Count duplicate rows per identifier column. Values that are NaN are removed
# before looking for duplicates, so missing identifiers are never counted as
# duplicates of one another.

dup_column = ['NuID', 'Search_Key', 'ILMN_Gene', 'RefSeq_ID', 'Entrez_Gene_ID', 'Probe_Id']
for column in dup_column:
    # duplicated() flags every repeat after the first occurrence; summing the
    # boolean mask counts them without a Python-level loop over every row
    n_duplicates = raw_data[column].dropna().duplicated().sum()
    print(f"# duplicate rows based on {column}:   "+str(n_duplicates))

# duplicate rows based on NuID:   0
# duplicate rows based on Search_Key:   4671
# duplicate rows based on ILMN_Gene:   11000
# duplicate rows based on RefSeq_ID:   5742
# duplicate rows based on Entrez_Gene_ID:   10897
# duplicate rows based on Probe_Id:   1


In [29]:
# inspect the last 10 rows of the raw table
raw_data.tail(10)

Unnamed: 0,NuID,Species,Source,Search_Key,Transcript,ILMN_Gene,Source_Reference_ID,RefSeq_ID,Unigene_ID,Entrez_Gene_ID,...,Inf_149,Inf_152,Inf_157,Inf_159a,Inf_162a,Inf_164,Inf_191,Inf_198,Inf_203,Fold change
48794,ZZQvowA1A0rdICPUkk,Homo sapiens,RefSeq,ILMN_24212,ILMN_24212,CASP8AP2,NM_012115.2,NM_012115.2,,9994.0,...,7.0826,7.1113,7.0518,7.0041,7.016,6.9972,7.1369,7.0338,7.0494,
48795,ZzTi0SkIUngIr1kHu0,Homo sapiens,RefSeq,ILMN_137958,ILMN_37701,C20ORF7,NM_024120.3,NM_024120.3,,79133.0,...,7.8497,7.7296,7.5873,7.4077,7.4651,7.4088,7.6038,7.5458,7.5341,
48796,ZzTla5Up6pZQklKEU4,Homo sapiens,RefSeq,ILMN_27978,ILMN_27978,TMEM151A,NM_153266.2,NM_153266.2,,256472.0,...,7.0177,6.9288,7.007,6.919,6.9754,6.9154,7.013,7.0644,6.9691,
48797,ZZTZrt5dKGKbmWEeks,Homo sapiens,RefSeq,ILMN_31400,ILMN_31400,LOC645781,XM_933141.2,XM_933141.2,,645781.0,...,7.1248,7.0515,7.0518,7.1918,7.0641,7.0524,6.9758,7.0188,7.0494,
48798,ZztzUERXUExGBETFFU,Homo sapiens,RefSeq,ILMN_14674,ILMN_14674,NPY6R,NR_002713.1,NR_002713.1,,,...,7.001,7.0157,6.9841,6.9733,7.0813,7.0127,6.9264,7.0644,6.9361,
48799,ZzVW43OPaE6GO1WJqs,Homo sapiens,RefSeq,ILMN_139314,ILMN_139314,SEZ6L2,XM_939114.1,XM_939114.1,,26470.0,...,7.1036,7.0593,7.0298,7.177,7.1478,7.0254,7.0245,6.9904,6.9919,
48800,ZZXgKFKE5q5rp53.p0,Homo sapiens,RefSeq,ILMN_42654,ILMN_42654,LOC339047,XM_932631.1,XM_932631.1,,339047.0,...,7.0509,7.0305,6.9952,7.0171,7.0033,7.0127,7.1234,7.0338,7.038,
48801,ZzXXie6icAfD693B6w,Homo sapiens,Unigene,ILMN_104503,ILMN_104503,HS.539415,Hs.539415,,Hs.539415,,...,7.0423,7.0072,7.0298,7.0441,7.016,7.0254,7.0245,7.1131,7.0494,
48802,ZzZ5OziA4Anaop_fcU,Homo sapiens,RefSeq,ILMN_14103,ILMN_165051,MAGI1,NM_015520.1,NM_015520.1,,9223.0,...,7.0423,7.0593,7.16,7.0308,7.0294,7.0672,7.0606,7.0644,6.9804,
48803,,Homo sapiens,RefSeq,ILMN_8135,ILMN_8135,TUBB2A,NM_001069.2,NM_001069.2,,7280.0,...,,,,,,,,,,


### Drop duplicate rows on Probe_Id and transpose

In [63]:
# Reformat the data: drop duplicate probes and the fold-change column, transpose so
# that each probe becomes a column, then drop probes that contain missing values.

# drop duplicate rows based on Probe_Id, and the fold change column
df = raw_data.drop_duplicates(subset=['Probe_Id'], keep='first')
df = df.drop(['Fold change'], axis=1)

# split into the gene annotation dataframe (positional columns 4-14, the last of which
# is Probe_Id) and the data set (Probe_Id plus positional columns 29-91)
gene_df = df.iloc[:, 4:15]
df = df.iloc[:, np.r_[14, 29:92]]

# transpose, use the Probe_Id row as the column labels, then remove that row
df = df.transpose()
df = df.rename(columns=df.iloc[0]).drop(df.index[0])

# drop every probe (column) that has at least one missing value
nan_columns = df.columns[df.isna().any()].tolist()
df = df.drop(nan_columns, axis=1)

# keep the annotation table in step with the data set: remove *all* dropped probes,
# not only the first one; this is also safe when no probe was dropped
gene_df = gene_df[~gene_df['Probe_Id'].isin(nan_columns)]
gene_df

Unnamed: 0,Transcript,ILMN_Gene,Source_Reference_ID,RefSeq_ID,Unigene_ID,Entrez_Gene_ID,GI,Accession,Symbol,Protein_Product,Probe_Id
0,ILMN_23902,PCDHB2,NM_018936.2,NM_018936.2,,56133.0,14195608,NM_018936.2,PCDHB2,NP_061759.1,ILMN_2227757
1,ILMN_309061,DCDC2B,NM_001099434.1,NM_001099434.1,,149069.0,150456470,NM_001099434.1,DCDC2B,NP_001092904.1,ILMN_1683690
2,ILMN_35279,LOC649978,XM_939064.1,XM_939064.1,,649978.0,89030602,XM_939064.1,LOC649978,XP_944157.1,ILMN_1710146
3,ILMN_85446,HS.302418,Hs.302418,,Hs.302418,,8628444,BE165723,,,ILMN_1822171
4,ILMN_26906,GIMAP4,NM_018326.2,NM_018326.2,,55303.0,28416432,NM_018326.2,GIMAP4,NP_060796.1,ILMN_1748473
...,...,...,...,...,...,...,...,...,...,...,...
48798,ILMN_14674,NPY6R,NR_002713.1,NR_002713.1,,,84871997,NR_002713.1,NPY6R,,ILMN_2065273
48799,ILMN_139314,SEZ6L2,XM_939114.1,XM_939114.1,,26470.0,89041080,XM_939114.1,SEZ6L2,XP_944207.1,ILMN_1800873
48800,ILMN_42654,LOC339047,XM_932631.1,XM_932631.1,,339047.0,89039940,XM_932631.1,LOC339047,XP_937724.1,ILMN_1666642
48801,ILMN_104503,HS.539415,Hs.539415,,Hs.539415,,2266088,AA525160,,,ILMN_1908209


In [65]:
# Sanity check: no probe dropped for missing values should remain in the annotation
# table. Probe ids are values of the 'Probe_Id' column, so membership is tested
# against those values rather than against the column labels. Expected: False.
bool(gene_df['Probe_Id'].isin(nan_columns).any())

False

In [44]:

# annotation columns (positional 0-28) are kept separately from the data set
gene_df = raw_data.iloc[:, 0:29]

# Probe_Id (positional column 14) plus positional columns 29-92: de-duplicate on
# Probe_Id, transpose, promote the Probe_Id row to column labels, remove the
# 'Fold change' row and drop columns that are entirely NaN
df = (
    raw_data.iloc[:, np.r_[14, 29:93]]
    .drop_duplicates(subset=['Probe_Id'], keep='first')
    .transpose()
    .pipe(lambda frame: frame.rename(columns=frame.iloc[0]).drop(frame.index[0]))
    .drop(['Fold change'], axis=0)
    .dropna(axis=1, how='all')
)

# label is 0 when the row label contains 'Con', otherwise 1
labels = np.asarray([0 if 'Con' in item else 1 for item in df.index])

# True for each column whose values are all zero
bool_series_zeros = (df == 0).all(axis=0)

print(f'Features:  {len(df.columns)}')
print(f'Examples:  {len(df)}')
print(f'Missing values:   {df.isnull().sum().sum()}')
print(f'The number of columns with all zeros is:   {sum(bool_series_zeros)}')
df.head()

KeyboardInterrupt: 

In [26]:
# build a second copy of the data set with the row labelled 'Inf075' removed

df_no75 = df.drop(['Inf075'], axis=0)

# rebuild the labels for the remaining rows: 0 when the row label contains 'Con',
# otherwise 1; the 'Inf075' entry is skipped
labels_no75 = np.asarray(
    [0 if 'Con' in item else 1 for item in df.index if item != 'Inf075']
)

# check it is correct
#list(zip(df_no75.index, labels_no75))
df_no75.shape

(62, 48802)

### Pre-Processing

1. **Standardisation** - Data standardised (i.e. z-scored) for the following reasons:
- L1 regularisation penalties in logistic regression assume the data are centred at zero and on the same scale
- Distance based ML models such as SVM require and assume standardised data, otherwise variables on larger scales disproportionally impact the model
- I am using the output coefficients from logistic regression as a crude measure of feature importance, and so in order to compare coefficients as a measure of relative importance, variables must be standardised

background - https://towardsdatascience.com/normalization-vs-standardization-quantitative-analysis-a91e8a79cebf

2. **Train, test, split:** 66% train / 33% test

Ref: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html


In [27]:
# z-score every feature with StandardScaler (no imputation is performed in this cell)
# note - the standard scaler could instead be moved into the pipeline for each model

def standardise(examples):
    """Fit a fresh StandardScaler on `examples` and return the transformed data.

    The scaler is fitted on the same data it transforms. The return value is
    whatever `fit_transform` produces; the calls below re-wrap it in a DataFrame
    carrying the original column and row labels.
    """
    return StandardScaler().fit_transform(examples)

# full dataset
X_df = pd.DataFrame(standardise(df), columns=df.columns, index=df.index)
print(X_df.shape)

# dataset excluding patient 75
X_df_no75 = pd.DataFrame(standardise(df_no75), columns=df_no75.columns, index=df_no75.index)
print(X_df_no75.shape)

# Reduced dataset (first 200 columns) for code testing
X_df_red = X_df.iloc[:, :200]
print(X_df_red.shape)

(63, 48802)
(62, 48802)
(63, 200)


In [28]:
# save the preprocessed objects to disk as a single pickled list
import pickle

preprocessed_objects = [gene_df, X_df, X_df_no75, X_df_red, labels, labels_no75]
with open('neonatal_preprocessed.pkl', 'wb') as out_file:
    pickle.dump(preprocessed_objects, out_file)

### De-duplicating based on ensembl_gene_id

NOTE: NOT YET DONE, MAY NEED TO GET THE IDS USING R

In [12]:
# Remove every row whose gene identifier occurs more than once: all copies are
# dropped, none is kept. NOTE: replacing duplicates with their max expression level
# is not implemented in this cell.

# column used to detect duplicates
test_column = 'ILMN_Gene'

# start from one row per unique probe
full_data = raw_data.drop_duplicates(subset=['Probe_Id'], keep='first')

# boolean series flagging every row that shares its test_column value with another
# row; rows with a missing value are excluded first so that NaNs are not treated as
# duplicates of one another (consistent with the other duplicate checks in this file)
dd_data = full_data.dropna(subset=[test_column]).duplicated(subset=[test_column], keep=False)

# index labels of the flagged rows - i.e. the ones where True
dup_index = dd_data[dd_data].index

# create new dataframe, dropping all rows in the duplicate index
clean_df = full_data.drop(index=dup_index)

# display sorted by the same column that was de-duplicated
clean_df.sort_values(by=[test_column])

Unnamed: 0,NuID,Species,Source,Search_Key,Transcript,ILMN_Gene,Source_Reference_ID,RefSeq_ID,Unigene_ID,Entrez_Gene_ID,...,Inf_149,Inf_152,Inf_157,Inf_159a,Inf_162a,Inf_164,Inf_191,Inf_198,Inf_203,Fold change
25103,Ku8QhfS0n_hIOABXuE,Homo sapiens,RefSeq,ILMN_5579,ILMN_183371,7A5,NM_182762.2,NM_182762.2,,346389.0,...,7.0267,6.9027,6.9288,6.9657,7.0408,6.9972,6.9992,6.9750,7.0849,
29049,NlLnpDrjjo8PUQKCJI,Homo sapiens,RefSeq,ILMN_19974,ILMN_19974,A26B1,NM_207355.2,NM_207355.2,,339010.0,...,7.0348,6.9117,7.0070,7.0041,7.1023,7.0822,7.0845,7.0491,7.0978,
42813,Wje_nHHp916e8Akjro,Homo sapiens,RefSeq,ILMN_31305,ILMN_307810,A26C1B,NM_001099771.1,NM_001099771.1,,728378.0,...,7.0592,7.0157,7.0641,6.9264,7.0590,7.1122,7.0724,7.1493,7.0153,
47175,ZfoDn4IUQp4Auf56LU,Homo sapiens,RefSeq,ILMN_8023,ILMN_8023,A2M,NM_000014.4,NM_000014.4,,2.0,...,6.9565,7.0456,7.0070,7.0510,6.9608,7.0524,6.9637,6.9195,7.1239,
7232,97DZJV_gpQpKINl0Ec,Homo sapiens,RefSeq,ILMN_17375,ILMN_17375,A2ML1,NM_144670.2,NM_144670.2,,144568.0,...,7.0741,7.1184,7.0750,7.0510,6.9889,6.9835,7.0724,7.0491,7.0849,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31612,opLIq5393IogH5SiEI,Homo sapiens,RefSeq,ILMN_24644,ILMN_24644,ZUFSP,NM_145062.1,NM_145062.1,,221302.0,...,8.1171,7.7784,7.7894,8.1040,8.1978,7.8296,7.7331,7.8357,7.8272,
252,05KVyf6Big_oj1KLSo,Homo sapiens,RefSeq,ILMN_22056,ILMN_22056,ZW10,NM_004724.2,NM_004724.2,,9183.0,...,7.9831,8.1483,7.9581,7.2240,7.5738,7.8844,7.7331,7.6729,7.5341,-0.3903
15920,EVdUtxU1JfTj09e7Qo,Homo sapiens,RefSeq,ILMN_4827,ILMN_4827,ZXDB,NM_007157.3,NM_007157.3,,158586.0,...,7.1934,7.1242,7.2487,7.0580,7.0641,7.1840,7.1504,7.3298,7.1376,
24628,KoL5XxK6hf_7w3v91U,Homo sapiens,RefSeq,ILMN_16610,ILMN_16610,ZYG11B,NM_024646.1,NM_024646.1,,79699.0,...,8.0346,8.6077,8.3911,8.4578,8.6710,8.2491,8.1199,8.1083,8.6916,


In [13]:
# flag rows that share a 'Transcript' value with another row (rows with a missing
# Transcript are left out of the comparison), then drop all flagged rows
clean_dups = (
    clean_df
    .dropna(subset=['Transcript'])
    .duplicated(subset=['Transcript'], keep=False)
)
dup_index_2 = clean_dups.loc[clean_dups].index
clean_df_2 = clean_df.drop(index=dup_index_2)
clean_df_2

Unnamed: 0,NuID,Species,Source,Search_Key,Transcript,ILMN_Gene,Source_Reference_ID,RefSeq_ID,Unigene_ID,Entrez_Gene_ID,...,Inf_149,Inf_152,Inf_157,Inf_159a,Inf_162a,Inf_164,Inf_191,Inf_198,Inf_203,Fold change
1,0_9dKJSRdZ6nRKsR5o,Homo sapiens,RefSeq,ILMN_42795,ILMN_309061,DCDC2B,NM_001099434.1,NM_001099434.1,,149069.0,...,7.0826,7.0923,7.1350,6.9426,7.0748,6.9416,7.0845,7.0644,7.1239,0.0169
2,0_9Zd7UXqV4FRJ5XdE,Homo sapiens,RefSeq,ILMN_35279,ILMN_35279,LOC649978,XM_939064.1,XM_939064.1,,649978.0,...,7.0010,7.1420,7.1856,7.1013,7.1069,7.0127,7.0489,7.1493,7.1517,0.0317
3,0_CoxIt3fv09UVwOo0,Homo sapiens,Unigene,ILMN_85446,ILMN_85446,HS.302418,Hs.302418,,Hs.302418,,...,7.0177,7.0861,7.0184,7.0380,6.9952,6.9835,7.0963,7.0961,7.0978,-0.0050
4,0_DHz7_krTURH3lVHk,Homo sapiens,RefSeq,ILMN_26906,ILMN_26906,GIMAP4,NM_018326.2,NM_018326.2,,55303.0,...,12.1522,11.0532,12.8357,12.8616,11.9632,12.6279,10.6621,12.0104,12.0981,-0.6525
5,0_dJiKAH3o6kidUsOo,Homo sapiens,Unigene,ILMN_135414,ILMN_135414,HS.583233,Hs.583233,,Hs.583233,,...,7.0665,6.9750,6.9514,7.0041,6.9754,7.1466,6.9992,6.9608,6.9240,-0.0128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48791,ZzoXZ08Fvp.Kjq6aYk,Homo sapiens,Unigene,ILMN_127410,ILMN_127410,HS.575229,Hs.575229,,Hs.575229,,...,6.9830,7.1989,7.1112,7.6210,7.5831,7.1840,7.2971,7.1131,7.0041,
48792,ZzQeInuQpqo7l66ISI,Homo sapiens,Unigene,ILMN_106946,ILMN_106946,HS.543340,Hs.543340,,Hs.543340,,...,7.0348,7.1355,7.0298,7.0171,7.1800,7.0672,7.1234,6.9608,7.0733,
48794,ZZQvowA1A0rdICPUkk,Homo sapiens,RefSeq,ILMN_24212,ILMN_24212,CASP8AP2,NM_012115.2,NM_012115.2,,9994.0,...,7.0826,7.1113,7.0518,7.0041,7.0160,6.9972,7.1369,7.0338,7.0494,
48798,ZztzUERXUExGBETFFU,Homo sapiens,RefSeq,ILMN_14674,ILMN_14674,NPY6R,NR_002713.1,NR_002713.1,,,...,7.0010,7.0157,6.9841,6.9733,7.0813,7.0127,6.9264,7.0644,6.9361,
