# Edinburgh neonatal pre-processing

This notebook and the raw data are available in the GitHub repository: https://github.com/parkyed/sepsis_ml_omics_msc


### Import libraries

In [177]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from collections import OrderedDict
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import warnings
from joblib import dump, load
from pickle import dump, load
# Silence two known-noisy warnings. `message` is a regular expression that is
# matched against the start of the warning text.
warnings.filterwarnings(action="ignore", message="^internal gelsd")
# XGBoost default-eval-metric notice (XGBoost itself is not imported in the cells visible here).
warnings.filterwarnings(action="ignore", message="Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.")
# Global seeding is currently disabled.
#np.random.seed(1)

In [21]:
# Show the working directory: the data files below are read via relative paths.
os.getcwd()

'/Users/Ed/Documents/GitHub/sepsis_ml_omics_msc'

### Data Import

In [123]:
# Load the raw expression table and report its dimensions.

raw_data = pd.read_csv('dataset_edinburgh/genomic_data.csv')
n_rows, n_columns = raw_data.shape
print('The number of columns is:  ' + str(n_columns))
print('The number of row is:  ' + str(n_rows))

The number of columns is:  93
The number of row is:  48804


### Checking for Duplicates

In [124]:
# Count duplicate rows per identifier column. Rows where the identifier is NaN
# are excluded first, so missing values are never counted as duplicates of each other.

dup_column = ['NuID', 'Search_Key', 'ILMN_Gene', 'RefSeq_ID', 'Entrez_Gene_ID', 'Probe_Id']
for column in dup_column:
    # duplicated() flags every repeat after the first occurrence, so summing the
    # boolean mask gives the duplicate count directly.
    n_duplicates = int(raw_data[column].dropna().duplicated().sum())
    print(f"# duplicate rows based on {column}:   "+str(n_duplicates))

# duplicate rows based on NuID:   0
# duplicate rows based on Search_Key:   4671
# duplicate rows based on ILMN_Gene:   11000
# duplicate rows based on RefSeq_ID:   5742
# duplicate rows based on Entrez_Gene_ID:   10897
# duplicate rows based on Probe_Id:   1


### Drop duplicate rows on Probe_Id and transpose

In [154]:
# Reformat the raw table into a samples x probes matrix: drop duplicate probes and
# the fold-change column, transpose, label columns by Probe_Id, and remove probes
# that contain missing values.

# drop duplicate rows based on Probe_Id (first occurrence kept), and the fold change column
df = raw_data.drop_duplicates(subset=['Probe_Id'], keep='first')
df = df.drop(['Fold change'], axis=1)

# split into gene code dataframe (Probe_Id, ILMN_Gene), and data set based on unique probes.
# NOTE(review): the columns are selected by position (14, 5 and 29:92 after the
# drop above), so this breaks silently if the raw file layout changes - confirm
# the positions whenever the input file is updated.
gene_df = df.iloc[:, np.r_[14, 5]]
df = df.iloc[:, np.r_[14, 29:92]]

# transpose so that rows are samples, then promote the first row (Probe_Id) to the
# column labels and remove it from the data
df = df.transpose()
df = df.rename(columns=df.iloc[0]).drop(df.index[0])

# identify probes that contain ANY NaN value, and drop them from both the data df
# and gene_df. isin() keeps the two in step whether zero, one or many probes are
# affected.
nan_columns = df.columns[df.isna().any()].tolist()
df = df.drop(nan_columns, axis=1)
gene_df = gene_df[~gene_df['Probe_Id'].isin(nan_columns)]
print('Number of features in dataset:   '+str(len(df.columns)))
print('Number of genes in the gene name database:    '+str(len(gene_df.index)))
print('Number of examples in dataset:  '+str(len(df)))
print('Number of Missing values in dataset:   '+str(df.isnull().sum().sum()))
print('The number of features with all zeros is:   '+str(sum((df == 0).all(axis=0))))
# positional comparison: the sum equals the number of features only when the
# probe order in df and gene_df is identical
print('Check indices of df and gene_df match. sum of matching index:   '+str(sum(df.columns == gene_df['Probe_Id'])))

# create copy of the data without patient Inf075
df_no75 = df.drop(['Inf075'], axis=0)

Number of features in dataset:   48802
Number of genes in the gene name database:    48802
Number of examples in dataset:  63
Number of Missing values in dataset:   0
The number of features with all zeros is:   0
Check indices of df and gene_df match. sum of matching index:   48802


### Create labels vectors

In [84]:
# Build binary class labels from the sample IDs for both data sets:
# 0 when the ID contains 'Con', otherwise 1.

labels = np.asarray([0 if 'Con' in sample_id else 1 for sample_id in df.index])

# Same labelling with sample 'Inf075' left out.
labels_no75 = np.asarray(
    [0 if 'Con' in sample_id else 1 for sample_id in df.index if sample_id != 'Inf075']
)

# check correct
#list(zip(df.index, labels))
#list(zip(df_no75.index, labels_no75))

### Standardisation (z-scoring)

Data are standardised (i.e. z-scored) for the following reasons:
- L1 regularisation penalties in logistic regression assume data centred at zero and on the same scale
- Distance-based ML models such as SVM require and assume standardised data; otherwise variables on larger scales disproportionately impact the model
- The output coefficients from logistic regression are used as a crude measure of feature importance, so in order to compare coefficients as a measure of relative importance, variables must be standardised

Reference: https://towardsdatascience.com/normalization-vs-standardization-quantitative-analysis-a91e8a79cebf

In [88]:
# standardise using standard scaler - z-scoring (no imputation is performed in this cell)
# note - given an imputer is not necessary, the standard scaler could be moved into the pipeline for each model

def standardise(examples):
    """Fit a StandardScaler on `examples` and return the scaled values.

    The scaler is fitted on, and applied to, the same data via fit_transform.
    """
    return StandardScaler().fit_transform(examples)

# full dataset: scale, then restore the row index and column labels of df
X_df = pd.DataFrame(standardise(df), columns=df.columns, index=df.index)
print(X_df.shape)

# dataset excluding patient 75
X_df_no75 = pd.DataFrame(standardise(df_no75), columns=df_no75.columns, index=df_no75.index)
print(X_df_no75.shape)

# Reduced dataset for code testing: first 200 columns only
X_df_red = X_df.iloc[:, :200]
print(X_df_red.shape)

(63, 48802)
(62, 48802)
(63, 200)


In [101]:
# saving pre-processed datasets
import pickle

# everything is pickled together as a single list, in this order
preprocessed_objects = [gene_df, X_df, X_df_no75, X_df_red, labels, labels_no75]
with open('neonatal_preprocessed.pkl', 'wb') as pickle_file:
    pickle.dump(preprocessed_objects, pickle_file)

# CSV exports of the data set without patient 75, the gene codes and the matching labels
X_df_no75.to_csv('neonatal_data_processed.csv')
gene_df.to_csv('gene_codes_df.csv')
np.savetxt('labels_no75.csv', labels_no75, delimiter=',')

### Import Ensembl gene codes and merge

In [150]:
# Load the Illumina probe -> Ensembl mapping and discard the columns that are not needed.
gene_code_mappings = pd.read_csv('dataset_edinburgh/illuminaht12v3_ensembl_mapping.csv')
gene_code_mappings = gene_code_mappings.drop(columns=['description', 'gene_biotype'])


In [160]:
# Give the probe column the same name as in gene_df so the two tables share a merge key.
probe_column_rename = {'illumina_humanht_12_v3': 'Probe_Id'}
gene_code_mappings = gene_code_mappings.rename(columns=probe_column_rename)

In [166]:
# Number of rows in the probe-to-Ensembl mapping table.
len(gene_code_mappings)

44170

In [172]:
# Left join on Probe_Id: every probe in gene_df is kept. Probes with no mapping get
# missing Ensembl fields, and a probe matched by several mapping rows is repeated,
# so the merged table can have more rows than gene_df.
gene_df_merged = gene_df.merge(gene_code_mappings, how='left', on='Probe_Id')

In [173]:
# Number of rows after the merge (compare with the number of probes in gene_df).
len(gene_df_merged)

58740

In [174]:
# Preview the first rows (up to 30) of the merged annotation table.
gene_df_merged.head(30)

Unnamed: 0,Probe_Id,ILMN_Gene,ensembl_gene_id,external_gene_name
0,ILMN_2227757,PCDHB2,ENSG00000112852,PCDHB2
1,ILMN_1683690,DCDC2B,ENSG00000222046,DCDC2B
2,ILMN_1710146,LOC649978,,
3,ILMN_1822171,HS.302418,,
4,ILMN_1748473,GIMAP4,ENSG00000133574,GIMAP4
5,ILMN_1822843,HS.583233,,
6,ILMN_1710170,PPAP2C,ENSG00000141934,PLPP2
7,ILMN_1704497,KRT25,ENSG00000204897,KRT25
8,ILMN_1847638,HS.579719,,
9,ILMN_1772260,STEAP1,ENSG00000164647,STEAP1


In [175]:
## check that all probes in the raw data set remain in the merged gene annotation:
## the number of distinct Probe_Id values should equal the number of probes in gene_df

len(gene_df_merged['Probe_Id'].unique())

48802

In [176]:
## save merged gene dataset
gene_df_merged.to_csv('gene_codes_merged.csv')
