# pearth neonatal dataset: pre-processing for ml input

### Import Libraries

In [75]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from collections import OrderedDict
from sklearn.preprocessing import StandardScaler
import warnings
from joblib import dump, load
from pickle import dump, load
warnings.filterwarnings(action="ignore", message="^internal gelsd")
warnings.filterwarnings(action="ignore", message="Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.")
#np.random.seed(1)


In [3]:
# show the current working directory; the relative data paths used in the cells below are resolved against it
os.getcwd()

'/Users/Ed/Documents/GitHub/sepsis_ml_omics_msc'

### Data Import

In [81]:
# Load the two count matrices, the gene annotation and the sample targets.
# The count tables are read without index_col, so the gene identifiers stay in an
# ordinary column ("Unnamed: 0") that the pre-processing code renames later.
normalisedCounts = pd.read_csv('dataset_pearth/an0304/output/normalised_counts.csv')
vstNormalisedCounts = pd.read_csv('dataset_pearth/an0304/output/vst_transform.csv')

# annotation and targets use their first column as the row index
annotation = pd.read_csv('dataset_pearth/an0304/output/annotation.csv', index_col=0)
labels_data = pd.read_csv('dataset_pearth/an0304/resources/targets.csv', index_col=0)

# report the dimensions of both count tables, then preview the normalised counts
print(f'Normalised counts data shape:  {normalisedCounts.shape}')
print(f'vst counts data shape:  {vstNormalisedCounts.shape}')
normalisedCounts.head()

Normalised counts data shape:  (60251, 101)
vst counts data shape:  (60251, 101)


Unnamed: 0.1,Unnamed: 0,10005_sepsis,10006_NOLOS,10009_sepsis,10015_NOLOS,10017_NOLOS,10018_sepsis,10021_NOLOS,10022_sepsis,10029_NOLOS,...,10501_Index,10501_NOLOS,10506_Index,10506_sepsis,10509_Index,10509_NOLOS,10510_Index,10514_Index,10515_Index,10517_NOLOS
0,ENSG00000000003,13.718303,20.014954,5.99982,13.357729,3.549098,5.292374,19.763132,13.186977,1.713466,...,10.885164,11.766278,16.609504,9.207933,5.498076,17.021216,6.89169,33.035014,26.139871,7.548518
1,ENSG00000000005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.586448,0.0,...,0.0,0.0,1.277654,0.0,2.749038,0.362154,0.0,0.0,0.0,1.006469
2,ENSG00000000419,244.969689,245.600166,205.993832,211.052112,209.39678,238.15682,216.955269,235.481738,193.621672,...,214.478037,227.651909,181.426895,233.651308,335.382662,187.595531,242.501345,214.066891,191.09147,208.339102
3,ENSG00000000457,503.657681,407.804689,653.980419,508.929459,498.056749,522.621912,426.005286,269.391108,419.799199,...,448.307475,477.301643,389.684528,494.926417,503.073993,410.682109,477.249539,347.528348,411.026936,468.008127
4,ENSG00000000460,263.587386,240.596427,321.990359,466.184728,248.436858,283.141998,313.135844,244.901007,289.575774,...,303.97827,288.018033,269.585034,253.218167,228.170172,231.416109,289.450984,278.815518,306.467452,242.055816


### Pre-process X matrix

In [83]:
# pre-processing function for all the perth datasets types

def pre_process_X(raw_data):

    ''' dataframe -> dataframe

    Turn a genes-by-samples table of normalised RNA-seq values into a
    z-scored samples-by-genes feature matrix.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        One row per gene. The gene identifiers must sit in a column named
        "Unnamed: 0", and that column must be the first one: after the
        transpose, the first row is taken to be the gene IDs. Every other
        column holds the values of one sample.

    Returns
    -------
    pandas.DataFrame
        Samples as rows and genes as columns, scaled column-wise with
        StandardScaler. Genes duplicated in the input are kept once (first
        occurrence) and any gene column containing a NaN is removed.

    Notes
    -----
    Prints a few summary checks as a side effect. The last check compares the
    sample index with the index of the module-level ``labels_data`` frame, so
    ``labels_data`` must be loaded before this function is called and must
    have the same number of rows as there are samples.
    '''
    # rename first column to ensemblGeneID and drop any duplicate rows based on this column
    df = raw_data.rename({"Unnamed: 0": "ensemblGeneID"}, axis=1).drop_duplicates(subset=["ensemblGeneID"], keep='first')

    # transpose the dataframe so genes are the columns
    df = df.transpose()

    # the first row of the transposed frame holds the gene IDs: promote it to the
    # column names, then drop it so that only sample rows remain
    df = df.rename(columns=df.iloc[0]).drop(df.index[0])

    # drop columns containing any NaN values
    nan_columns = df.columns[df.isna().any()].tolist()
    df = df.drop(nan_columns, axis=1)

    # z-score the data; fit_transform's result carries no labels here, so the
    # sample index and gene column names are re-attached
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    X_df = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)

    # summary checks, all computed on the un-scaled frame *after* the NaN columns
    # were dropped (so the missing-value count describes what is left, not the input)
    print('Shapre of dataset:   '+str(df.shape))
    print('Number of Missing values in dataset:   '+str(df.isnull().sum().sum()))
    print('The number of features with all zeros is:   '+str(sum((df == 0).all(axis=0))))
    # number of positions at which the sample index equals the labels_data index
    print('Check data and annotation indices match:   '+str(sum(df.index == labels_data.index)))

    return X_df
    

In [64]:
# Manual version of the reshaping steps: drop duplicate genes, transpose, use the gene IDs
# as column names, drop genes with missing values

# label the gene-identifier column, then keep only the first row seen for each gene ID
df = normalisedCounts.rename(columns={"Unnamed: 0": "ensemblGeneID"})
df = df.drop_duplicates(subset=["ensemblGeneID"], keep='first')

# flip to samples-as-rows; the first row of the transposed frame carries the gene IDs,
# so promote it to the column names and then remove it from the data
df = df.T
df = df.rename(columns=df.iloc[0]).drop(df.index[0])

# list every gene column that has at least one NaN value and remove those columns
nan_columns = df.columns[df.isna().any()].tolist()
df = df.drop(columns=nan_columns)


In [65]:
# sanity checks on the reshaped frame (rows are samples, columns are genes)
print(f'Number of features in dataset:   {df.shape[1]}')
print(f'Number of examples in dataset:  {df.shape[0]}')
print(f'Number of Missing values in dataset:   {df.isna().sum().sum()}')
print(f'The number of features with all zeros is:   {(df == 0).all(axis=0).sum()}')
# number of positions at which the sample index equals the labels_data index
print(f'Check data and annotation indices match:   {(df.index == labels_data.index).sum()}')


Number of features in dataset:   60251
Number of examples in dataset:  100
Number of Missing values in dataset:   0
The number of features with all zeros is:   0
Check data and annotation indices match:   100


In [78]:
# Build the binary target vector from the sample names: 1 when the name contains
# 'sepsis', 0 for every other sample.
# NOTE(review): sample names ending in '_Index' (e.g. 10501_Index) also get 0 here, and
# labels_data (targets.csv) is not consulted - confirm this is the intended labelling.
labels = np.asarray([1 if 'sepsis' in item else 0 for item in df.index])

# check correct: there should be one label per sample
len(list(zip(df.index, labels)))
len(labels)

100

### Standardisation (z-scoring)


In [69]:
# z-score the features with a standard scaler
# note - no imputation is performed here; the scaler could instead be moved into the pipeline of each model

def standardise(examples):
    """Return ``examples`` scaled column-wise by a freshly fitted StandardScaler.

    The scaler is fitted on the supplied data and immediately used to
    transform that same data; the fitted scaler object is not kept.

    Parameters
    ----------
    examples : table of numeric values, one row per example and one column
        per feature (anything StandardScaler.fit_transform accepts).

    Returns
    -------
    The scaled values exactly as returned by StandardScaler.fit_transform
    (a plain numpy array under scikit-learn's default output settings, so row
    and column labels are not carried over).
    """
    return StandardScaler().fit_transform(examples)

# z-score the full matrix and re-attach the sample index and gene column names
X_df = pd.DataFrame(standardise(df), index=df.index, columns=df.columns)
print(X_df.shape)

# Reduced dataset for code testing: the first 200 gene columns only
X_df_red = X_df.iloc[:, :200]
print(X_df_red.shape)

(100, 60251)
(100, 200)


In [90]:
# run the pre-processing function on the same raw normalised counts and preview the first rows
test_df = pre_process_X(normalisedCounts)
test_df.head()

Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000288393,ENSG00000288398,ENSG00000288434,ENSG00000288436,ENSG00000288437,ENSG00000288459,ENSG00000288460,ENSG00000288473,ENSG00000288484,ENSG00000288520
10005_sepsis,0.312256,-0.424177,0.904139,0.748036,0.031825,1.077545,0.21053,-0.39173,-0.567503,1.358795,...,0.880659,-0.0829,-0.184828,-0.723412,-0.367454,-0.366654,-0.392751,-1.082149,-0.270972,1.521119
10006_NOLOS,1.376115,-0.424177,0.919018,-0.124258,-0.340821,0.162808,0.062274,0.013081,-0.233716,-0.057961,...,-0.834456,-0.216122,-0.184828,0.115687,-0.367454,-0.366654,-0.392751,0.052146,-0.270972,0.697217
10009_sepsis,-0.991831,-0.424177,-0.015676,2.116023,0.978441,3.822378,1.578077,0.756023,-0.22756,2.119577,...,1.693574,-0.361442,-0.184828,-1.918433,-0.145369,-0.366654,-0.392751,-2.244922,-0.270972,0.661122
10015_NOLOS,0.251335,-0.424177,0.103697,0.796011,3.315594,-1.209882,-0.738915,-1.186947,0.132331,-0.693583,...,-1.530431,-0.27716,-0.184828,-1.646924,-0.367454,-0.366654,-0.392751,1.452351,-0.270972,-0.242633
10017_NOLOS,-1.405896,-0.424177,0.064632,0.697065,-0.21374,0.450401,-0.673035,0.438527,-0.44043,0.158331,...,0.734843,-0.505576,-0.184828,-0.475653,-0.236083,-0.366654,-0.392751,-0.898623,-0.270972,0.831847


In [89]:
# count the cells where the function output and the manual calculation disagree
# (0 means the two frames are identical element for element)
(test_df != X_df).sum().sum()

0

### Save to csv

In [71]:
# saving pre-processed datasets

# z-scored feature matrix (samples as rows, genes as columns), written with its index
X_df.to_csv('dataset_pearth/an0304/ml_inputs/pearth_X.csv')
# annotation table as loaded earlier (it is not modified in the cells shown here)
annotation.to_csv('dataset_pearth/an0304/ml_inputs/pearth_annotation.csv')
# label vector, one value per line
# NOTE(review): no fmt is passed, so np.savetxt presumably writes the 0/1 labels in its
# default floating-point notation - confirm that downstream readers expect that
np.savetxt('dataset_pearth/an0304/ml_inputs/pearth_y.csv', labels, delimiter=',')