# pearth neonatal dataset: pre-processing for ml input

### Import Libraries

In [96]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from collections import OrderedDict
from sklearn.preprocessing import StandardScaler
import warnings
# NOTE: dump/load are imported twice; the pickle import on the following line rebinds
# both names, so it is pickle's dump/load (not joblib's) that remain in scope.
from joblib import dump, load
from pickle import dump, load
# silence two known, noisy library warnings; `message` is a regular expression that
# is matched against the start of the warning text
warnings.filterwarnings(action="ignore", message="^internal gelsd")
warnings.filterwarnings(action="ignore", message="Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.")
# global NumPy seeding is currently disabled
#np.random.seed(1)


In [97]:
# show the working directory: the relative data paths used in the cells below are resolved against it
os.getcwd()

'/Users/Ed/Documents/GitHub/sepsis_ml_omics_msc'

### Data Import

In [110]:
# load the raw inputs and report their dimensions

# alternative X matrices: no index_col is set, so the gene identifiers stay in an
# ordinary column (pandas names it "Unnamed: 0")
normalisedCounts = pd.read_csv('dataset_pearth/an0304/output/normalised_counts.csv')
vstNormalisedCounts = pd.read_csv('dataset_pearth/an0304/output/vst_transform.csv')
fpkmNormalisedCounts = pd.read_csv('dataset_pearth/an0304/output/fpkm_normalised_counts.csv')

# annotation table and class labels, each indexed by its first CSV column
labels_data = pd.read_csv('dataset_pearth/an0304/resources/targets.csv', index_col=0)
annotation = pd.read_csv('dataset_pearth/an0304/output/annotation.csv', index_col=0)

# print the (rows, columns) shape of every matrix, then preview the first one
print(f'Normalised counts data shape:  {normalisedCounts.shape}')
print(f'vst counts data shape:  {vstNormalisedCounts.shape}')
print(f'fpkm counts data shape:  {fpkmNormalisedCounts.shape}')
normalisedCounts.head()

Normalised counts data shape:  (60251, 101)
vst counts data shape:  (60251, 101)
fpkm counts data shape:  (60251, 101)


Unnamed: 0.1,Unnamed: 0,10005_sepsis,10006_NOLOS,10009_sepsis,10015_NOLOS,10017_NOLOS,10018_sepsis,10021_NOLOS,10022_sepsis,10029_NOLOS,...,10501_Index,10501_NOLOS,10506_Index,10506_sepsis,10509_Index,10509_NOLOS,10510_Index,10514_Index,10515_Index,10517_NOLOS
0,ENSG00000000003,13.718303,20.014954,5.99982,13.357729,3.549098,5.292374,19.763132,13.186977,1.713466,...,10.885164,11.766278,16.609504,9.207933,5.498076,17.021216,6.89169,33.035014,26.139871,7.548518
1,ENSG00000000005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.586448,0.0,...,0.0,0.0,1.277654,0.0,2.749038,0.362154,0.0,0.0,0.0,1.006469
2,ENSG00000000419,244.969689,245.600166,205.993832,211.052112,209.39678,238.15682,216.955269,235.481738,193.621672,...,214.478037,227.651909,181.426895,233.651308,335.382662,187.595531,242.501345,214.066891,191.09147,208.339102
3,ENSG00000000457,503.657681,407.804689,653.980419,508.929459,498.056749,522.621912,426.005286,269.391108,419.799199,...,448.307475,477.301643,389.684528,494.926417,503.073993,410.682109,477.249539,347.528348,411.026936,468.008127
4,ENSG00000000460,263.587386,240.596427,321.990359,466.184728,248.436858,283.141998,313.135844,244.901007,289.575774,...,303.97827,288.018033,269.585034,253.218167,228.170172,231.416109,289.450984,278.815518,306.467452,242.055816


In [137]:
# load the sample groupings (the first CSV column becomes the row index)
targets = pd.read_csv('dataset_pearth/an0304/resources/targets.csv', index_col=0)

# row labels (analysisID) of the examples whose sampleGroupML2 is sepsis or control;
# every other group (P-LOS and contaminant) is excluded
is_sepsis_or_control = targets['sampleGroupML2'].isin(['sepsis', 'control'])
index_test = targets.index[is_sepsis_or_control]

# keep only those rows of the targets table
targets_filtered = targets.loc[index_test, :]
print(f'The number of examples in the filtered data set is:    {len(targets_filtered)}')

The number of examples in the filtered data set is:    78


### Pre-process X matrix

In [154]:
# pre-processing function for all the perth datasets types

def pre_process_X(raw_data, sample_index=None):
    ''' dataframe -> dataframe

    Turn a genes x samples table of normalised RNA-seq values into a
    samples x genes table of z-scores.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        One row per gene. The gene identifiers must be in a column named
        "Unnamed: 0" (the name pandas gives an unnamed first CSV column) or
        "ensemblGeneID"; every other column is treated as one sample.
    sample_index : list-like of sample names, optional
        If given, only these samples are kept, and they are selected *before*
        the NaN filter and the scaler are applied, so the z-scores are
        computed from the retained samples alone. With the default (None)
        every sample is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Rows are samples, columns are gene identifiers, values are the output
        of StandardScaler fitted on those rows.

    Raises
    ------
    KeyError
        If no gene identifier column is present in raw_data.
    '''
    gene_col = "ensemblGeneID"

    # name the gene identifier column, then keep the first row of any duplicated gene
    df = raw_data.rename(columns={"Unnamed: 0": gene_col})
    if gene_col not in df.columns:
        raise KeyError(
            'expected a gene identifier column named "Unnamed: 0" or "' + gene_col + '"'
        )
    df = df.drop_duplicates(subset=[gene_col], keep='first')

    # move the gene ids into the index *before* transposing: the ids become the
    # column labels wherever the id column sat, and the values stay numeric
    # instead of being upcast to object by the string column
    df = df.set_index(gene_col).transpose()
    df.columns.name = None  # keep the column axis unnamed, as before

    # optionally restrict to the requested samples before any statistics are computed
    if sample_index is not None:
        df = df.loc[sample_index, :]

    # drop genes (columns) containing any NaN values
    df = df.dropna(axis=1, how='any')

    # z-score the data, gene by gene
    scaled_values = StandardScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)
    

In [155]:
# run every count matrix through pre_process_X, then keep only the index_test samples
# NOTE(review): the z-scoring inside pre_process_X is fitted on all samples and the
# subset is taken afterwards, so the retained rows are standardised with statistics
# that include the excluded samples - confirm this is intended.

p_norm_X, p_vst_X, p_fpkm_X = (
    pre_process_X(counts).loc[index_test, :]
    for counts in (normalisedCounts, vstNormalisedCounts, fpkmNormalisedCounts)
)

### Class Labels

In [156]:
# restrict the labels table to the retained samples so it can be cross-referenced
labels_data = labels_data.loc[index_test, :]

# binary label vector aligned with the rows of p_norm_X:
# 1 when the sample name contains 'sepsis', otherwise 0
# NOTE(review): the labels come from the sample-name text, whereas the samples were
# selected with targets['sampleGroupML2'] - confirm the two agree (e.g. for the
# *_Index samples) using the check below.
labels = np.asarray(
    [1 if 'sepsis' in sample_name else 0 for sample_name in p_norm_X.index]
)

# check correct
# list(zip(labels_data['sampleGroupML'], p_norm_X.index, labels))

In [157]:
# sanity checks on the processed matrix and its alignment with the labels table
# (message text is kept verbatim so it matches the recorded output)
print(f'Shapre of dataset:   {p_norm_X.shape}')
print(f'Number of Missing values in dataset:   {p_norm_X.isnull().sum().sum()}')
# features whose value is 0 in every retained sample
print(f'The number of features with all zeros is:   {(p_norm_X == 0).all(axis=0).sum()}')
# number of positions at which the two indices hold the same label
print(f'Check data and annotation indices match:   {(p_norm_X.index == labels_data.index).sum()}')

Shapre of dataset:   (78, 60251)
Number of Missing values in dataset:   0
The number of features with all zeros is:   0
Check data and annotation indices match:   78


### Save to csv

In [158]:
# saving pre-processed datasets

# create the output folder if it is missing, so this cell also works on a fresh
# checkout (to_csv / savetxt do not create parent directories themselves)
os.makedirs('dataset_pearth/an0304/ml_inputs', exist_ok=True)

# feature matrices (samples x genes); the sample names are written as the first column
p_norm_X.to_csv('dataset_pearth/an0304/ml_inputs/p_norm_X.csv')
p_vst_X.to_csv('dataset_pearth/an0304/ml_inputs/p_vst_X.csv')
p_fpkm_X.to_csv('dataset_pearth/an0304/ml_inputs/p_fpkm_X.csv')

# annotation table, written out as loaded
annotation.to_csv('dataset_pearth/an0304/ml_inputs/pearth_annotation.csv')

# label vector, one value per line; np.savetxt's default format writes the 0/1
# labels in scientific notation (e.g. 1.000000000000000000e+00)
np.savetxt('dataset_pearth/an0304/ml_inputs/pearth_y.csv', labels, delimiter=',')

In [103]:
# check whether two data frames are identical: the count of mismatching cells should be zero
#(df1 == df2)[(df1 == df2)==False].count().sum()