# Wong paediatrics dataset: pre-processing for ML input

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from collections import OrderedDict
from sklearn.preprocessing import StandardScaler
import warnings
from joblib import dump, load
from pickle import dump, load
warnings.filterwarnings(action="ignore", message="^internal gelsd")
#np.random.seed(1)


In [2]:
# show the working directory: the data paths used in this notebook are relative to it
os.getcwd()

'/Users/Ed/Documents/GitHub/sepsis_ml_omics_msc'

### Data Import

In [3]:
# load the expression matrix and the sample annotation, then check the shape

# X matrix: read without index_col, so the first CSV column stays a regular column
counts_csv = 'dataset_wong/output/normalised_exp.csv'
normalisedCounts = pd.read_csv(counts_csv)

# annotation and class labels, indexed by the first column of the file
targets_csv = 'dataset_wong/output/targets.csv'
labels_data = pd.read_csv(targets_csv, index_col=0)

# check
print(f'Normalised counts data shape:  {normalisedCounts.shape}')
normalisedCounts.head()

Normalised counts data shape:  (54675, 247)


Unnamed: 0.1,Unnamed: 0,GSM1617492,GSM1617493,GSM1617494,GSM1617495,GSM1617496,GSM1617497,GSM1617498,GSM1617499,GSM1617500,...,GSM1617758,GSM1617759,GSM1617760,GSM1617761,GSM1617762,GSM1617763,GSM1617764,GSM1617765,GSM1617766,GSM1617767
0,1007_s_at,6.480428,6.309763,6.178332,6.429684,6.70496,6.761714,6.590785,6.393995,6.170426,...,7.102116,7.05578,6.913683,6.489448,6.500215,6.429381,6.992181,6.392452,6.490935,7.173261
1,1053_at,5.957581,5.910509,6.429563,6.074588,5.733254,5.748208,6.249178,6.096216,5.609574,...,5.93311,6.206179,5.930328,5.185324,6.302903,6.67098,6.201271,5.341256,6.615973,5.692392
2,117_at,8.76728,9.219351,9.609292,8.779422,7.482476,7.32936,8.40274,9.652488,8.365256,...,7.133309,8.227304,8.230635,7.256653,9.514109,9.240488,10.481245,8.877384,10.170809,8.534553
3,121_at,8.68531,8.267696,7.746195,7.751903,8.007167,8.148816,8.346299,8.203552,8.146709,...,7.942987,8.239259,8.305233,9.005083,8.087161,7.800295,7.995118,8.518748,7.800516,8.712825
4,1255_g_at,2.949759,3.009085,2.660869,2.950493,2.843097,3.19967,2.8916,2.896296,2.872295,...,2.920644,3.024307,3.092109,3.293336,2.85414,2.898836,2.930668,3.019821,2.850628,2.988359


### Pre-process X matrix

In [4]:
# pre-processing function shared by the expression matrices loaded in this notebook

def pre_process_X(raw_data, id_column="Unnamed: 0"):
    '''Turn a probe-by-sample expression table into a z-scored sample-by-probe matrix.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        One row per probe. One column holds the probe ID and every other column
        holds the expression values of one sample. The input is not modified.
    id_column : str, default "Unnamed: 0"
        Name of the probe ID column. A frame whose ID column is already called
        "affy_probe_ID" is accepted as well.

    Returns
    -------
    pandas.DataFrame
        Rows are the sample columns of ``raw_data``, columns are the unique probe
        IDs. Probes with a missing value in any sample are removed and every
        remaining probe is standardised with ``StandardScaler``.

    Raises
    ------
    KeyError
        If ``raw_data`` contains no probe ID column.

    Notes
    -----
    The scaler is fitted on every sample passed in. If the result is split into
    train and test sets afterwards, the test samples have already contributed to
    the scaling statistics.
    '''
    probe_col = "affy_probe_ID"

    renamed = raw_data.rename(columns={id_column: probe_col})
    if probe_col not in renamed.columns:
        raise KeyError(f"probe ID column {id_column!r} not found in raw_data")

    # keep the first row of every probe ID, then move the IDs into the row labels so
    # that only expression values are left in the body before transposing (transposing
    # with the ID column still in place would push the whole frame through object dtype)
    by_probe = renamed.drop_duplicates(subset=[probe_col], keep='first').set_index(probe_col)

    # samples become rows, probes become columns; the columns axis stays unnamed
    by_sample = by_probe.transpose().rename_axis(None, axis=1)

    # drop probes that have a missing value in any sample
    by_sample = by_sample.dropna(axis=1, how='any')

    # z-score every probe across the samples
    scaled = StandardScaler().fit_transform(by_sample)
    return pd.DataFrame(scaled, columns=by_sample.columns, index=by_sample.index)
    

In [53]:
# run the pre-processing on the normalised expression matrix and preview the result

w_norm_X = pre_process_X(normalisedCounts)
w_norm_X.head()


Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
GSM1617492,0.204287,-0.646879,-0.18521,1.77549,-0.158048,-0.232908,-0.052714,1.228258,0.727841,0.097294,...,0.237008,0.239686,-0.364037,-0.223004,-1.106239,-0.497183,1.715007,-0.785625,0.694494,-0.074895
GSM1617493,-0.261806,-0.74275,0.286141,0.590625,0.279996,0.351596,-0.364469,-0.098497,1.154687,0.307041,...,0.376046,0.412585,-0.712371,-0.135464,-1.203162,0.400214,-0.091309,-0.053911,-0.411598,-0.683423
GSM1617494,-0.620747,0.314393,0.692711,-0.888989,-2.291128,0.792257,-1.311134,1.319395,1.314919,0.44523,...,0.574173,0.683378,0.381488,0.429228,-1.583988,-0.869088,-0.026533,-0.042388,-0.980302,0.781058
GSM1617495,0.065702,-0.408575,-0.172549,-0.872795,-0.152624,0.793982,-1.166853,0.30592,1.145665,-0.269316,...,-0.319668,-0.410791,-0.857264,-0.84827,-0.180219,-0.653681,-0.827999,0.392818,-0.53685,0.191827
GSM1617496,0.817492,-1.103759,-1.524803,-0.148555,-0.945609,-0.340107,-0.41909,0.661588,0.228307,-0.060534,...,1.436831,1.801459,1.046046,1.234849,-0.132124,-0.566261,-1.193691,-1.172549,-0.3447,1.201937


In [8]:
# dimensions of the processed matrix
w_norm_X.shape

(246, 54675)

In [55]:
# sanity checks on the processed matrix
print(f'Shape of dataset:   {w_norm_X.shape}')
print(f'Number of Missing values in dataset:   {w_norm_X.isnull().sum().sum()}')
print(f'The number of features with all zeros is:   {sum((w_norm_X == 0).all(axis=0))}')
# the value printed here is the number of positions where the two indices differ,
# so 0 means the rows of the matrix line up with the annotation
print(f'Check data and annotation indices match:   {sum(w_norm_X.index != labels_data.index)}')

Shape of dataset:   (246, 54675)
Number of Missing values in dataset:   0
The number of features with all zeros is:   0
Check data and annotation indices match:   0


### Class Labels

In [135]:
# create the binary class vector from the annotation: 0 = 'Control', 1 = any other condition
# (vectorised; the order follows the rows of labels_data)
labels = np.where(labels_data['condition'] == 'Control', 0, 1)

# keep the encoded class next to the annotation it was derived from
labels_data['class'] = labels

### Ensembl gene ID filter: Affymetrix probe -> Ensembl gene

In [79]:
## load the probe -> gene mappings exported from biomaRt, without the columns not used here
gcMap = (
    pd.read_csv('dataset_wong/Affy_HG-U133_Plus_2_ensembl_mapping.csv')
    .drop(columns=['description', 'gene_biotype'])
    .rename(columns={'affy_hg_u133_plus_2': 'Probe_Id'})
)
gcMap.head()

Unnamed: 0,Probe_Id,ensembl_gene_id,external_gene_name,chromosome_name
0,1553551_s_at,ENSG00000198888,MT-ND1,MT
1,1553551_s_at,ENSG00000210100,MT-TI,MT
2,1553551_s_at,ENSG00000210112,MT-TM,MT
3,1553551_s_at,ENSG00000198763,MT-ND2,MT
4,1553538_s_at,ENSG00000198804,MT-CO1,MT


In [99]:
# summarise the mapping table against the probes present in the expression matrix
print(f"Number of unique probes in affymetrix dataset:                {len(w_norm_X.columns.unique())}")
print(f"Number of lines of mappings in the mapping dataset:           {len(gcMap['ensembl_gene_id'])}")
print(f"Number of unique affymetrix probes in the dataset             {len(gcMap['Probe_Id'].unique())}")
print(f"Number of unique ensemble gene_Ids mapped to affymetrix probe:{len(gcMap['ensembl_gene_id'].unique())}")
print(f"Number of unique gene_names mapped to affymetrix probe:       {len(gcMap['external_gene_name'].unique())}")

Number of unique probes in affymetrix dataset:                54675
Number of lines of mappings in the mapping dataset:           52755
Number of unique affymetrix probes in the dataset             42839
Number of unique ensemble gene_Ids mapped to affymetrix probe:28215
Number of unique gene_names mapped to affymetrix probe:       22084


In [87]:
## drop every mapping whose chromosome name contains 'CHR_', i.e. Ensembl gene IDs that are
## not on the main chromosomes (haplotypic regions)
## this doesn't reduce the number of genes in the annotation, but reduces duplicate Ensembl IDs mapped to the same gene
## source: https://www.researchgate.net/post/How-to-deal-with-multiple-ensemble-IDs-mapping-to-one-gene-symbol-in-a-RNA-Seq-dataset

# astype(str) + na=False keep the mask strictly boolean when chromosome names are missing
# or were parsed as numbers (otherwise `~` fails on the resulting NaN entries);
# regex=False because 'CHR_' is a literal substring, not a pattern
on_haplotype_region = (
    gcMap['chromosome_name'].astype(str).str.contains('CHR_', regex=False, na=False)
)
filtered_gcMap = gcMap[~on_haplotype_region]


print('Number of mappings in filtered list:                     '+str(len(filtered_gcMap)))
print('Number of unique affymetrix probes mapped to ensembl id: '+str(len(filtered_gcMap['Probe_Id'].unique())))
print('Number of unique ensemble gene_Ids mapped to affymetrix: '+str(len(filtered_gcMap['ensembl_gene_id'].unique())))
print('Number of unique gene_names:                             '+str(len(filtered_gcMap['external_gene_name'].unique())))

# list of the unique probe IDs that survive the filter, used to subset the expression matrix
affy_index = filtered_gcMap['Probe_Id'].unique().tolist()
len(affy_index)


Number of mappings in filtered list:                     48006
Number of unique affymetrix Probe_Ids:                   42806
Number of unique ensemble gene_Ids mapped to affymetrix: 25358
Number of unique gene_names:                             22066


In [139]:
# keep only the probes that have an Ensembl mapping (affy_index) and put the probes on
# the rows so that a gene ID can be attached to each of them
w_filtered_X = w_norm_X.loc[:, affy_index].transpose()
print(f'Shape of the filtered dataset:                     {w_filtered_X.shape}')

# probe -> Ensembl gene lookup, as a Series indexed by probe ID
# going through a dict keeps a single gene per probe: when a probe appears on several
# mapping rows, the last row wins
probe_to_gene = filtered_gcMap.set_index('Probe_Id')['ensembl_gene_id'].to_dict()
mapping_dict_affy_en = pd.Series(probe_to_gene)

# attach the gene ID to every probe row (aligned on the probe index), collapse probes that
# share a gene by taking the per-sample maximum, then go back to samples on the rows
# note: the values are already z-scored per probe, so the maximum is taken over z-scores
w_filtered_X = (
    w_filtered_X
    .assign(ensembl_ID=mapping_dict_affy_en)
    .groupby('ensembl_ID')
    .max()
    .sort_values('ensembl_ID')
    .transpose()
)

print(f'Shape of the filtered and deduplicated dataset:    {w_filtered_X.shape}')


Shape of the filtered dataset:                     (42806, 246)
Shape of the filtered and deduplicated dataset:    (246, 22966)


### Save to csv

In [141]:
# saving pre-processed datasets

# make sure the output folder exists first: the writers below do not create missing directories
os.makedirs('dataset_wong/ml_inputs', exist_ok=True)

w_filtered_X.to_csv('dataset_wong/ml_inputs/w_norm_X.csv')
labels_data.to_csv('dataset_wong/ml_inputs/w_annotation.csv')
# NOTE: no fmt is passed, so savetxt uses its default float format for the 0/1 labels
np.savetxt('dataset_wong/ml_inputs/w_y.csv', labels, delimiter=',')

### SIRS vs sepsis

In [143]:
# load the SIRS expression matrix (first CSV column kept as a regular column)
sirs_counts_csv = 'dataset_wong/output/normalised_exp_sirs.csv'
normalisedCounts_sirs = pd.read_csv(sirs_counts_csv)

# matching annotation, indexed by the first column of the file
sirs_targets_csv = 'dataset_wong/output/targets_sirs.csv'
labels_data_sirs = pd.read_csv(sirs_targets_csv, index_col=0)


In [145]:
# preprocess X: z-score, then keep the mapped probes with probes on the rows
w_norm_X_sirs = pre_process_X(normalisedCounts_sirs)
w_filtered_X_sirs = w_norm_X_sirs.loc[:, affy_index].transpose()
print(f'Shape of the filtered dataset:                     {w_filtered_X_sirs.shape}')

# attach the Ensembl gene ID to every probe row (aligned on the probe index), keep the
# per-sample maximum over probes sharing a gene, and return to samples on the rows
w_filtered_X_sirs = (
    w_filtered_X_sirs
    .assign(ensembl_ID=mapping_dict_affy_en)
    .groupby('ensembl_ID')
    .max()
    .sort_values('ensembl_ID')
    .transpose()
)
print(f'Shape of the filtered and deduplicated dataset:    {w_filtered_X_sirs.shape}')


Shape of the filtered dataset:                     (42806, 229)
Shape of the filtered and deduplicated dataset:    (229, 22966)


In [146]:
# process y: 0 = 'SIRS', 1 = any other condition
# (vectorised; the order follows the rows of labels_data_sirs)
labels_sirs = np.where(labels_data_sirs['condition'] == 'SIRS', 0, 1)

# keep the encoded class next to the annotation it was derived from
labels_data_sirs['class'] = labels_sirs

In [147]:
# save to csv
# make sure the output folder exists first: the writers below do not create missing directories
os.makedirs('dataset_wong/ml_inputs', exist_ok=True)

w_filtered_X_sirs.to_csv('dataset_wong/ml_inputs/w_norm_sirs_X.csv')
# NOTE: no fmt is passed, so savetxt uses its default float format for the 0/1 labels
np.savetxt('dataset_wong/ml_inputs/w_sirs_y.csv', labels_sirs, delimiter=',')