# leo adults dataset: pre-processing for ml input

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from collections import OrderedDict
from sklearn.preprocessing import StandardScaler
import warnings
from joblib import dump, load
from pickle import dump, load
warnings.filterwarnings(action="ignore", message="^internal gelsd")
#np.random.seed(1)


In [2]:
os.getcwd()

'/Users/Ed/Documents/GitHub/sepsis_ml_omics_msc'

### Data Import

In [4]:
# import and check data

# alternative X matrices
normalisedCounts = pd.read_csv('dataset_leo/output/normalised_exp.csv') #, index_col=0)

# annotation and class labels
labels_data = pd.read_csv('dataset_leo/output/targets.csv', index_col=0)

# check
print('Normalised counts data shape:  '+str(normalisedCounts.shape))
normalisedCounts.head()

Normalised counts data shape:  (22011, 106)


Unnamed: 0.1,Unnamed: 0,GSM1914807,GSM1914808,GSM1914809,GSM1914810,GSM1914811,GSM1914812,GSM1914813,GSM1914814,GSM1914815,...,GSM1914902,GSM1914903,GSM1914904,GSM1914905,GSM1914906,GSM1914907,GSM1914908,GSM1914909,GSM1914910,GSM1914911
0,2315554,7.988658,8.437184,8.27716,8.279856,8.124447,7.888959,7.743694,7.94959,8.246015,...,8.175743,8.478647,8.522634,8.267206,8.245167,8.314857,8.206667,8.073364,8.160501,8.149364
1,2315633,7.604336,7.877958,7.89866,7.657952,7.865661,7.341377,7.465072,7.701582,7.645253,...,7.617291,7.846885,7.734627,7.72367,7.885184,7.709711,7.809246,7.773138,7.562412,7.7896
2,2315674,7.75603,8.334455,8.129139,7.949146,8.14026,7.552136,7.662208,7.943859,7.948027,...,7.94162,8.223983,8.157951,8.016826,7.979801,8.059808,8.09245,7.90982,8.057985,8.111045
3,2315739,7.742156,8.070674,7.978276,7.688884,8.048886,7.553671,7.618928,7.896778,7.711746,...,7.75297,7.872649,7.963431,7.957844,7.752404,7.978197,8.019349,7.808959,7.87947,7.684881
4,2315894,9.535384,10.219614,10.075132,9.614065,10.015771,9.765352,9.599523,10.154484,9.988529,...,10.007208,10.178184,10.283674,10.045307,9.725878,9.94512,10.107392,10.064496,9.860088,10.097237


### Pre-process X matrix

In [6]:
# pre-processing function for all the perth datasets types

def pre_process_X(raw_data):
    
    ''' dataframe -> datafram
    Input dataframe of normalised rnaseq data, transformed and z-scored
    '''
    # rename first column to ensembleGeneID and drop any duplicate rows based on this column
    df = raw_data.rename({"Unnamed: 0": "affy_probe_ID"}, axis=1).drop_duplicates(subset=["affy_probe_ID"], keep='first')
    
    # transpose the dataframe so genes are the columns
    df = df.transpose()
    
    # set the column names to first row, i.e ensembleGeneID, in the transposed frame
    df = df.rename(columns=df.iloc[0]).drop(df.index[0])
    
    # drop columns containing any NaN values
    nan_columns = df.columns[df.isna().any()].tolist()
    df = df.drop(nan_columns, axis=1)
    
    # z-score the data
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    df = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)
    
    return df
    

In [7]:
# process each matrix and filter based on index

l_norm_X = pre_process_X(normalisedCounts)
l_norm_X.head()


Unnamed: 0,2315554.0,2315633.0,2315674.0,2315739.0,2315894.0,2315918.0,2315951.0,2316218.0,2316245.0,2316379.0,...,4133728.0,4134740.0,7385511.0,7385515.0,7385547.0,7385552.0,7385611.0,7385641.0,7385683.0,7385696.0
GSM1914807,-0.801101,-0.477595,-1.261832,-0.814735,-1.697615,0.766582,-1.37516,-0.517497,-1.036075,-0.917664,...,-1.193302,0.321372,-0.090157,0.617512,-0.645097,-0.851523,-1.100261,-0.583777,0.334289,-1.151476
GSM1914808,1.225424,1.175072,1.800243,1.299245,0.604285,1.402227,1.150008,1.162674,-1.181174,-0.849105,...,0.494821,-0.699605,1.423479,0.468561,0.082516,-0.001238,-1.229338,1.906317,-1.284613,-1.936364
GSM1914809,0.502405,1.300111,0.713339,0.70467,0.118215,1.387755,0.409928,1.446316,-0.059921,-0.200042,...,0.734641,0.214251,-0.02784,0.457613,-0.463707,-0.587461,-0.553096,1.469179,0.052834,-1.20024
GSM1914810,0.514587,-0.153755,-0.239512,-1.157534,-1.432917,1.173428,-1.339684,0.622747,-1.637608,-1.982493,...,0.344792,-0.812398,0.184977,0.633738,1.122524,-0.478908,-1.820808,0.285067,3.199836,-0.330872
GSM1914811,-0.18758,1.100799,0.77221,1.159036,-0.081487,-1.336768,1.360797,-0.846084,-0.19367,0.307112,...,1.175015,0.732782,0.310884,-1.773219,-0.204938,1.053665,1.811451,0.039464,-1.424448,0.255503


In [9]:
l_norm_X.shape

(105, 22011)

In [10]:
# get check output dimensions
print('Shape of dataset:   '+str(l_norm_X.shape))
print('Number of Missing values in dataset:   '+str(l_norm_X.isnull().sum().sum()))
print('The number of features with all zeros is:   '+str(sum((l_norm_X == 0).all(axis=0))))
print('Check data and annotation indices match:   '+str(sum(l_norm_X.index != labels_data.index)))

Shape of dataset:   (105, 22011)
Number of Missing values in dataset:   0
The number of features with all zeros is:   0
Check data and annotation indices match:   0


### Class Labels

In [13]:
# filter labels data to cross ref
labels_data

# create labels vector
labels_list = list(zip(labels_data.loc[:,'sampleID'], labels_data.loc[:,'condition']))
labels = []
for item in labels_list:
    if item[1] == 'Sepsis':
        labels.append(1)
    else:
        labels.append(0)
labels = np.asarray(labels)
labels_data['class'] = labels

### Ensembl gene codes filter: affymetrix -> ensemble

In [30]:
## import raw gene code mappings from biomaRt
gcMap = pd.read_csv('dataset_leo/affy_huex_1_0_st_v2_ensembl_mapping.csv').drop(['description', 'gene_biotype'], axis=1)
#gcMap = gcMap.rename(columns={'affy_huex_1_0_st_v2': 'Probe_Id', 'ensembl_gene_id':'ensemblGeneID', 'external_gene_name':'geneName'})
#gcMap.head()

In [31]:
gcMap

Unnamed: 0,affy_huex_1_0_st_v2,ensembl_gene_id,external_gene_name,chromosome_name
0,2467201,ENSG00000282052,RPS7P11,CHR_HSCHR17_2_CTG5
1,2467205,ENSG00000282052,RPS7P11,CHR_HSCHR17_2_CTG5
2,2467189,ENSG00000282052,RPS7P11,CHR_HSCHR17_2_CTG5
3,2467191,ENSG00000282052,RPS7P11,CHR_HSCHR17_2_CTG5
4,2523630,ENSG00000232733,RPL12P1,CHR_HSCHR6_MHC_SSTO_CTG1
...,...,...,...,...
3313,3807490,ENSG00000244716,,1
3314,3807500,ENSG00000244716,,1
3315,3807499,ENSG00000236768,,3
3316,3807488,ENSG00000236768,,3


In [21]:
# examine the gene code mappings file
print('Number of unique probes in affymetrix dataset:                '+str(len(l_norm_X.columns.unique())))
print('Number of lines of mappings in the mapping dataset:           '+str(len(gcMap['ensemblGeneID'])))
print('Number of unique affymetrix probes in the dataset             '+str(len(gcMap['Probe_Id'].unique())))
print('Number of unique ensemble gene_Ids mapped to affymetrix probe:'+str(len(gcMap['ensemblGeneID'].unique())))
print('Number of unique gene_names mapped to affymetrix probe:       '+str(len(gcMap['geneName'].unique())))

Number of unique probes in affymetrix dataset:                22011
Number of lines of mappings in the mapping dataset:           3318
Number of unique affymetrix probes in the dataset             1853
Number of unique ensemble gene_Ids mapped to affymetrix probe:696
Number of unique gene_names mapped to affymetrix probe:       524


In [24]:
## filter out all probes that map to ensembel gene IDs not on the main chromosome (filter out Haplotyptic regions)
## this doesn't reduce the number of genes in the annotation, but reduces duplicate ensembl IDs mapped to the same gene
## source: https://www.researchgate.net/post/How-to-deal-with-multiple-ensemble-IDs-mapping-to-one-gene-symbol-in-a-RNA-Seq-dataset

filtered_gcMap = gcMap[~gcMap['chromosome_name'].str.contains('CHR_')]


print('Number of mappings in filtered list:                     '+str(len(filtered_gcMap)))
print('Number of unique affymetrix probes mapped to ensembl id: '+str(len(filtered_gcMap['Probe_Id'].unique())))
print('Number of unique ensemble gene_Ids mapped to affymetrix: '+str(len(filtered_gcMap['ensemblGeneID'].unique())))
print('Number of unique gene_names:                             '+str(len(filtered_gcMap['geneName'].unique())))

# get index of unique affymetrix probes to filter dataset by
affy_index = filtered_gcMap['Probe_Id'].unique().tolist()
len(affy_index)


Number of mappings in filtered list:                     2959
Number of unique affymetrix probes mapped to ensembl id: 1853
Number of unique ensemble gene_Ids mapped to affymetrix: 651
Number of unique gene_names:                             524


1853

In [25]:
# the main dataset needs to be filtered down from the 55k to 42k probes, excluding the 11k or so probes
# that are not mapped to an ensemble ID

l_filtered_X = l_norm_X.loc[:,affy_index].transpose()
print('Shape of the filtered dataset:                     '+str(l_filtered_X.shape))

# create dictionary of mapping between gene names to rename dataframes for analysis and visualisation
# this sets the index of the dataframe to the probe_id, then pulls out the mappings - leaves out probes with no mapping
mapping_dict_affy_en = pd.Series(filtered_gcMap.set_index('Probe_Id')['ensemblGeneID'].to_dict())

# map the new column to the dataframe
l_filtered_X['ensemblGeneID'] = mapping_dict_affy_en
first_column = l_filtered_X.pop('ensemblGeneID')
  
## insert column using insert(position,column_name, first_column) function
## take maximum expression values for duplicate ensembl_ID
l_filtered_X.insert(0, 'ensemblGeneID', first_column)
l_filtered_X = l_filtered_X.groupby('ensemblGeneID').max().sort_values('ensemblGeneID')
l_filtered_X = l_filtered_X.transpose()

print('Shape of the filtered and deduplicated dataset:    '+str(l_filtered_X.shape))


Shape of the filtered dataset:                     (1853, 105)
Shape of the filtered and deduplicated dataset:    (105, 219)


### Save to csv

In [29]:
# saving pre-processed datasets

l_filtered_X.to_csv('dataset_leo/ml_inputs/l_norm_X.csv')
labels_data.to_csv('dataset_leo/ml_inputs/l_annotation.csv')
np.savetxt('dataset_leo/ml_inputs/l_y.csv', labels, delimiter=',')