In [None]:
import random
import numpy as np
import scipy as sp
import collections
import pandas as pd
from pylab import *
import seaborn as sns
import os
from matplotlib import pyplot as plt
from IPython.display import clear_output

# Notebook-wide display settings: plain seaborn style, two-decimal floats with
# thousands separators, and generous row/column limits for wide tables.
sns.set_style('white')
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1500)

# Every file name used later in the notebook is relative to the `data` sub-directory.
# The path is built with os.path.join so it works on any OS and avoids the invalid
# '\d' escape sequence of the former '.\data' literal.
# NOTE: this changes the process working directory, so re-running the cell without
# restarting the kernel will look for `data` inside `data`.
os.chdir(os.path.join(os.curdir, 'data'))

In [None]:
def load_metabolomics(filename):
    """Read a metabolomics CSV and return it with one row per sample.

    The first column of the file is treated as the metabolite name and every
    other column header as a sample id. The table is transposed so that rows
    are samples and columns are metabolites.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Index named ``sample_id``; columns labelled with the metabolite names.
    """
    by_metabolite = pd.read_csv(filename)
    name_col = by_metabolite.columns.values[0]
    by_sample = by_metabolite.rename(columns={name_col: 'metabolite_name'}).transpose()

    # After the transpose the first row carries the metabolite names: promote it
    # to the header, then keep only the remaining (sample) rows.
    by_sample.columns = by_sample.iloc[0, :]
    by_sample = by_sample.iloc[1:, :]
    by_sample.index.name = 'sample_id'

    return by_sample

In [None]:
def impute(df, thresh=0.1):
    """Drop sparse columns, then fill the remaining gaps with each column's minimum.

    Parameters
    ----------
    df : pandas.DataFrame
    thresh : float
        Largest tolerated fraction of missing values per column; columns whose
        missing count exceeds ``len(df.index) * thresh`` are removed.

    Returns
    -------
    pandas.DataFrame
        New frame without the sparse columns, with missing values replaced by the
        column minimum. Columns that still contain missing values afterwards
        (e.g. columns with no values at all) are dropped as well.
    """
    max_missing = len(df.index) * thresh
    missing_per_column = df.isnull().sum()
    sparse_columns = df.columns.values[missing_per_column > max_missing]
    dense = df.drop(columns=sparse_columns)

    def _fill_with_min(column):
        # the smallest observed value stands in for the missing ones
        return column.fillna(column.min())

    filled = dense.apply(_fill_with_min, axis=0)
    return filled.dropna(axis=1)

In [None]:
def load_patientmetadata(filename, m_df):
    """Load the patient metadata CSV and attach it to the measurement table.

    Column names of the metadata file are lower-cased, ``sample_id`` becomes the
    index, and two columns are discarded: the first remaining column and ``id``.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pandas.read_csv``.
    m_df : pandas.DataFrame
        Measurement table containing a ``sample_id`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``m_df`` joined with the metadata on ``sample_id`` (rows of ``m_df`` are
        kept even without a metadata match), and the cleaned metadata table;
        both have ``sample_id`` as a regular column again.
    """
    meta = pd.read_csv(filename).rename(columns=str.lower).set_index('sample_id')

    # the leading column and `id` are redundant once `sample_id` is the key
    redundant = [meta.columns.values[0], 'id']
    meta = meta.drop(columns=redundant)

    merged = m_df.set_index('sample_id').join(meta)

    return merged.reset_index(), meta.reset_index()

In [None]:
# Rescale every feature (column) to zero mean and unit variance.
def standardize_data(f_vals):
    """Return ``f_vals`` standardized column-wise with scikit-learn's StandardScaler.

    The scaler is fitted on the data passed in and applied to that same data;
    the transformed values are returned and the fitted scaler is discarded.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    return scaler.fit_transform(f_vals)

In [None]:
def make_df(f_vals, features, l_vals, labels):
    """Build one DataFrame with the label columns first, followed by the feature columns.

    Parameters
    ----------
    f_vals : 2-D values for the feature columns.
    features : column names for ``f_vals``.
    l_vals : 2-D values for the label columns.
    labels : column names for ``l_vals``.

    Returns
    -------
    pandas.DataFrame
        The label frame and the feature frame concatenated side by side.
    """
    label_frame = pd.DataFrame(data=l_vals, columns=labels)
    feature_frame = pd.DataFrame(data=f_vals, columns=features)
    return pd.concat([label_frame, feature_frame], axis=1)

In [None]:
def perform_PCA(data, l_vals, labels, save=False, ncomp=10):
    """Run PCA on ``data``, plot the result, and return the scores with their labels.

    Two figures are created: the cumulative explained-variance ratio per
    component, and a scatter plot of the first two components coloured by the
    ``group`` label column.

    Parameters
    ----------
    data : 2-D array-like of feature values passed to ``PCA.fit_transform``.
    l_vals : 2-D label values, one row per row of ``data``.
    labels : column names for ``l_vals``; must include ``'group'`` because the
        scatter plot uses it as hue.
    save : bool
        When true, the explained-variance figure is written to
        ``variance-exp.png`` in the current working directory.
    ncomp : value forwarded to ``PCA(n_components=...)``.

    Returns
    -------
    pandas.DataFrame
        Label columns followed by ``'PC 1'``, ``'PC 2'``, ... score columns.
    """
    from sklearn import decomposition

    pca = decomposition.PCA(n_components=ncomp)
    scores = pca.fit_transform(data)

    # Name the columns from the fitted output rather than from `ncomp`, so the
    # function also works when `ncomp` is not a plain integer count (None, a
    # variance fraction, 'mle'). For an integer `ncomp` the names are unchanged.
    pc_cols = ['PC ' + str(i) for i in range(1, scores.shape[1] + 1)]
    df_PCA = make_df(scores, pc_cols, l_vals, labels)

    # Cumulative explained variance by number of components
    var_exp = pca.explained_variance_ratio_
    fig_ve, ax_ve = plt.subplots(1, 1)
    sns.lineplot(x=(np.arange(len(var_exp)) + 1), y=np.cumsum(var_exp), ax=ax_ve)
    ax_ve.set_xlabel('PCA component number')
    ax_ve.set_ylabel('Cumulative variance ratio')
    if save:
        fig_ve.savefig('variance-exp.png', bbox_inches='tight', pad_inches=0.5)

    # First two components, coloured by the `group` label
    fig_pca, ax_pca = plt.subplots(1, 1)
    sns.scatterplot(x='PC 1', y='PC 2', data=df_PCA, hue='group', ax=ax_pca)

    return df_PCA

In [None]:
def significanceTest(ctrl, case, alpha_normal=None):
    """Return the p-value for a difference between two 1-D samples.

    Test selection:
      * both samples look normal (D'Agostino-Pearson ``normaltest`` does NOT
        reject normality, i.e. p >= ``alpha_normal``) -> two-sample t-test.
        Bartlett's test picks the variant: equal variances are assumed only when
        Bartlett does not reject them (p >= ``alpha_normal``), otherwise Welch's
        t-test is used.
      * otherwise, or when either sample has fewer than 8 observations (too few
        for ``normaltest``) -> Wilcoxon rank-sum test.

    Missing values (NaN) are removed from both samples before any test is run.

    Parameters
    ----------
    ctrl, case : 1-D sequences of numbers (NaN allowed).
    alpha_normal : float, optional
        Significance level for the normality and equal-variance checks. When
        omitted, the module-level ``alpha_normal`` is used if it exists,
        otherwise 0.05.

    Returns
    -------
    float
        p-value of the selected two-sided test (may be NaN for degenerate input).
    """
    from scipy import stats

    if alpha_normal is None:
        # keep honouring the notebook-level setting that earlier versions relied on
        alpha_normal = globals().get('alpha_normal', 0.05)

    ctrl = np.asarray(ctrl, dtype=float)
    case = np.asarray(case, dtype=float)
    ctrl = ctrl[~np.isnan(ctrl)]
    case = case[~np.isnan(case)]

    min_normaltest_n = 8  # normaltest relies on skewtest, which needs n >= 8
    both_normal = False
    if len(ctrl) >= min_normaltest_n and len(case) >= min_normaltest_n:
        _, p_normal_ctrl = stats.normaltest(ctrl)
        _, p_normal_case = stats.normaltest(case)
        # the null hypothesis of normaltest is "the sample is normal": a SMALL
        # p-value rejects normality, so the parametric path needs p >= alpha
        both_normal = bool(p_normal_ctrl >= alpha_normal and p_normal_case >= alpha_normal)

    if both_normal:
        # the null hypothesis of Bartlett is "equal variances": a small p-value
        # means they differ, so equal_var may be True only when p >= alpha
        _, p_var = stats.bartlett(ctrl, case)
        _, p_diff = stats.ttest_ind(ctrl, case, equal_var=bool(p_var >= alpha_normal))
    else:
        _, p_diff = stats.ranksums(ctrl, case)

    return p_diff

In [None]:
# Measurement tables to combine, one CSV per sample type.
all_files = [
    'measurements_plasma_full.csv',
    'measurements_serum_full.csv',
    'measurements_plasmarpmi_full.csv',
]

# Each file is imputed and standardized on its own; the sample ids are kept as
# the index because the scaler returns a bare array.
all_df = []
for file in all_files:
    temp_df = impute(load_metabolomics(file))
    index = temp_df.index
    scaled = standardize_data(temp_df)
    temp_df = pd.DataFrame(data=scaled, columns=temp_df.columns, index=index)
    all_df.append(temp_df)

# Stack the per-file tables and attach the patient metadata.
full_df = pd.concat(all_df, sort=False).reset_index()
full_df, patient_df = load_patientmetadata('full_unblinded_metadata_with_smoking_tst.csv', full_df)

# Label columns are the metadata columns; every other column is a feature.
labels = list(patient_df.columns)
features = [col for col in full_df.columns.to_list() if col not in labels]
f_vals = full_df[features].values
l_vals = full_df[labels].values

# report the size of the combined table
print('The shape of our data matrix is: ', full_df.shape)

In [None]:
# Preview the first rows of the combined measurement + metadata table.
full_df.head()

In [None]:
# Rank-sum test of control vs. case for every metabolite, with a Bonferroni-style
# threshold (alpha divided by the number of metabolites).
# NOTE(review): `metabolomicsTB_df` is not defined anywhere in the visible notebook,
# so this cell raises NameError on a fresh Restart & Run All — confirm whether it
# is a leftover from an earlier version.
# Every column except the last is treated as a metabolite; presumably the last
# column is `group` — TODO confirm.
metabolites = list(metabolomicsTB_df.columns)[:-1]
numMetabas = metabolomicsTB_df.shape[1] - 1
alpha = 0.05

significant = []
for metab in metabolites:
    # boolean row masks for the two groups (they do not depend on `metab`)
    indicesControl = metabolomicsTB_df['group'] == 'control'
    indicesCase = metabolomicsTB_df['group'] == 'case'
    
    metabAverageControl = list(metabolomicsTB_df.loc[indicesControl, metab])
    metabAverageCase = list(metabolomicsTB_df.loc[indicesCase, metab])
    stat, p_val = sp.stats.ranksums(metabAverageControl, metabAverageCase)
    
    # keep the metabolite only if it passes the corrected threshold
    if p_val  < alpha / numMetabas:
        significant.append(metab)

In [None]:
# Show the current contents of `significant`.
significant

In [None]:
#Do Spearman correlation for healthy vs. disease
display(metabolomicsTB_df.sort_values(by=['timepoint', 'group']))
#df.corr(method='spearman')

In [None]:
# Per-group NaN-ignoring mean and standard deviation of every column of `ut_mz`.
# NOTE(review): `ut_mz` is not defined in the visible notebook — confirm where it
# comes from before relying on this cell.
ut_met_summary = ut_mz.groupby(['group']).agg([('Mean', np.nanmean), ('Std', np.nanstd)])
display(ut_met_summary)

In [None]:
# Restrict to rows whose sample type mentions "plasma". `na=False` treats rows with
# a missing sample type (no metadata match) as non-plasma instead of producing a
# NaN mask, which boolean indexing rejects.
# NOTE(review): a substring match also keeps any sample type that merely contains
# "plasma" — confirm that this is intended.
plasma_df = full_df[full_df['sample_type'].str.contains('plasma', na=False)]

# Metabolite columns are all columns that are not patient metadata. The metadata
# columns are already excluded here, so nothing is trimmed from the end of the
# list (the former `[:-1]` silently dropped the last metabolite).
metabolites = [x for x in plasma_df.columns.to_list() if x not in labels]

alpha_diff = 0.01    # threshold for calling a metabolite significantly different
alpha_normal = 0.05  # threshold read by significanceTest for its pre-tests

# Row masks for non-baseline controls and cases; they do not depend on the
# metabolite, so they are computed once outside the loop.
not_baseline = plasma_df['timepoint'] != 'BL'
indicesControl = ((plasma_df['group'] == 'control') & not_baseline).to_numpy()
indicesCase = ((plasma_df['group'] == 'case') & not_baseline).to_numpy()

significant = []
for metab in metabolites:
    metabAverageControl = list(plasma_df.loc[indicesControl, metab])
    metabAverageCase = list(plasma_df.loc[indicesCase, metab])

    p_diff = significanceTest(metabAverageControl, metabAverageCase)
    if p_diff < alpha_diff:
        significant.append(metab)

# present the hits as a one-column table
significant = pd.DataFrame({'metabolite': significant})
display(significant)

#significant.to_csv('significant_metabolite.csv')

In [None]:
# Render the plasma subset in full.
# NOTE(review): this dumps the whole frame (up to the display.max_rows limit set
# at the top of the notebook); consider `plasma_df.head()` if a preview is enough.
display(plasma_df)