In [None]:
import random
import numpy as np
import scipy as sp
import collections
import pandas as pd
from pylab import *
import seaborn as sns
import os
from matplotlib import pyplot as plt
from IPython.display import clear_output
import statsmodels.stats.multitest as multi

sns.set_style('white')
pd.options.display.float_format = '{:,.7f}'.format
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1500)

# Seed the global random number generators for reproducibility, so that any
# step drawing from them gives the same result on every fresh run.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Make sure the working directory is the `data` folder the input files are
# read from. os.path.basename works with either path separator, and the
# relative path 'data' avoids the invalid '\d' escape of the Windows-only
# '.\data' spelling.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('data')
print(os.getcwd())

In [None]:
def load_metabolomics(filename):
    """Read a metabolite-by-sample table and return it as sample-by-metabolite.

    The file is parsed with ``pd.read_csv`` defaults (comma-separated). Its
    first column holds the metabolite names; every other column is one sample.

    Returns a DataFrame with one row per sample (index named ``sample_id``)
    and one column per metabolite.
    """
    raw = pd.read_csv(filename)
    first_column = raw.columns.values[0]
    raw = raw.rename(columns={first_column: 'metabolite_name'})

    # After transposing, the first row carries the metabolite names: promote
    # it to the header and keep the remaining rows as the samples.
    flipped = raw.transpose()
    flipped.columns = flipped.iloc[0, :]
    samples = flipped.iloc[1:, :]
    samples.index.name = 'sample_id'

    return samples

In [None]:
def impute(df, thresh=0.1):
    """Drop sparse columns, then fill remaining gaps with the column minimum.

    A column is removed when its count of missing values exceeds
    ``thresh * number_of_rows``. Missing values in the surviving columns are
    replaced by that column's minimum; any column that still contains a
    missing value afterwards is dropped as well.
    """
    max_missing = len(df.index) * thresh
    missing_per_column = df.isnull().sum()
    too_sparse = df.columns.values[missing_per_column > max_missing]
    kept = df.drop(columns=too_sparse)

    def _fill_with_minimum(column):
        # minimum-value imputation, one column at a time
        return column.fillna(column.min())

    filled = kept.apply(_fill_with_minimum, axis=0)
    return filled.dropna(axis=1)

In [None]:
def load_patientmetadata(filename, m_df):
    """Read patient metadata and attach it to the measurement table.

    Column names of the metadata file are lower-cased and ``sample_id`` is
    used as the join key. The first remaining column and the ``id`` column
    are discarded before joining. ``m_df`` must carry a ``sample_id`` column.

    Returns ``(joined, metadata)``: the measurements left-joined with the
    metadata, and the cleaned metadata itself, both with ``sample_id``
    restored as an ordinary column.
    """
    meta = pd.read_csv(filename)
    meta.columns = meta.columns.str.lower()
    meta = meta.set_index('sample_id')

    # drop redundant columns
    redundant = [meta.columns.values[0], 'id']
    meta = meta.drop(columns=redundant)

    # join with full dataset
    joined = m_df.set_index('sample_id').join(meta)

    return joined.reset_index(), meta.reset_index()

In [None]:
# 
def standardize_data(f_vals):
    """Rescale the feature matrix with scikit-learn's ``QuantileTransformer``.

    The transformer is created with its default settings, fitted on
    ``f_vals`` and applied to it; the transformed values are returned.
    (``StandardScaler`` was the earlier choice, per the original comment.)
    """
    from sklearn.preprocessing import QuantileTransformer

    return QuantileTransformer().fit_transform(f_vals)

In [None]:
def make_df(f_vals, features, l_vals, labels):
    """Build one DataFrame with the label columns first, then the features.

    ``l_vals``/``labels`` and ``f_vals``/``features`` are each the data and
    column names of one half; the halves are placed side by side.
    """
    label_frame = pd.DataFrame(data=l_vals, columns=labels)
    feature_frame = pd.DataFrame(data=f_vals, columns=features)
    return pd.concat([label_frame, feature_frame], axis=1)

In [None]:
def perform_PCA(data, l_vals, labels, save=False, ncomp=10, hue='group'):
    """Project ``data`` onto principal components and plot the result.

    Parameters
    ----------
    data : array-like
        Feature matrix passed to ``PCA.fit_transform``.
    l_vals, labels :
        Label values and their column names; they are placed in front of the
        component columns of the returned frame.
    save : bool
        When true, the cumulative-variance figure is written to
        'variance-exp.png'.
    ncomp :
        Forwarded to ``PCA(n_components=...)``.
    hue : str or None
        Column of the returned frame used to colour the PC 1 / PC 2 scatter.
        Defaults to 'group', the column that was previously hard-coded.

    Returns
    -------
    DataFrame with the label columns followed by 'PC 1', 'PC 2', ...
    """
    from sklearn import decomposition

    pcaAbs = decomposition.PCA(n_components=ncomp)
    data_PCA = pcaAbs.fit_transform(data)

    # Name the columns after the number of components actually returned, so
    # a non-integer `ncomp` (e.g. a variance fraction) also works.
    pc_cols = ['PC ' + str(i) for i in range(1, data_PCA.shape[1] + 1)]
    df_PCA = make_df(data_PCA, pc_cols, l_vals, labels)

    # Plot explained variance by number of components
    var_exp = pcaAbs.explained_variance_ratio_
    fig_ve, ax_ve = plt.subplots(1, 1)
    sns.lineplot(x=(np.arange(len(var_exp)) + 1), y=np.cumsum(var_exp), ax=ax_ve)
    ax_ve.set_xlabel('PCA component number')
    ax_ve.set_ylabel('Cumulative variance ratio')
    if save:
        fig_ve.savefig('variance-exp.png', bbox_inches='tight', pad_inches=0.5)

    # Scatter of the first two components, coloured by the requested column
    fig_pca, ax_pca = plt.subplots(1, 1)
    sns.scatterplot(x='PC 1', y='PC 2', data=df_PCA, hue=hue, ax=ax_pca)

    return df_PCA

In [None]:
#calculate p-value on continuous data, selecting appropriate test based on normality & equal variance tests
def significanceTest(ctrl, case, alpha_normal=0.05):
    """Return the p-value for a difference between two samples.

    Test selection:
      * Both samples are checked with ``scipy.stats.normaltest``, whose null
        hypothesis is that the sample is normal. A p-value at or above
        ``alpha_normal`` therefore means normality is NOT rejected.
      * If neither sample rejects normality, a t-test is used. Bartlett's
        test (null hypothesis: equal variances) decides between Student's
        t-test (p >= ``alpha_normal``) and Welch's t-test (p < ``alpha_normal``).
      * Otherwise, or when normality cannot be assessed (for example too few
        observations), the Wilcoxon rank-sum test is used.

    Parameters
    ----------
    ctrl, case : array-like
        Measurements of the two groups.
    alpha_normal : float
        Significance level used for the normality and equal-variance checks.

    Returns
    -------
    The p-value of the selected two-sample test.
    """
    try:
        _, p_normal_ctrl = sp.stats.normaltest(ctrl, nan_policy='omit')
        _, p_normal_case = sp.stats.normaltest(case, nan_policy='omit')
    except Exception:
        # Normality could not be assessed: fall back to the rank-based test.
        both_normal = False
    else:
        # A NaN p-value compares false here, which also selects the rank test.
        both_normal = bool(p_normal_ctrl >= alpha_normal and p_normal_case >= alpha_normal)

    if both_normal:
        _, p_var = sp.stats.bartlett(ctrl, case)
        # Variances are treated as equal only when Bartlett does not reject.
        equal_var = bool(p_var >= alpha_normal)
        _, p_diff = sp.stats.ttest_ind(ctrl, case, nan_policy='omit', equal_var=equal_var)
    else:
        _, p_diff = sp.stats.ranksums(ctrl, case)

    return p_diff

In [None]:
def significantMetabolites(ctrl, case, features, labels, alpha_normal=0.05, alpha_diff=0.05):
    """Test every metabolite for a control-vs-case difference.

    For each tested name the matching column of ``ctrl`` and ``case`` is
    passed to ``significanceTest``; the resulting p-values are adjusted with
    the Benjamini-Hochberg procedure (``fdr_bh``). The table is displayed
    sorted by p-value and returned unsorted with the columns
    'metabolite', 'p' and 'q'.

    NOTE(review): the last entry of ``features`` is never tested
    (``features[:-1]``) - presumably it was once a label column; confirm
    against the callers. ``labels`` is accepted but not used.
    """
    metabolites = features[:-1]

    pvals = [
        significanceTest(ctrl[name].values, case[name].values, alpha_normal=alpha_normal)
        for name in metabolites
    ]

    # element 1 of the multipletests result holds the adjusted p-values
    adjusted = multi.multipletests(pvals, alpha=alpha_diff, method='fdr_bh')[1]

    significant = pd.DataFrame({'metabolite' : metabolites, 'p' : pvals, 'q' : adjusted})
    display(significant.sort_values(by='p'))

    return significant

In [None]:
# Measurement tables, one per sample preparation.
all_files = [
    'measurements_plasma_full.csv',
    'measurements_serum_full.csv',
    'measurements_plasmarpmi_full.csv',
]

# Load, impute and rescale every table separately, keeping the sample ids
# as the row index of each rescaled frame.
all_df = []
for file in all_files:
    temp_df = impute(load_metabolomics(file))
    index = temp_df.index
    temp_df = pd.DataFrame(data=standardize_data(temp_df), columns=temp_df.columns, index=index)
    all_df.append(temp_df)

# Stack the tables, attach the patient metadata and save the combined table.
full_df = pd.concat(all_df, sort=False).reset_index()
full_df, patient_df = load_patientmetadata('full_unblinded_metadata_with_smoking_tst.csv', full_df)
full_df.to_csv('standardized_TB_metabolomes.csv')

# Metadata columns are the labels; every other column is a feature.
labels = patient_df.columns.to_list()
features = [column for column in full_df.columns.to_list() if column not in labels]
f_vals = full_df[features].values
l_vals = full_df[labels].values

# displaying shape of the combined table
print('The shape of our data matrix is: ', full_df.shape)

In [None]:
# Preview the first rows of the combined measurement + metadata table.
full_df.head()

In [None]:
## HOW WELL DO SAMPLE PREPS CORRELATE?
# Keep only donor/timepoint combinations measured with more than one sample
# type (in practice these turn out to be pairs).
types_per_visit = full_df.groupby(['donor_id', 'timepoint'])['sample_type'].transform('nunique')
dup_df = full_df[types_per_visit > 1]
# One group per donor and timepoint; each gets its own correlation coefficient
dup_groups = dup_df.groupby(['donor_id', 'timepoint'])

corr, sig = [], []
donors, times = [], []
sample_types = []
for (donor, time), group in dup_groups:
    sample_types.append(group['sample_type'].values)
    donors.append(donor)
    times.append(time)

    # features as rows, samples as columns; features missing in any sample are dropped
    shared_features = group[features].dropna(axis=1).T
    paired_values = shared_features.values
    corr_temp, sig_temp = sp.stats.pearsonr(paired_values[:, 0], paired_values[:, 1])
    corr.append(corr_temp)
    sig.append(sig_temp)

# Benjamini-Hochberg adjusted p-values across all donor/timepoint pairs
q_values = multi.multipletests(sig, method='fdr_bh')[1]

corr_df = pd.DataFrame({
    'donor' : donors,
    'timepoint' : times,
    'sample_types' : sample_types,
    'Pearson correlation' : corr,
    'p value' : sig,
    'q value' : q_values,
})

In [None]:
#print(group['sample_type'].values)
# Show the per-donor, per-timepoint correlation table.
display(corr_df)

In [None]:
#WHAT METABOLITES DIFFER SIGNIFICANTLY?
#--what's the best way to do this analysis?
## analyze a bunch of different resolutions and see what the overlap is
# Results are kept per (sample_type, time_to_tb) group instead of being
# overwritten on every iteration.
significant_by_group = {}
for (sample_type, timepoint), group in full_df.groupby(['sample_type', 'time_to_tb']):
    # na=False: rows without a group label belong to neither subset
    is_ctrl = group['group'].str.contains('control', na=False)
    is_case = group['group'].str.contains('case', na=False)
    ctrl = group[is_ctrl][features].dropna(axis=1)
    case = group[is_case][features].dropna(axis=1)

    # The two subsets drop incomplete columns independently, so only the
    # metabolites present in both can be compared.
    shared = [column for column in ctrl.columns if column in case.columns]
    ctrl = ctrl[shared]
    case = case[shared]

    significant = significantMetabolites(ctrl, case, shared, labels)
    significant_by_group[(sample_type, timepoint)] = significant

In [None]:
#WHAT METABOLITES CORRELATE WITH RISK? 
#analyzing separately by sample type (as broad as possible, color by location)
#spearman correlation with progressor status (y/n)
#pearson correlation with time to tb (metabolite-by-metabolite) 
#for a select few, show ones that go up, down, etc. relative to controls

In [None]:
#WHAT'S THE SIGNAL TO NOISE RATIO?
#Within same individual, how does the metabolite change over time? (Means of std. dev, Pearson correlation)
#Identifying highly variable metabolites
#Pearson correlation between individuals in the same "case-control" match

In [None]:
#GSEA but for metabolites???? (Do we need this?)
#