In [None]:
import random
import numpy as np
import scipy as sp
import collections
import pandas as pd
from pylab import *
import seaborn as sns
import os
from matplotlib import pyplot as plt
from IPython.display import clear_output

# Plot styling and pandas display options used by the rest of the notebook.
sns.set_style('white')
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1500)

# Work from the data directory so that the bare file names used by the
# loading cells resolve. The path is built with os.path.join instead of the
# literal '.\data': '\d' is an invalid escape sequence, and the backslash is
# only a path separator on Windows.
# NOTE: this is a relative chdir, so re-running this cell in the same kernel
# tries to descend into the data directory a second time.
os.chdir(os.path.join('.', 'data'))

In [None]:
def load_metabolomics(filename):
    """Read a metabolite-by-sample CSV and return it sample-by-metabolite.

    The first column of the file is taken to hold the metabolite names; every
    other column becomes one row of the result.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per original data column (index named ``'sample_id'``) and one
        column per metabolite name.
    """
    raw = pd.read_csv(filename)
    name_col = raw.columns.values[0]
    by_sample = raw.rename(columns={name_col: 'metabolite_name'}).transpose()

    # After transposing, the first row carries the metabolite names: promote
    # it to the header and keep only the remaining rows.
    by_sample.columns = by_sample.iloc[0, :]
    samples = by_sample.iloc[1:, :]
    samples.index.name = 'sample_id'

    return samples

In [None]:
def impute(df, thresh=0.1):
    """Drop sparsely measured columns and impute the remaining gaps.

    Columns whose number of missing values exceeds ``thresh * len(df)`` are
    removed. Missing values in the surviving numeric columns are replaced by
    that column's minimum. Any column that still contains missing values
    afterwards (non-numeric columns, or columns with no observed value at all)
    is dropped, so the returned frame never contains nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    thresh : float, default 0.1
        Largest fraction of rows that may be missing in a column for the
        column to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame without missing values.
    """
    null_allowed = len(df.index) * thresh
    too_sparse = df.columns[(df.isnull().sum() > null_allowed).values]
    kept = df.drop(columns=too_sparse)

    # Previously every column with any null was dropped at this point, which
    # made `thresh` irrelevant; fill with the per-column minimum instead.
    imputed = kept.fillna(kept.min(numeric_only=True))

    # Safety net: remove whatever could not be imputed.
    return imputed.dropna(axis=1)

In [None]:
def load_patientmetadata(filename, m_df):
    """Read the patient metadata CSV and attach it to the measurement table.

    Column names of the metadata file are lower-cased, the table is indexed by
    ``'sample_id'``, and two redundant columns are removed: the first column
    remaining after indexing and the ``'id'`` column.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``.
    m_df : pandas.DataFrame
        Measurement table containing a ``'sample_id'`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(m_df joined with the metadata, the cleaned metadata)``, both with
        ``'sample_id'`` restored as an ordinary column.
    """
    meta = pd.read_csv(filename).rename(columns=str.lower)
    meta = meta.set_index('sample_id')

    redundant = [meta.columns.values[0], 'id']
    meta = meta.drop(columns=redundant)

    # Index-on-index join keeps every row of the measurement table.
    merged = m_df.set_index('sample_id').join(meta)

    return merged.reset_index(), meta.reset_index()

In [None]:
def standardize_data(f_vals):
    """Scale every feature (column) of ``f_vals`` to zero mean, unit variance.

    Parameters
    ----------
    f_vals : array-like
        Feature matrix handed to ``StandardScaler.fit_transform``.

    Returns
    -------
    The transformed matrix returned by ``StandardScaler.fit_transform``.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    return scaler.fit_transform(f_vals)

In [None]:
def make_df(f_vals, features, l_vals, labels):
    """Assemble one frame with the label columns first, then the features.

    Parameters
    ----------
    f_vals : array-like
        Feature values, one column per entry of ``features``.
    features : list
        Column names for ``f_vals``.
    l_vals : array-like
        Label values, one column per entry of ``labels``.
    labels : list
        Column names for ``l_vals``.

    Returns
    -------
    pandas.DataFrame
        Label columns followed by feature columns, aligned on row position.
    """
    label_frame = pd.DataFrame(data=l_vals, columns=labels)
    feature_frame = pd.DataFrame(data=f_vals, columns=features)
    return pd.concat([label_frame, feature_frame], axis=1)

In [None]:
def perform_PCA(data, l_vals, labels, save=False, ncomp=10):
    """Run PCA on ``data``, plot a summary, and return the scores with labels.

    Two figures are produced: the cumulative explained-variance ratio per
    component, and (when at least two components exist) a scatter plot of the
    first two components, coloured by the ``'group'`` label if it is present.

    Parameters
    ----------
    data : array-like
        Feature matrix passed to ``PCA.fit_transform``.
    l_vals : array-like
        Label values, one column per entry of ``labels``.
    labels : list
        Column names for ``l_vals``.
    save : bool, default False
        When true, the explained-variance figure is written to
        ``'variance-exp.png'`` in the current working directory.
    ncomp : default 10
        Passed to ``PCA`` as ``n_components``.

    Returns
    -------
    pandas.DataFrame
        Label columns followed by the columns ``'PC 1'``, ``'PC 2'``, ...
    """
    from sklearn import decomposition

    pcaAbs = decomposition.PCA(n_components=ncomp)
    data_PCA = pcaAbs.fit_transform(data)

    # Name the score columns from the number of components actually returned
    # rather than from `ncomp`, so the names always match the score matrix.
    pc_cols = ['PC ' + str(i) for i in range(1, data_PCA.shape[1] + 1)]
    df_PCA = make_df(data_PCA, pc_cols, l_vals, labels)

    # Cumulative explained variance by number of components.
    var_exp = pcaAbs.explained_variance_ratio_
    fig_ve, ax_ve = plt.subplots(1, 1)
    sns.lineplot(x=(np.arange(len(var_exp)) + 1), y=np.cumsum(var_exp), ax=ax_ve)
    ax_ve.set_xlabel('PCA component number')
    ax_ve.set_ylabel('Cumulative variance ratio')
    if save:
        fig_ve.savefig('variance-exp.png', bbox_inches='tight', pad_inches=0.5)

    # The scatter plot needs two components; colour by 'group' only when the
    # caller actually supplied that label, instead of failing on a missing column.
    if len(pc_cols) >= 2:
        fig_pca, ax_pca = plt.subplots(1, 1)
        hue = 'group' if 'group' in df_PCA.columns else None
        sns.scatterplot(x='PC 1', y='PC 2', data=df_PCA, hue=hue, ax=ax_pca)

    return df_PCA

In [None]:
# Load the three measurement tables (plasma, serum, plasma-RPMI).
plasma_df, serum_df, rpmi_df = (
    load_metabolomics(csv_name)
    for csv_name in ('measurements_plasma_full.csv',
                     'measurements_serum_full.csv',
                     'measurements_plasmarpmi_full.csv')
)

# Stack them into one table, expose the sample ids as a column, and attach
# the patient metadata.
full_df = pd.concat([plasma_df, serum_df, rpmi_df], sort=False)
full_df = full_df.reset_index()
full_df, patient_df = load_patientmetadata(
    'full_unblinded_metadata_with_smoking_tst.csv', full_df)
# NOTE(review): impute() is not applied to the combined table here.

# Report the size of the combined table.
print('The shape of our data matrix is: ', full_df.shape)


In [None]:
# Preview the first rows of the combined table. full_df already comes back
# from load_patientmetadata after a reset_index(), so the extra reset_index()
# here only adds an 'index' column to the preview.
full_df.reset_index().head()

In [None]:
# Split the table into label (metadata) columns and feature columns, then
# standardize the features and rebuild a labelled frame from them.
# NOTE(review): metabolomicsTB_df is not assigned by any cell shown above
# this one (the loading cell produces full_df), so on a fresh kernel this cell
# raises NameError — confirm which frame is intended.
labels = ['group', 'mb_sample_id', 'timepoint', 'region', 'gender']
features = [x for x in metabolomicsTB_df.columns.to_list() if x not in labels]
f_vals = metabolomicsTB_df.loc[:, features].values
l_vals = metabolomicsTB_df.loc[:, labels].values


std_data = standardize_data(f_vals)
std_df = make_df(std_data, features, l_vals, labels)
# displaying shape and first few data entries
print('The shape of the standardized data is:', std_df.shape)
#display(std_df.head())

In [None]:
# Select the rows whose 'timepoint' label contains 'M' and run PCA on them.
# NOTE(review): the variables are named males_*, yet the filter is applied to
# the 'timepoint' column rather than 'gender' — confirm which was intended.
males_data = std_data[ std_df['timepoint'].str.contains('M')]
males_labels = l_vals[ std_df['timepoint'].str.contains('M')]
pca_df = perform_PCA(males_data, males_labels, labels, save=False, ncomp=10)
display(pca_df.head())

In [None]:
# Per-column Wilcoxon rank-sum test, case vs. control, keeping the columns
# whose p-value is below alpha divided by the number of tested columns.
# Every column except the last one is tested.
metabolites = list(metabolomicsTB_df.columns)[:-1]
numMetabas = metabolomicsTB_df.shape[1] - 1
alpha = 0.05

# The group masks do not depend on the column being tested: build them once.
indicesControl = metabolomicsTB_df['group'] == 'control'
indicesCase = metabolomicsTB_df['group'] == 'case'

significant = []
for metab in metabolites:
    metabAverageControl = list(metabolomicsTB_df.loc[indicesControl, metab])
    metabAverageCase = list(metabolomicsTB_df.loc[indicesCase, metab])

    stat, p_val = sp.stats.ranksums(metabAverageControl, metabAverageCase)

    # A NaN p-value compares false and is therefore not recorded.
    if alpha / numMetabas > p_val:
        significant.append(metab)

In [None]:
# Show the columns collected as significant by the rank-sum screen.
significant

In [None]:
# TODO: Spearman correlation for healthy vs. disease (not implemented yet; see
# the commented-out call below). For now this only displays the whole table
# ordered by timepoint and then by group.
display(metabolomicsTB_df.sort_values(by=['timepoint', 'group']))
#df.corr(method='spearman')

In [None]:
# Load the untargeted results table and unpack its packed 'group' column into
# separate metadata columns.
# The path is built with os.path.join instead of the literal
# '.\data\met_data': '\d' and '\m' are invalid escape sequences, and the
# backslash is only a path separator on Windows.
# NOTE(review): the configuration cell already changes the working directory;
# confirm this relative path is correct for the directory in effect here.
ut_file = os.path.join('.', 'data', 'met_data', 'ST000974_AN001595_res.txt')
ut_met = pd.read_csv(ut_file, sep='\t', lineterminator='\r')
# Rows are split on '\r' only, so a stray '\n' can remain attached to the
# 'Samples' value (presumably the file uses '\r\n' line endings).
ut_met['Samples'] = ut_met['Samples'].str.strip('\n')

# Each 'group' entry is split on ' | ' into pieces of the form 'key:value'.
metadata = ut_met['group'].str.rsplit(' | ', expand=True)
# Column names are the lower-cased keys found in the first row ...
cols = metadata.iloc[0, :].str.rsplit(':', expand=True)[0].str.lower()
metadata.columns = cols
# ... and every cell is reduced to the value part after the ':'.
metadata = metadata.apply(lambda x: x.str.rsplit(':', expand=True)[1])

ut_met = ut_met.drop(columns='group')
ut_met = ut_met.join(metadata)
# Displayed only: the result is not assigned, so ut_met keeps its rows with
# missing values.
ut_met.dropna(axis=0)



In [None]:
# Normalise the column names: remove the leading 'X - ' marker, then any
# leading zeros, then lower-case everything.
# NOTE: str.lstrip treats 'X - ' as a set of characters ('X', ' ', '-') and
# not as a literal prefix, so any leading run of those characters is removed.
ut_met.columns = (ut_met.columns.str.lstrip('X - ')
                                .str.lstrip('0')
                                .str.lower())
# Replace each missing value with the minimum of its column, used here as a
# stand-in for the limit of detection.
ut_met = ut_met.fillna(ut_met.min())  #impute missing values as = limit of detection
ut_met.head()



In [None]:
# Per-group mean and standard deviation of every non-label column.
# 'group' is deliberately absent from `labels`, so it stays in `mz` and is
# available as the groupby key below. Note that this rebinds `labels`.
labels = ['samples', 'timepoint', 'region', 'gender']
mz = [x for x in ut_met.columns.to_list() if x not in labels]
ut_mz = ut_met[mz]
# np.nanmean / np.nanstd ignore NaN values within each group.
ut_met_summary = ut_mz.groupby(['group']).agg([('Mean', np.nanmean), ('Std', np.nanstd)])
display(ut_met_summary)

In [None]:
# Case-vs-control comparison of every m/z feature on the non-baseline samples.
# For each feature: if both groups look normally distributed, use a t-test
# (Student or Welch, depending on Bartlett's variance test); otherwise fall
# back to the rank-sum test.
# NOTE(review): mz[:-1] skips the last entry of mz — presumably the 'group'
# column; confirm against the column order of ut_met.
metabolites = mz[:-1]
# NOTE(review): numMetabas is not used below, i.e. no multiple-testing
# correction is applied to alpha_diff — confirm this is intended.
numMetabas = len(mz)
alpha_diff = 0.05
alpha_normal = 0.05

# The sample masks are the same for every feature, so build them once.
not_baseline = np.array(ut_met['timepoint'] != 'BL')
indicesControl = np.array(ut_met['group'] == 'control') & not_baseline
indicesCase = np.array(ut_met['group'] == 'case') & not_baseline

significant = []
for metab in metabolites:
    metabAverageControl = list(ut_met.loc[indicesControl, metab])
    metabAverageCase = list(ut_met.loc[indicesCase, metab])

    _, p_normal_ctrl = sp.stats.normaltest(metabAverageControl, nan_policy='omit')
    _, p_normal_case = sp.stats.normaltest(metabAverageCase, nan_policy='omit')

    # normaltest's null hypothesis is that the sample IS normal, so a small
    # p-value rejects normality. The parametric test is therefore only valid
    # when neither p-value falls below the threshold (the original condition
    # was inverted and used the t-test exactly when normality was rejected).
    if p_normal_ctrl >= alpha_normal and p_normal_case >= alpha_normal:
        # Bartlett's null hypothesis is equal variances: assume equal variance
        # only when it is NOT rejected, otherwise use Welch's t-test.
        _, p_var = sp.stats.bartlett(metabAverageControl, metabAverageCase)
        _, p_diff = sp.stats.ttest_ind(metabAverageControl, metabAverageCase,
                                       nan_policy='omit',
                                       equal_var=(p_var >= alpha_normal))
    else:
        _, p_diff = sp.stats.ranksums(metabAverageControl, metabAverageCase)

    if p_diff < alpha_diff:
        significant.append(float(metab))

# Collect the significant m/z values in a frame, show it, and save it to disk.
significant = pd.DataFrame({'mz' : significant})
display(significant)

significant.to_csv('significant_mz.csv')

In [None]:
##ADAPTED FROM CDUVALLET
#positive ion mode
def calculate_adducts(mzs):
    """Convert observed m/z values into candidate masses for six adducts.

    For every adduct, the summed mass of the added atoms is subtracted from
    each observed m/z value.

    Parameters
    ----------
    mzs : iterable
        Observed m/z values; each element is converted with ``float``.

    Returns
    -------
    tuple of six lists
        In order: ``plush`` (H), ``plusnh4`` (N + 4 H), ``plush2o``
        (2 H + O), ``plusna`` (Na), ``plusk`` (K) and ``plusch3cn``
        (2 C + 3 H + N). Each list has one entry per input value.
    """
    # Atomic masses used for the adduct shifts.
    # NOTE(review): 39.0983 is the standard (average) atomic weight of
    # potassium, while the other entries look monoisotopic (K-39 is about
    # 38.9637) — confirm which value is wanted.
    # NOTE(review): the electron mass 'e-' is defined but not used by any of
    # the shifts below — confirm whether a charge correction is intended.
    m = {'C': 12.0,
         'H': 1.007825,
         'O': 15.994915,
         'N': 14.003074,
         'S': 31.972072,
         'Cl': 34.968853,
         'F': 18.998403,
         'B': 10.012938,
         'D': 2.014102,
         'Na': 22.989770,
         'K' : 39.0983,
         'e-': 0.000548579909}

    # Materialise once so that one-shot iterables (generators) work for all
    # six adducts instead of being exhausted by the first.
    observed = [float(i) for i in mzs]

    def _minus(shift):
        # Subtract one adduct mass from every observed value.
        return [value - shift for value in observed]

    plush = _minus(m['H'])
    plusnh4 = _minus(m['N'] + 4 * m['H'])
    plush2o = _minus(2 * m['H'] + m['O'])
    plusna = _minus(m['Na'])
    plusk = _minus(m['K'])
    plusch3cn = _minus(2 * m['C'] + 3 * m['H'] + m['N'])
    # The original returned the undefined name `plusnh`, raising NameError.
    return plush, plusnh4, plush2o, plusna, plusk, plusch3cn

def parse_HMDB(xml_file):
    """Build an accession -> {name, neutral_mass} lookup from an HMDB XML file.

    The file is streamed one ``<metabolite>`` element at a time. lxml is used
    when it is installed; otherwise the standard-library ElementTree parser is
    used as a fallback.

    Parameters
    ----------
    xml_file : str or file-like
        Path to, or open handle on, the XML document.

    Returns
    -------
    dict
        Maps each ``<accession>`` text to a dict with the keys ``'name'`` and
        ``'neutral_mass'``. The mass is kept as the string found in the file,
        or ``'0.0'`` when no mass element is present.
    """
    ## ADAPTED FROM CDUVALLET
    try:
        from lxml import etree
        records = etree.iterparse(xml_file, tag='metabolite')
    except ImportError:
        import xml.etree.ElementTree as ElementTree
        # The stdlib parser has no tag filter, so filter the 'end' events here.
        records = ((event, elem)
                   for event, elem in ElementTree.iterparse(xml_file)
                   if elem.tag == 'metabolite')

    # The monoisotopic-mass element is looked up under both spellings: the one
    # this code originally used and the 'molecular' variant, so that a file
    # using either is read instead of silently falling back to '0.0'.
    mass_tags = ('monisotopic_moleculate_weight', 'monisotopic_molecular_weight')

    outdict = {}
    for counter, (event, elem) in enumerate(records, start=1):
        if counter % 2000 == 0:
            print('Reading {}th metabolite from HMDB'.format(counter))

        neutral_mass = '0.0'
        for tag in mass_tags:
            text = elem.findtext(tag)
            if text:
                neutral_mass = text
                break

        outdict[elem.findtext('accession')] = {
            'name': elem.findtext('name'),
            'neutral_mass': neutral_mass,
        }
        # Release the element's children once its values have been copied.
        elem.clear()
    return outdict



In [None]:
# Parse the cleaned serum-metabolite XML file into a lookup dictionary.
# The path is built with os.path.join instead of the literal
# '.\data\met_data', which relies on invalid escape sequences ('\d', '\m') and
# on the backslash being the path separator.
xml_file = os.path.join('.', 'data', 'met_data', 'cleaned_serum_metabolites.xml')
outdict = parse_HMDB(xml_file)
