In [1]:
# Setup

# Libraries
import pandas as pd

In [2]:
# Load the raw metabolomics table.
mbx = pd.read_csv('./data/iHMP_metabolomics.csv', low_memory=False)

# Keep only the rows that carry a (non-null) Metabolite label and
# persist that labeled subset for later use.
mbx = mbx.loc[mbx['Metabolite'].notna()]
mbx.to_csv('./data/iHMP_labeled_metabolomics.csv', index=False)

# Metabolite labels, indexed by the row labels of the filtered table.
mbx_list = mbx['Metabolite']

# Skip the first 7 columns (presumably annotation fields -- TODO confirm),
# flip the table so the remaining original columns become the rows, and
# label the new columns through the row-label -> metabolite mapping above.
mbx = mbx.iloc[:, 7:].T.rename(columns=mbx_list)

# Load the sample metadata, keep only metabolomics records whose
# 'External ID' appears in the reshaped table's index, and index the
# result by 'External ID' so both tables can be joined on their index.
meta = (
    pd.read_csv('./data/iHMP_metadata.csv', low_memory=False)
    .loc[lambda frame: frame['External ID'].isin(mbx.index)]
    .query("data_type == 'metabolomics'")
    .set_index('External ID')
)

In [3]:
# Join metadata and metabolite values on their shared index (merge's
# default join), discard rows that are exact duplicates of an earlier
# row, and give two metadata columns shorter names.
df = (
    meta.merge(mbx, left_index=True, right_index=True)
    .drop_duplicates()
    .rename(columns={'Participant ID': 'id', 'date_of_receipt': 'date'})
)

# Convert the date column to datetimes.
df['date'] = pd.to_datetime(df['date'])

# Express each date as whole days elapsed since the earliest date
# present in the merged table.
min_date = df['date'].min()
df['days_from_start'] = (df['date'] - min_date).dt.days

# The raw date is no longer needed; 'id' is deliberately kept for now.
df = df.drop(columns=['date'])

# If a column name occurs more than once, keep only its first occurrence.
df = df.loc[:, ~df.columns.duplicated()]

# --- Steps below are currently disabled ---

# # Only keep CD observations because they have hbi
# df = df.query("diagnosis == 'CD'").drop(columns = ['diagnosis'])

# # Only keep non-missing HBI for the moment
# df = df[df.hbi.notna()]

# # Standardize hbi and days for convergence properties
# df.days_from_start = (df.days_from_start - df.days_from_start.mean())/df.days_from_start.std()
# df.hbi = (df.hbi - df.hbi.mean())/df.hbi.std()


# Report the merged table's dimensions, then preview its first rows.
print(df.shape)
df.head()

(546, 1040)


Unnamed: 0,Project,id,site_sub_coll,data_type,week_num,interval_days,visit_num,Research Project,PDO Number,GSSR IDs,...,C16:1 LPC plasmalogen,C18:1 LPC plasmalogen,C18:0 LPE-A,C18:0 LPE-B,C22:6 LPE,sphingosine-isomer1,sphingosine-isomer2,sphingosine-isomer3,C14:0 SM,days_from_start
CSM5FZ3N,C3001C1_MBX,C3001,C3001C1,metabolomics,0.0,0.0,4,ibdmdb,,,...,287048.0,453688.0,9997928.0,3579319.0,2066660.0,274385476.0,136360623.0,211906815.0,36328.0,93
CSM5FZ4C,C3001C5_MBX,C3001,C3001C5,metabolomics,8.0,11.0,8,ibdmdb,,,...,113495.0,130737.0,16335239.0,13177075.0,4447184.0,121055143.0,65245288.0,301546342.0,51577.0,149
CSM5MCVV,C3001C9_MBX,C3001,C3001C9,metabolomics,16.0,15.0,13,ibdmdb,,,...,143163.0,3760.0,4371214.0,2130907.0,10511485.0,199161782.0,106462063.0,118183128.0,,211
CSM67UA2,C3001C15_MBX,C3001,C3001C15,metabolomics,28.0,13.0,20,ibdmdb,,,...,96874.0,18460.0,3306645.0,2323331.0,1555152.0,187973992.0,97169686.0,72151914.0,,293
CSM79HGP,C3001C20_MBX,C3001,C3001C20,metabolomics,38.0,15.0,26,ibdmdb,,,...,224205.0,,8323839.0,2529248.0,4901090.0,152639719.0,79560140.0,225812346.0,6212.0,364


In [4]:
# Write the merged table to disk; the index is written as the first
# CSV column (index=True is the to_csv default, spelled out here).
df.to_csv('./data/iHMP_merge.csv', index=True)