In [29]:
import biom
import pandas as pd
import numpy as np
import qiime2 as q2 
from qiime2.plugins import taxa

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss

import statsmodels.regression.mixed_linear_model as sm

In [30]:
from biom import Table
# load biom table and metadata 
thdmi_ft = q2.Artifact.load('../data/filtered_ft.qza')
thdmi_df = thdmi_ft.view(pd.DataFrame).transpose()
thdmi_bt = Table(thdmi_df.values, thdmi_df.index, thdmi_df.columns)
wol2_tax = q2.Artifact.import_data('FeatureData[Taxonomy]', '/projects/wol/qiyun/wol2/taxonomy/lineages.txt', 'HeaderlessTSVTaxonomyFormat')

thdmi_md = q2.Metadata.load('/projects/thdmi/metadata/consolidated_metadata_subset.tsv').to_dataframe()

In [31]:
thdmi_genus = taxa.methods.collapse(table=thdmi_ft, taxonomy=wol2_tax, level=6).collapsed_table

In [32]:
thdmi_genus_df = thdmi_genus.view(pd.DataFrame) 

In [33]:
faecali_to_bacteroides = np.log(thdmi_genus_df['d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Ruminococcaceae;g__Faecalibacterium'] / 
                                thdmi_genus_df['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides'])
prevotella_to_bacteroides = np.log(thdmi_genus_df['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella'] / 
                                   thdmi_genus_df['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides'])

In [34]:
nutrient_intake = thdmi_md.columns[thdmi_md.columns.str.contains('Percent_of_calories')]
amount = thdmi_md.columns[thdmi_md.columns.str.startswith('Amount_')]
hei_2015 = thdmi_md.columns[thdmi_md.columns.str.startswith('Hei2015')]
collapsed_foods = thdmi_md.columns[thdmi_md.columns.str.startswith('Collapsed_')]
additional_cols = ['DP_All', 'DP_Core', 'types_of_plants_coded', 'diet_type_coded', 'fermented_plant_frequency_coded', 'specialized_diet_exclude_dairy', 
                   'specialized_diet_exclude_refined_sugars', 'Processed_Calories_Nova_processed_foods', 'Processed_Calories_Nova_ultra_processed_foods',
                   'Processed_Grams_Nova_ultra_processed_foods', 'Processed_Grams_Nova_processed_foods', 'Processed_Grams_Nova_unprocessed_or_minimally_processed_foods', 
                   'Processed_Calories_Nova_unprocessed_or_minimally_processed_foods', 'thdmi_cohort', 'covid_level_of_wellbeing_coded', 'bmi_cat_coded',
                   'antibiotic_history_coded', 'sex']

In [35]:
thdmi_md_subset = thdmi_md[nutrient_intake.union(amount).union(hei_2015).union(collapsed_foods).union(additional_cols)]


for c in ['Processed_Calories_Nova_processed_foods', 'Processed_Calories_Nova_ultra_processed_foods', 'Processed_Calories_Nova_unprocessed_or_minimally_processed_foods']: 
    thdmi_md_subset.insert(thdmi_md_subset.shape[1], c + '_per1000kcal', thdmi_md_subset[c] * 10)
    thdmi_md_subset[c] = (1/100) * thdmi_md_subset[c] * thdmi_md_subset['Amount_Energy_in_kcal']
    

for g in ['Processed_Grams_Nova_ultra_processed_foods', 'Processed_Grams_Nova_processed_foods', 'Processed_Grams_Nova_unprocessed_or_minimally_processed_foods']: 
    new_col = thdmi_md_subset[g].copy().values
    thdmi_md_subset[g] = (1/100) * thdmi_md_subset[g] * thdmi_md_subset['Amount_Total_Grams_in_g']
    thdmi_md_subset.insert(thdmi_md_subset.shape[1], g.replace('Grams', 'Percent_Grams'), new_col)

thdmi_md_subset = thdmi_md_subset.loc[thdmi_md_subset.index.intersection(thdmi_bt.ids())]
thdmi_md_subset.insert(thdmi_md_subset.shape[1], 'log_Faecalibacterium_to_Bacteroides', faecali_to_bacteroides)
thdmi_md_subset.insert(thdmi_md_subset.shape[1], 'log_Prevotella_to_Bacteroides', prevotella_to_bacteroides)

for c in thdmi_md_subset.columns:
    if '+' in c: 
        thdmi_md_subset.rename(columns={c: c.replace('+', 'and')}, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  thdmi_md_subset[c] = (1/100) * thdmi_md_subset[c] * thdmi_md_subset['Amount_Energy_in_kcal']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  thdmi_md_subset[c] = (1/100) * thdmi_md_subset[c] * thdmi_md_subset['Amount_Energy_in_kcal']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  thdmi_md_subset[c] 

In [36]:
prev_bacteroides_plots = thdmi_md_subset[['thdmi_cohort', 'log_Faecalibacterium_to_Bacteroides', 'log_Prevotella_to_Bacteroides', 'DP_All', 'DP_Core']]
prev_bact_mx_excluded = prev_bacteroides_plots.loc[prev_bacteroides_plots['thdmi_cohort'] != 'Mexico']
prev_bact_uk_excluded = prev_bacteroides_plots.loc[prev_bacteroides_plots['thdmi_cohort'] != 'UK']
prev_bact_us_excluded = prev_bacteroides_plots.loc[prev_bacteroides_plots['thdmi_cohort'] != 'US']
prev_bacteroides_melted = prev_bacteroides_plots[['thdmi_cohort', 'log_Faecalibacterium_to_Bacteroides', 
                                                  'log_Prevotella_to_Bacteroides']].melt(id_vars=['thdmi_cohort'], var_name='ratio', value_name='log_ratio')

In [37]:
thdmi_md_subset.loc[thdmi_md_subset['log_Prevotella_to_Bacteroides'] > 0].groupby('thdmi_cohort').count()

Unnamed: 0_level_0,Amount_3_Methylhistidine_in_mg,Amount_Acesulfame_Potassium_in_mg,Amount_Added_Sugars__by_Available_Carbohydrate__in_g,Amount_Added_Sugars__by_Total_Sugars__in_g,Amount_Alanine_in_g,Amount_Alcohol_in_g,Amount_Alpha_Carotene__provitamin_A_carotenoid__in_mcg,Amount_Alpha_Tocopherol_in_mg,Amount_Animal_Protein_in_g,Amount_Arginine_in_g,...,specialized_diet_exclude_refined_sugars,types_of_plants_coded,Processed_Calories_Nova_processed_foods_per1000kcal,Processed_Calories_Nova_ultra_processed_foods_per1000kcal,Processed_Calories_Nova_unprocessed_or_minimally_processed_foods_per1000kcal,Processed_Percent_Grams_Nova_ultra_processed_foods,Processed_Percent_Grams_Nova_processed_foods,Processed_Percent_Grams_Nova_unprocessed_or_minimally_processed_foods,log_Faecalibacterium_to_Bacteroides,log_Prevotella_to_Bacteroides
thdmi_cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mexico,153,153,153,153,153,153,153,153,153,153,...,159,157,153,153,153,153,153,153,159,159
UK,55,55,55,55,55,55,55,55,55,55,...,55,54,55,55,55,55,55,55,55,55
US,61,61,61,61,61,61,61,61,61,61,...,61,61,61,61,61,61,61,61,61,61


In [38]:
thdmi_md_subset['log_Prevotella_to_Bacteroides'].quantile([0, 0.25, 0.5, 0.75, 1])

0.00   -6.458629
0.25   -3.713944
0.50   -3.030166
0.75   -0.257650
1.00    4.433951
Name: log_Prevotella_to_Bacteroides, dtype: float64

In [39]:
thdmi_md_subset.to_csv('log_ratio_foods_analysis.csv')