# Statistical Analysis of Alpha Diversity vs. Metadata Feature

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import kruskal
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison
from statsmodels.stats.multitest import multipletests

Read in local copies of the American Gut Project (AGP) metadata and the precomputed alpha diversity values, keyed by sample id. Only a selected subset of metadata features is analyzed here.

In [2]:
# Directory holding the local CSV exports. Edit this single line to point the
# notebook at the data on another machine.
DATA_DIR = Path(r'C:\Users\bwesterber\Downloads')

# read local csvs
alpha_diversity = pd.read_csv(DATA_DIR / 'alpha_diversity.csv')
agp_meta_raw = pd.read_csv(DATA_DIR / 'agp_only_meta.csv')

# select categories for analysis from AGP metadata
features = ['sample_id', 'age_cat', 'antibiotic_history', 'prepared_meals_frequency', 'race',
            'salted_snacks_frequency', 'sugary_sweets_frequency', 'diabetes_type', 'flossing_frequency']

# keep only fecal samples and only the selected columns; the unfiltered
# metadata stays available as `agp_meta_raw`
is_fecal = agp_meta_raw['env_material'] == 'feces'
agp_only_meta = agp_meta_raw.loc[is_fecal, features]

# join on sample_id; the inner join keeps only samples present in both tables
df = agp_only_meta.merge(alpha_diversity, how='inner', on='sample_id')

  interactivity=interactivity, compiler=compiler, result=result)


Define helper functions for generating the alpha diversity distribution of each category level, calculating the test statistic, and plotting the results.

In [3]:
# plotting and test functions

def generate_alpha_distributions(feature, data=None, metric='observed_otus'):
    """Split an alpha-diversity metric into one array per level of a metadata column.

    Parameters
    ----------
    feature : str
        Name of the metadata column whose levels define the groups.
    data : pandas.DataFrame, optional
        Table containing both `feature` and `metric`. Defaults to the
        notebook-level `df`.
    metric : str, optional
        Name of the column holding the alpha-diversity values.

    Returns
    -------
    unique_features
        The levels of `feature` in order of first appearance, with missing
        values excluded.
    distributions : list
        One array of `metric` values per level, in the same order as
        `unique_features`.
    """
    if data is None:
        data = df

    # Missing labels are excluded up front: NaN never compares equal to
    # anything, so a NaN level would always select zero rows and hand an
    # empty group to the downstream test.
    unique_features = data[feature].dropna().unique()
    distributions = [
        data.loc[data[feature] == level, metric].values
        for level in unique_features
    ]
    return unique_features, distributions

def test_significance_kruskal(feature, data, alpha):
    # calculate test statistic
    statistic, p_value = kruskal(*[list(x) for x in data])
    # if significant run tukey HSD post hoc
    if p_value < alpha:
        mc = MultiComparison(df['observed_otus'], df[feature])
        result = mc.tukeyhsd()
        print(result)
    return statistic, p_value

def plot_distributions(feature, unique_features, distributions, p_value):
    """Draw one box plot of alpha diversity per level of a metadata column.

    Parameters
    ----------
    feature : str
        Metadata column name; used in the title and as the x-axis label.
    unique_features : sequence
        Tick label for each box, in the same order as `distributions`.
    distributions : sequence of array-like
        Alpha-diversity values for each level.
    p_value : float
        P-value shown in the title, formatted in scientific notation.

    Returns
    -------
    None
        The figure is created on a new matplotlib figure and left open.
    """
    fig, ax = plt.subplots(figsize=(12, 9))
    ax.boxplot(distributions, labels=unique_features, showcaps=True, showbox=True)
    # Scientific notation keeps very small p-values readable in the title.
    ax.set_title('Alpha Diversity by {} (p = {:.2E})'.format(feature, p_value))
    ax.set_ylabel('Alpha Diversity (Observed OTUs)')
    ax.set_xlabel('{}'.format(feature))
    # Slant the category labels so long names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45)
    return

Evaluate each category. If the Kruskal-Wallis p-value is below alpha = 0.05, run a Tukey HSD post hoc test to see which groups differ significantly.

In [7]:
# Metadata columns to test against alpha diversity, one test per column.
categories = [
    'age_cat',
    'antibiotic_history',
    'prepared_meals_frequency',
    'race',
    'salted_snacks_frequency',
    'sugary_sweets_frequency',
    'diabetes_type',
    'flossing_frequency',
]

# Uncorrected p-values, collected in the same order as `categories`.
p_vals = []

for category in categories:
    # Group the alpha diversity values by the levels of this column.
    unique_features, distributions = generate_alpha_distributions(category)

    # Kruskal-Wallis across the groups at a 0.05 significance level.
    statistic, p_value = test_significance_kruskal(category, distributions, 0.05)
    p_vals.append(p_value)

    #plot_distributions(category, unique_features, distributions, p_value)
    

      Multiple Comparison of Means - Tukey HSD,FWER=0.05     
   group1       group2     meandiff   lower    upper   reject
-------------------------------------------------------------
    20s          30s       10.7801    1.3994  20.1607   True 
    20s          40s       13.2345    3.8725  22.5966   True 
    20s          50s       21.1651   11.8559  30.4742   True 
    20s          60s       25.6201   16.3002   34.94    True 
    20s          70+       31.9963   19.7521  44.2406   True 
    20s      Not provided  -11.9014  -23.9954  0.1926  False 
    20s      Unspecified   -17.5911  -44.6161  9.4339  False 
    20s          baby      -88.7148 -189.0061 11.5766  False 
    20s         child      -24.7322  -38.3232 -11.1412  True 
    20s          teen       -9.552   -26.5749  7.471   False 
    30s          40s        2.4545   -5.3718  10.2808  False 
    30s          50s        10.385    2.622    18.148   True 
    30s          60s        14.84     7.0641  22.6159   True 
    30s 

Apply a Bonferroni correction to the calculated p-values. This controls the family-wise error rate across the tested categories (the chance of reporting any false positive), but it may be overly conservative.

In [13]:
# Bonferroni adjustment of the per-category p-values. `multipletests` returns
# a tuple whose second element holds the corrected p-values.
corrected_p_vals = multipletests(p_vals, alpha=0.05, method='bonferroni')

# One row per category: raw p-value next to its corrected counterpart.
summary_columns = {
    'Category': categories,
    'P-value': p_vals,
    'Bonferroni Corrected P-value': corrected_p_vals[1],
}
df_summary_stats = pd.DataFrame(summary_columns)
df_summary_stats

Unnamed: 0,Category,P-value,Bonferroni Corrected P-value
0,age_cat,1.298892e-76,1.039114e-75
1,antibiotic_history,4.771782e-55,3.817426e-54
2,prepared_meals_frequency,2.101691e-19,1.681353e-18
3,race,8.148574999999999e-50,6.51886e-49
4,salted_snacks_frequency,1.3702170000000001e-25,1.096174e-24
5,sugary_sweets_frequency,1.036762e-06,8.294096e-06
6,diabetes_type,3.996243e-13,3.196994e-12
7,flossing_frequency,4.364542e-22,3.491634e-21
