In [1]:
import pandas as pd
import qiime2 as q2
import biom

## Genome Coverage Filter

In [2]:
# Load the sample mapping file; '#SampleID' is read as str and becomes the row index.
md = (
    pd.read_csv(
        '../data/64306_64306_analysis_mapping.txt',
        sep='\t',
        dtype={'#SampleID': str},
    )
    .set_index('#SampleID')
)

  md = pd.read_csv('../data/64306_64306_analysis_mapping.txt', sep = '\t', dtype={'#SampleID': str}).set_index('#SampleID')


In [3]:
# Feature table from Qiita analysis 64306, converted to a DataFrame and transposed.
# NOTE(review): this is an absolute, user-specific path while the metadata is read
# from a relative one; confirm before running elsewhere.
qiita_table = (
    biom.load_table("/home/lakhatib/thdmi/data/64306_analysis_Metagenomic_Woltkav014DatabasescratchqpwoltkaWoLr2WoLr2BIOMnonebiom.biom")
    .to_dataframe()
    .T
)
# Coverage table from Micov, read from the current working directory.
cov = pd.read_csv('coverages.coverage.tsv', sep='\t')

In [4]:
# Re-orient the table so that its rows line up with the genome IDs of the
# coverage table, allowing coverage to be attached as an extra column.
ft = qiita_table.transpose()

# Index the coverage table by genome ID. Guarded so that re-running this cell
# does not raise KeyError once 'genome_id' has already been moved into the index.
if cov.index.name != 'genome_id':
    cov = cov.set_index('genome_id')

# Keep only coverage rows whose genome ID also appears in the feature table.
cov = cov.loc[cov.index.intersection(ft.index)]

# Assignment aligns on the index; rows of ft without a coverage entry get NaN
# and therefore never pass the `> 15` comparison below.
ft['percent_covered'] = cov['percent_covered']

#check how many features will remain with a 15% threshold
# Vectorised count instead of Python's builtin sum(); int() keeps the displayed
# value a plain integer.
int((ft['percent_covered'] > 15).sum())

1446

In [5]:
# Apply the coverage filter: keep rows with percent_covered above 15, flip the
# table back, then remove the helper 'percent_covered' row that the transpose
# carried along.
ft_filtered = (
    ft.loc[ft['percent_covered'] > 15]
    .transpose()
    .drop(index="percent_covered")
)

## Sample Filtering

In [6]:
# Sample count and per-cohort breakdown before any sample filtering.
print(f"Total Samples: {md.shape[0]}")
print(md.loc[:, 'thdmi_cohort'].value_counts())

Total Samples: 2747
Mexico            549
US                534
Japan             524
Spain             481
UK                366
not applicable    277
unknown            10
control sample      6
Name: thdmi_cohort, dtype: int64


In [7]:
# Keep only the Mexico, US and UK cohorts; every other thdmi_cohort label is dropped.
md = md[md['thdmi_cohort'].isin(['Mexico', 'US', 'UK'])]

In [8]:
# Sample count and per-cohort breakdown after the cohort filter.
print(f"Total Samples: {md.shape[0]}")
print(md.loc[:, 'thdmi_cohort'].value_counts())

Total Samples: 1449
Mexico    549
US        534
UK        366
Name: thdmi_cohort, dtype: int64


In [9]:
# Keep only samples whose keep_sample_for_thdmi value is exactly 'yes'.
md = md.loc[md['keep_sample_for_thdmi'].eq('yes')]

In [10]:
# Sample count and per-cohort breakdown after the keep_sample_for_thdmi filter.
print(f"Total Samples: {md.shape[0]}")
print(md.loc[:, 'thdmi_cohort'].value_counts())

Total Samples: 1291
Mexico    507
US        442
UK        342
Name: thdmi_cohort, dtype: int64


In [11]:
# Restrict the metadata to sample IDs that also appear in the coverage-filtered feature table.
md = md.loc[md.index.intersection(ft_filtered.index)]

In [12]:
# Sample count and per-cohort breakdown after matching metadata to the feature table.
print(f"Total Samples: {md.shape[0]}")
print(md.loc[:, 'thdmi_cohort'].value_counts())

Total Samples: 1291
Mexico    507
US        442
UK        342
Name: thdmi_cohort, dtype: int64


In [13]:
#drop samples with invalid covariates
import numpy as np

# Age: remove rows marked 'not provided', convert to float, keep 18-100 inclusive.
md = md.loc[md['host_age'].ne('not provided')]
md['host_age'] = md['host_age'].astype('float')
md = md.loc[md['host_age'].between(18, 100)]

# BMI: 'not provided' becomes NaN, then any value outside 12-70 (inclusive)
# is also set to NaN so that the dropna below removes it.
md['host_body_mass_index'] = (
    md['host_body_mass_index'].replace('not provided', np.nan).astype(float)
)
md['host_body_mass_index'] = md['host_body_mass_index'].where(
    md['host_body_mass_index'].between(12, 70)
)

# Remaining covariates: only the 'not provided' placeholder is treated as missing.
md['covid_level_of_wellbeing'] = md['covid_level_of_wellbeing'].replace('not provided', np.nan)
md['antibiotic_history'] = md['antibiotic_history'].replace('not provided', np.nan)

# Drop every sample that is missing at least one of the four covariates.
md = md.dropna(
    subset=['antibiotic_history', 'covid_level_of_wellbeing', 'host_body_mass_index', 'host_age']
)

In [14]:
# Sample count and per-cohort breakdown after dropping samples with invalid covariates.
print(f"Total Samples: {md.shape[0]}")
print(md.loc[:, 'thdmi_cohort'].value_counts())

Total Samples: 1218
Mexico    497
US        412
UK        309
Name: thdmi_cohort, dtype: int64


In [15]:
# Match the feature table to the metadata: keep only rows of ft_filtered whose
# index label also appears in md.index (the samples that survived the filters above).
ft_filtered = ft_filtered.loc[ft_filtered.index.intersection(md.index)]

## Feature Filtering

In [16]:
# Save as .qza: wrap the filtered table as a FeatureTable[Frequency] artifact
# and write it to disk so it can be passed to the QIIME 2 command line.

ft_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', ft_filtered)
ft_q2.save('../data/filtered_feature-table.qza')

'../data/filtered_feature-table.qza'

In [None]:
# Filter the feature table against the Greengenes2 2022.10 reference, writing
# gg2_filtered-feature-table.qza.
# NOTE(review): all three paths are absolute and user-specific, whereas the input
# artifact was saved via the relative path '../data/filtered_feature-table.qza';
# confirm both refer to the same file.
!qiime greengenes2 filter-features --i-feature-table /home/lakhatib/3country/final_scripts/data/filtered_feature-table.qza --i-reference /home/mcdonadt/greengenes2/release/2022.10/2022.10.taxonomy.asv.nwk.qza --o-filtered-feature-table /home/lakhatib/3country/final_scripts/data/gg2_filtered-feature-table.qza

In [None]:
# Load the Greengenes2-filtered table back in and view it as a DataFrame.
# This rebinds `ft`, replacing the coverage-annotated frame built earlier.
ft_q2 = q2.Artifact.load('/home/lakhatib/3country/final_scripts/data/gg2_filtered-feature-table.qza')
ft = ft_q2.view(pd.DataFrame)

In [None]:
# Number of columns in the gg2-filtered table.
print(ft.shape[1])

In [None]:
# Minimum relative abundance, as a fraction of a sample's total counts,
# used by the prevalence filter below.
relative_abundance_threshold = 1e-6  # 0.0001%

In [None]:
# Copy of ft with one extra column, 'total_abundance', holding each row's
# summed counts; ft itself is left untouched.
ft_ra = ft.assign(total_abundance=ft.sum(axis=1))

In [None]:
# Turn each sample's total count into the minimum count a feature needs in that
# sample to reach the relative-abundance threshold. The value is recomputed from
# `ft` instead of scaling the column in place, so re-running this cell cannot
# shrink the cutoff a second time.
# NOTE: despite its name, 'total_abundance' holds this per-sample cutoff from here on.
ft_ra['total_abundance'] = ft.sum(axis=1) * relative_abundance_threshold

In [None]:
# Boolean table: True where a feature's count in a sample is at least that
# sample's cutoff. The helper column is excluded by name and the comparison is
# aligned row by row.
features_above_threshold = (
    ft_ra.drop(columns='total_abundance')
    .ge(ft_ra['total_abundance'], axis='index')
)

In [None]:
# Step 1: prevalence per feature = fraction of samples in which it meets the
# threshold (column-wise mean of the boolean table).
feature_prevalence = features_above_threshold.mean(axis='index')

In [None]:
# Step 2: retain the labels of features whose prevalence is at least 10%.
prevalence_threshold = 0.1  # 10%
features_to_keep = feature_prevalence.loc[
    feature_prevalence.ge(prevalence_threshold)
].index

In [None]:
# Select the retained feature columns from ft_ra. The 'total_abundance' helper
# column is left out because it is not among features_to_keep.
filtered_features_df = ft_ra.loc[:, features_to_keep]

In [None]:
# Drop samples whose summed counts over the retained features are below 1,000,000.
filtered_features_df = filtered_features_df[
    filtered_features_df.sum(axis=1).ge(1_000_000)
]

In [None]:
# Sanity check: the smallest per-sample total should be at least 1000000.
filtered_features_df.sum(axis='columns').min()

In [None]:
# Keep only metadata rows for samples that survived the feature and depth filters.
md = md.loc[md.index.intersection(filtered_features_df.index)]

In [None]:
# Final sample count and per-cohort breakdown.
print(f"Total Samples: {md.shape[0]}")
print(md.loc[:, 'thdmi_cohort'].value_counts())

In [None]:
# Name the row index '#SampleID' so the column header is written on export.
md = md.rename_axis(index='#SampleID')

In [None]:
# Write the subsetted metadata as a tab-separated file.
md.to_csv('../data/subsetted_md.tsv', sep = '\t')

In [None]:
# Align the feature table to the metadata: keep only rows whose label is in md.index.
filtered_features_df = filtered_features_df.loc[filtered_features_df.index.intersection(md.index)]

In [None]:
# Write the final filtered feature table as a tab-separated file.
filtered_features_df.to_csv('../data/filtered_ft.tsv', sep = '\t')

In [None]:
# Wrap the final filtered table as a QIIME 2 FeatureTable[Frequency] artifact.
filtered_features_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', filtered_features_df)

In [None]:
# Save the artifact next to the TSV export.
filtered_features_q2.save('../data/filtered_ft.qza')