In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Read the study metadata CSV; later cells subset it by its 'data_type' column.
metadata = pd.read_csv(
    filepath_or_buffer='/Volumes/PGH-Backup/ibd_data/metadata/hmp2_metadata_2018-08-20.csv',
)

In [None]:
# Keep only the metadata rows whose data_type is 'host_transcriptomics'.
rnaseq = metadata.loc[metadata['data_type'].eq('host_transcriptomics')]

display(rnaseq)

In [None]:
# Read the metagenomics sample metadata CSV.
metagenomics = pd.read_csv(
    filepath_or_buffer='/Volumes/PGH-Backup/ibd_data/metadata/hmp2_metagenomics_metadata.csv',
)

In [None]:
# Quick look at both inputs: first five RNA-seq rows, then the
# (rows, columns) shape of the metagenomics table.
display(rnaseq.iloc[:5])
print(tuple(metagenomics.shape))

In [None]:
# Keep only the metagenomics rows whose 'Participant ID' also appears in the
# RNA-seq table.
metagenomics_filt = metagenomics.loc[
    metagenomics['Participant ID'].isin(rnaseq['Participant ID'])
]

# Number of distinct participants left after the filter (a missing ID, if any,
# counts as one value, matching len(unique())).
print(metagenomics_filt['Participant ID'].nunique(dropna=False))

In [None]:
# Pair every RNA-seq row with every metagenomics row from the same participant.
# The left join keeps RNA-seq rows that have no metagenomics match; columns
# present in both tables are disambiguated with the two suffixes.
merged_df = rnaseq.merge(
    metagenomics_filt,
    how='left',
    on='Participant ID',
    suffixes=('_rnaseq', '_metagenomics'),
)

display(merged_df)

In [None]:
# Inspect the metagenomics sample IDs carried into the merged table.
merged_df.loc[:, 'External ID_metagenomics']

In [None]:
def find_closest_timepoint(group):
    """Choose the metagenomic sample to pair with a group's RNA-seq timepoint.

    Selection rule:
      1. Among rows whose ``week_num_metagenomics`` is at or before the
         RNA-seq week, take the latest one (the closest preceding sample).
      2. If no row qualifies, fall back to the earliest metagenomic sample.
      3. If the group has no metagenomic sample at all (every
         ``week_num_metagenomics`` is missing, as happens for unmatched rows
         of a left merge), return missing values instead of failing.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group. Must contain the columns ``week_num_rnaseq``,
        ``week_num_metagenomics`` and ``External ID_metagenomics``.
        NOTE(review): only the first row's ``week_num_rnaseq`` is used, so
        this assumes all rows of a group share one RNA-seq timepoint —
        confirm against the grouping keys used by the caller.

    Returns
    -------
    pandas.Series
        Indexed by ``'External ID_metagenomics'`` and
        ``'week_num_metagenomics'``; both entries are NaN when the group has
        no metagenomic sample.
    """
    result_index = ['External ID_metagenomics', 'week_num_metagenomics']
    mgx_weeks = group['week_num_metagenomics']

    # With no usable metagenomic week, idxmin/idxmax cannot yield a row label
    # and the lookup below would fail, so report "no match" explicitly.
    if mgx_weeks.isna().all():
        return pd.Series([float('nan'), float('nan')], index=result_index)

    rnaseq_week = group['week_num_rnaseq'].iloc[0]

    # Metagenomic samples taken before or at the RNA-seq timepoint
    # (NaN weeks compare False and are excluded).
    eligible_weeks = mgx_weeks[mgx_weeks <= rnaseq_week]

    if not eligible_weeks.empty:
        # Latest eligible week == closest preceding sample
        chosen_label = eligible_weeks.idxmax()
    else:
        # Nothing at or before the RNA-seq week: use the earliest sample
        chosen_label = mgx_weeks.idxmin()

    closest_sample = group.loc[chosen_label]

    return pd.Series(
        [closest_sample['External ID_metagenomics'], closest_sample['week_num_metagenomics']],
        index=result_index,
    )

In [None]:
# Apply the matching rule once per (participant, RNA-seq project) group and
# turn the group keys back into ordinary columns.
closest_samples = (
    merged_df
    .groupby(['Participant ID', 'Project_rnaseq'])
    .apply(find_closest_timepoint)
    .reset_index()
)

In [None]:
# Give the RNA-seq table the same 'Project_rnaseq' key name that
# closest_samples uses, so the two can be merged on it.
rnaseq = rnaseq.rename({'Project': 'Project_rnaseq'}, axis='columns')

In [None]:
# Attach the chosen metagenomic sample to each RNA-seq row; the left join keeps
# RNA-seq rows even when no match was produced for their group.
final_df = rnaseq.merge(
    closest_samples,
    how='left',
    on=['Participant ID', 'Project_rnaseq'],
)

In [None]:
# Inspect the matched metagenomics sample ID for each RNA-seq row.
final_df.loc[:, 'External ID_metagenomics']

In [None]:
# Write the full matched table to disk.
# NOTE(review): a later cell writes `for_viewing` to this exact path, so this
# file is overwritten on a full run — confirm whether this write is still needed.
final_df.to_csv(
    path_or_buf='/Volumes/PGH-Backup/ibd_data/metadata/matched_rnaseq_mgx_sample_mapping.csv',
    index=False,
)

In [None]:
# Narrow the matched table to the ID, timepoint and project columns.
for_viewing = final_df.loc[
    :,
    [
        'Participant ID',
        'External ID',
        'External ID_metagenomics',
        'week_num',
        'week_num_metagenomics',
        'Project_rnaseq',
    ],
]

display(for_viewing)

In [None]:
# Save the slim mapping table; this file is read back as `mapping` further down.
for_viewing.to_csv(
    path_or_buf='/Volumes/PGH-Backup/ibd_data/metadata/matched_rnaseq_mgx_sample_mapping.csv',
    index=False,
)

**Use Mapping File to Join MGX and RNA Data for Appropriate Samples**

In [None]:
# Load the three inputs for the join: two tab-separated tables (metagenomic
# gene families, RNA-seq counts) and the sample mapping CSV written earlier.
mgx = pd.read_csv(
    filepath_or_buffer='/Volumes/PGH-Backup/ibd_data/humann_second_run/ibd_genefamilies_relab_clustered.tsv',
    sep='\t',
)
rna = pd.read_csv(
    filepath_or_buffer='/Volumes/PGH-Backup/ibd_data/rnaseq/tmm_normalized_counts.tsv',
    sep='\t',
)
mapping = pd.read_csv(
    filepath_or_buffer='/Volumes/PGH-Backup/ibd_data/metadata/matched_rnaseq_mgx_sample_mapping.csv',
)

In [None]:
# Transpose so that genes are columns and samples are rows in the RNA-seq data.
# NOTE(review): this reassigns `rna`; re-running the cell without reloading
# presumably fails because 'Gene' is no longer a column — confirm.
rna = rna.set_index('Gene').transpose()

rna.shape

In [None]:
# Preview the three tables, in order: metagenomics, RNA-seq, sample mapping.
display(mgx.head(), rna.head(), mapping)

In [None]:
# Move the sample IDs out of the row index into a 'sample_id' column.
# Naming the index axis first (instead of renaming the generated 'index'
# column afterwards) gives the right column name even if a gene were called
# 'index'. The guard makes the cell safe to re-run: without it a second run
# would insert the RangeIndex as another 'sample_id' column.
if 'sample_id' not in rna.columns:
    rna = rna.rename_axis(index='sample_id').reset_index()
display(rna.head())

In [None]:
# Build the NOD2 gene list from the .grp gene-set file: strip whitespace from
# every line, then drop the first two lines
# (presumably the set name and a comment/URL line — confirm against the file).
with open('/Volumes/PGH-Backup/ibd_data/rnaseq/GSE22611_NOD2_VS_CTRL_TRANSDUCED_HEK293T_CELL_UP.v2024.1.Hs.grp', 'r') as f:
    gene_list = [line.strip() for line in f][2:]

print(gene_list)

In [None]:
# Genes from the list that exist as columns of the RNA-seq table, followed by
# the 'sample_id' column so it is carried through the subset.
present_genes = [
    *(gene for gene in gene_list if gene in rna.columns),
    'sample_id',
]

In [None]:
# Restrict the RNA-seq table to the selected gene columns plus 'sample_id'.
rna_filt = rna.loc[:, present_genes]

print(rna_filt.head())

In [None]:
# Write the subset RNA-seq table as a tab-separated file (no index column).
rna_filt.to_csv(
    path_or_buf='/Volumes/PGH-Backup/ibd_data/rnaseq/test_subset_rnaseq.tsv',
    sep='\t',
    index=False,
)

In [None]:
# Attach the mapping to the RNA-seq subset by matching RNA-seq 'sample_id' to
# the mapping's 'External ID'; the inner join drops samples absent from either.
merged_mgx = rna_filt.merge(
    mapping,
    how='inner',
    left_on='sample_id',
    right_on='External ID',
)

display(merged_mgx)

In [None]:
# Bring in the metagenomic table via the matched metagenomics sample ID.
# The left join keeps every RNA-seq row even when no metagenomic row matches.
# NOTE(review): assumes `mgx` has a 'sample_id' column — confirm against the file.
merged_mgx_rna = merged_mgx.merge(
    mgx,
    how='left',
    left_on='External ID_metagenomics',
    right_on='sample_id',
)

display(merged_mgx_rna)

In [None]:
# Write the combined RNA-seq + metagenomics table as a tab-separated file.
merged_mgx_rna.to_csv(
    path_or_buf='/Volumes/PGH-Backup/ibd_data/rnaseq/test_subset_rnaseq_mgx.tsv',
    sep='\t',
    index=False,
)