In [17]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import argparse

In [19]:
# Goal of this section: build a version of the abundance table that also
# carries each sample's dysbiosis score. This cell only loads the two inputs.

abundance_path = '/Volumes/PGH-Backup/ibd_data/humann_second_run/ibd_genefamilies_relab_clustered.tsv'
dysbiosis_path = '/Volumes/PGH-Backup/ibd_data/metadata/dysbiosis_scores.tsv'

# Tab-separated abundance table; its own header row supplies the column names.
abundance = pd.read_csv(abundance_path, sep='\t')

# Column names are supplied here via `names=`, so pandas reads the first line
# of the file as data -- assumes the file has no header row (TODO confirm).
dysbiosis = pd.read_csv(
    dysbiosis_path,
    sep='\t',
    names=['sample_id', 'dysbiosis_score', 'dysbiosis_category'],
)

# print(dysbiosis.head())

# Quick sanity check on the (rows, columns) of both tables.
print(abundance.shape, dysbiosis.shape)


(1337, 1502) (1595, 3)


In [20]:
# Attach the dysbiosis score to every abundance row that has a matching
# sample_id. merge() defaults to an inner join, so samples missing from either
# table are dropped. The leftover CSV index column and the categorical
# dysbiosis label are removed afterwards.
abundance_merged = (
    abundance
    .merge(dysbiosis, on='sample_id')
    .drop(columns=['Unnamed: 0', 'dysbiosis_category'])
)

# print(abundance_merged.shape)

display(abundance_merged)

Unnamed: 0,DL-endopeptidase-A0A0B0HUJ2,DL-endopeptidase-A0A0P8W7Z2,DL-endopeptidase-A0A132I0H7,DL-endopeptidase-A0A174CGJ4,DL-endopeptidase-A0A174M4B0,DL-endopeptidase-A0A174TDS3,DL-endopeptidase-A0A1C5KMM8,DL-endopeptidase-A0A1C5KNH6,DL-endopeptidase-A0A1C5PVP3,DL-endopeptidase-A0A1C5Q6A7,...,Muramidase-T0WIP7,Muramidase-U2ECY2,Muramidase-U7D9Q6,Muramidase-V1GVJ2,Muramidase-V5EDU1,Muramidase-W4UCM7,Muramidase-W5XD20,Muramidase-W9H589,sample_id,dysbiosis_score
0,0.000000,0.000000e+00,8.383020e-07,0.000000,1.323079e-06,0.000000,0.000004,0.000003,0.000000e+00,0.000000e+00,...,0.0,0.000000,0.000145,0.0,0.0,0.0,0.0,0.0,CSM5FZ4M,0.761300
1,0.000001,0.000000e+00,3.478880e-06,0.000002,4.373101e-06,0.000000,0.000018,0.000010,7.443690e-06,0.000000e+00,...,0.0,0.000000,0.000093,0.0,0.0,0.0,0.0,0.0,CSM5MCUO,0.632952
2,0.000000,0.000000e+00,0.000000e+00,0.000000,9.575670e-07,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,...,0.0,0.000000,0.000146,0.0,0.0,0.0,0.0,0.0,CSM5MCVL,0.912606
3,0.000000,0.000000e+00,1.302237e-06,0.000000,1.704135e-06,0.000000,0.000001,0.000000,0.000000e+00,0.000000e+00,...,0.0,0.000000,0.000143,0.0,0.0,0.0,0.0,0.0,CSM5MCVN,0.916895
4,0.000000,0.000000e+00,2.120380e-06,0.000000,4.052097e-06,0.000000,0.000009,0.000008,0.000000e+00,4.906150e-07,...,0.0,0.000000,0.000130,0.0,0.0,0.0,0.0,0.0,CSM5MCW6,0.962751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1293,0.000000,0.000000e+00,2.761959e-06,0.000001,5.375752e-06,0.000003,0.000024,0.000004,0.000000e+00,4.161391e-06,...,0.0,0.000000,0.000117,0.0,0.0,0.0,0.0,0.0,PSMB4MBS,0.732561
1294,0.000000,3.981110e-07,2.111380e-06,0.000000,2.358866e-06,0.000000,0.000035,0.000004,9.643330e-07,1.218340e-06,...,0.0,0.000000,0.000126,0.0,0.0,0.0,0.0,0.0,PSMB4MC1,0.691453
1295,0.000000,0.000000e+00,9.060580e-07,0.000000,0.000000e+00,0.000000,0.000023,0.000000,2.045580e-06,0.000000e+00,...,0.0,0.000000,0.000146,0.0,0.0,0.0,0.0,0.0,PSMB4MC3,0.719423
1296,0.000000,0.000000e+00,0.000000e+00,0.000000,8.428045e-06,0.000004,0.000051,0.000006,1.476350e-05,0.000000e+00,...,0.0,0.000002,0.000083,0.0,0.0,0.0,0.0,0.0,PSMB4MC5,0.722597


In [21]:
# Persist the merged table as a tab-separated file, without the row index.
merged_output_path = '/Volumes/PGH-Backup/ibd_data/humann_second_run/ibd_genefamilies_relab_clustered_dysbiosis.tsv'
abundance_merged.to_csv(
    merged_output_path,
    sep='\t',
    index=False,
)

In [11]:
def load_data(file_path):
    """Read a tab-separated file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or file-like
        Location of the TSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with column names taken from the file's first row.
    """
    return pd.read_csv(file_path, sep='\t')

In [12]:
def calculate_correlations(df, gene_columns, score_column):
    """Correlate each gene column with the score column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the gene columns and the score column.
    gene_columns : iterable of str
        Names of the columns to correlate against the score.
    score_column : str
        Name of the column holding the score.

    Returns
    -------
    tuple of (dict, dict)
        ``(spearman_results, pearson_results)``; each maps a gene column name
        to a ``(correlation, p_value)`` pair.
    """
    spearman_by_gene, pearson_by_gene = {}, {}

    for column in gene_columns:
        values = df[column]
        scores = df[score_column]

        rho, rho_pvalue = spearmanr(values, scores)
        r, r_pvalue = pearsonr(values, scores)

        spearman_by_gene[column] = (rho, rho_pvalue)
        pearson_by_gene[column] = (r, r_pvalue)

    return spearman_by_gene, pearson_by_gene

In [13]:
def plot_correlations(correlation_results, method):
    """Scatter-plot per-gene correlation coefficients, sized by significance.

    Parameters
    ----------
    correlation_results : dict
        Maps a gene name to a ``(correlation, p_value)`` pair.
    method : str
        Name of the correlation method; used in the y-axis label and title.

    Notes
    -----
    Marker size encodes ``-log10(p_value)``. P-values are clamped to the
    smallest positive normal float before taking the logarithm, so a p-value
    that has underflowed to exactly 0 yields a large finite size instead of
    infinity.
    """
    genes = list(correlation_results.keys())
    corrs = [correlation_results[gene][0] for gene in genes]
    pvals = np.asarray([correlation_results[gene][1] for gene in genes], dtype=float)

    # -log10(0) is +inf, and an infinite value cannot be mapped onto the
    # finite `sizes` range below; clamp zeros to the smallest positive float.
    smallest_positive = np.finfo(float).tiny
    significance = -np.log10(np.clip(pvals, smallest_positive, None))

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=genes, y=corrs, size=significance, legend=False, sizes=(20, 200))
    # Dashed reference line at zero correlation.
    plt.axhline(0, linestyle='--', color='grey')
    plt.xlabel('Gene')
    plt.ylabel(f'{method} Correlation')
    plt.title(f'{method} Correlation of Gene Counts with Dysbiosis Scores')
    plt.xticks(rotation=90)
    plt.show()

In [14]:
def main(argv=None):
    """Command-line entry point: correlate gene columns with dysbiosis scores.

    Parses the arguments, loads the TSV, computes Spearman and Pearson
    correlations of every numeric gene column against the score column, and
    plots both sets of results.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse instead of ``sys.argv[1:]``. With the default
        ``None`` argparse reads ``sys.argv`` as before. Pass a list to call
        this from a notebook, e.g. ``main(['table.tsv'])``, where ``sys.argv``
        is not under the caller's control.
    """
    parser = argparse.ArgumentParser(description="Calculate correlations between gene counts and dysbiosis scores")
    parser.add_argument('file_path', type=str, help="Path to the TSV file containing the data")
    parser.add_argument('--score_column', type=str, default='dysbiosis_score', help="Column name for dysbiosis scores")
    args = parser.parse_args(argv)

    # Load the data
    df = load_data(args.file_path)

    # Gene columns are the numeric columns other than the score column.
    # Non-numeric identifier columns (e.g. a string 'sample_id') are excluded
    # because they cannot be passed to the correlation functions.
    numeric_columns = df.select_dtypes(include='number').columns
    gene_columns = [col for col in numeric_columns if col != args.score_column]

    # Calculate correlations
    spearman_results, pearson_results = calculate_correlations(df, gene_columns, args.score_column)

    # Plot the results
    plot_correlations(spearman_results, 'Spearman')
    plot_correlations(pearson_results, 'Pearson')

# if __name__ == "__main__":
#     main()