In [None]:
import pandas as pd
import os
import numpy as np

## E. coli

In [None]:
# E. coli gene table, indexed by protein_id.
genes = pd.read_csv(
    'gene_positions.txt',
    sep='\t',
    index_col='protein_id',
)
genes.head(3)

In [None]:
def _short_cds_id(raw_id):
    """Return the first two '_'-separated tokens that follow '_cds_' in raw_id.

    Identifiers without '_cds_', or with fewer than two tokens after it,
    are returned unchanged.
    """
    if '_cds_' not in raw_id:
        return raw_id
    tokens = raw_id.split('_cds_')[1].split('_')
    if len(tokens) > 1:
        return tokens[0] + "_" + tokens[1]
    return raw_id


# Table read from 'deseqoutput_merged.tsv'; its first column becomes the index.
deg_all = pd.read_csv('deseqoutput_merged.tsv', sep='\t', index_col=0)
deg_all.index = [_short_cds_id(raw_id) for raw_id in deg_all.index]

# Look up locus_tag by matching the shortened ids against the index of
# `genes`, then re-index the table by locus_tag.
deg_all = deg_all.merge(genes['locus_tag'], left_index=True, right_index=True, how='left')
deg_all = deg_all.set_index('locus_tag')
# Keep only the first row for every repeated locus_tag.
deg_all = deg_all.loc[~deg_all.index.duplicated(keep='first')]
deg_all.head(2)

In [None]:
# Filter the per-strain DESeq2 outputs by padj and log2FC and combine them
# into one table: rows are gene ids, one log2FoldChange column per file.

path = 'DEG/DESeq2_output/'
# os.listdir returns entries in arbitrary order; sorting keeps the column
# order of full_table identical on every run and platform.
strains = sorted(os.listdir(path))

full_table = None

for strain in strains:
    if not strain.startswith('NT'):
        continue
    df = pd.read_csv(path + strain)
    # keep only significant DEGs: |log2FoldChange| >= 2 and padj < 0.0001
    df = df[(df['log2FoldChange'].abs() >= 2) & (df['padj'] < 0.0001)]
    # the unnamed first column becomes 'gene_id'; the fold-change column is
    # named after the file name minus its last four characters
    df = df.rename(columns={'Unnamed: 0': 'gene_id', 'log2FoldChange': strain[:-4]})
    # drop all irrelevant columns
    df = df.set_index('gene_id').drop(columns=['baseMean', 'lfcSE', 'stat', 'pvalue', 'padj'])
    # outer merge on gene_id so a gene significant in any file is kept
    if full_table is None:
        full_table = df
    else:
        full_table = pd.merge(full_table, df, left_index=True, right_index=True, how='outer')

if full_table is None:
    # Fail here with a clear message instead of an AttributeError in a later cell.
    raise FileNotFoundError("no files starting with 'NT' found in " + repr(path))
full_table

In [None]:
deg = full_table.copy()

# Shorten ids containing '_cds_' to the first two '_'-separated tokens that
# follow it; ids without '_cds_' (or with a single token) stay unchanged.
short_ids = []
for raw_id in deg.index:
    tokens = raw_id.split('_cds_')[1].split('_') if '_cds_' in raw_id else []
    short_ids.append(tokens[0] + "_" + tokens[1] if len(tokens) > 1 else raw_id)
deg.index = short_ids

# Look up locus_tag for every row (how='right' keeps all rows of deg), then
# transpose so that rows are strains and columns are locus tags.
deg = pd.merge(genes['locus_tag'], deg, left_index=True, right_index=True, how='right')
deg = deg.set_index('locus_tag').T
# Drop columns without a locus_tag. A boolean mask is used because selecting
# by label would repeat columns whenever a locus_tag occurs more than once.
deg = deg.loc[:, deg.columns.notna()]

# Calculate the number of strains that should meet the criteria (5% of total strains)
# NOTE(review): int() rounds down, so the cut-off is 0 when there are fewer
# than 20 strains and every gene then passes — confirm this is intended.
min_strains = int(0.05 * len(deg.index))

# Filter genes based on log2 fold change criteria
strain_hits = (deg.abs() >= 2).sum()
deg_filtered = deg.loc[:, strain_hits >= min_strains].copy()

# Add the Count row. DataFrame.append was removed in pandas 2.0, so the row
# is attached with pd.concat instead.
counts = (deg_filtered.abs() >= 2).sum().rename('Count')
deg_filtered = pd.concat([deg_filtered, counts.to_frame().T])

# Display the resulting DataFrame
deg_filtered.head(3)

In [None]:
# Focus on genes that are present in at least 10% of the strains.

gpa = pd.read_csv('gene_presence_absence.Rtab', sep='\t', index_col=0)
# Column labels starting with 'NT' are cut at their first underscore.
gpa = gpa.rename(columns=lambda col: col.split('_')[0] if col.startswith('NT') else col)

# Gene table re-keyed by gene_name so it can be joined on the row labels of gpa.
g = genes.reset_index().set_index('gene_name')

# Inner join on the index, relabel rows by locus_tag, then transpose so that
# locus tags become the columns.
gpa2 = (
    pd.merge(g['locus_tag'], gpa, left_index=True, right_index=True, how='inner')
    .set_index('locus_tag')
    .T
)
# 'total' row: column-wise sum, i.e. one value per locus_tag.
gpa2.loc['total'] = gpa2.sum()
# Keep the locus tags whose total reaches the hard-coded cut-off of 11.
gpa2 = gpa2.loc[:, gpa2.loc['total'] >= 11]

gpa2.head(3)


In [None]:
# Column labels shared by deg_filtered and gpa2.
selected_genes = set(deg_filtered.columns).intersection(gpa2.columns)
len(selected_genes)

In [None]:
# Rows of deg_all whose index label is one of the selected genes.
phenotypes = deg_all.loc[deg_all.index.isin(selected_genes)]
phenotypes.head(3)

In [None]:
# Gene table keyed by locus_tag, used to look up the protein_id of each gene.
g2 = genes.reset_index().set_index('locus_tag')
phenotypes2 = (
    phenotypes
    .merge(g2['protein_id'], left_index=True, right_index=True, how='left')
    .set_index('protein_id')
)
# Prefix every protein_id with 'cds-'.
phenotypes2.index = 'cds-' + phenotypes2.index
phenotypes2.head(3)

### Differentially expressed *E. coli* genes annotated as virulence factors or antibiotic resistance genes

In [None]:
# Table read from 'BVBRC_sp_gene_MG1655.csv', indexed by its
# 'RefSeq Locus Tag' column.
patric = pd.read_csv(
    'BVBRC_sp_gene_MG1655.csv',
    index_col='RefSeq Locus Tag',
)
patric.head(3)

In [None]:
# Rows annotated as 'Virulence Factor' whose locus tag is in `phenotypes`.
vir_patric = patric[patric['Property'] == 'Virulence Factor']
vir_deg = vir_patric[vir_patric.index.isin(phenotypes.index)]

# Count distinct locus tags rather than rows, so a gene with several
# annotation rows is counted once. This equals len(vir_deg) when no locus tag
# repeats. NOTE(review): presumably the BV-BRC export can list a gene once per
# evidence source — confirm against the file.
n_vir_deg = vir_deg.index.nunique()
# An empty `phenotypes` would otherwise raise ZeroDivisionError.
vir_deg_pct = n_vir_deg / len(phenotypes) * 100 if len(phenotypes) else float('nan')
print("Virulence differentially expressed genes: ", n_vir_deg)
print("Proportion Virulence differentially expressed genes: ", vir_deg_pct)

In [None]:
# Rows annotated as 'Antibiotic Resistance' whose locus tag is in `phenotypes`.
amr_patric = patric[patric['Property'] == 'Antibiotic Resistance']
amr_deg = amr_patric[amr_patric.index.isin(phenotypes.index)]

# Count distinct locus tags rather than rows, so a gene with several
# annotation rows is counted once. This equals len(amr_deg) when no locus tag
# repeats. NOTE(review): presumably the BV-BRC export can list a gene once per
# evidence source — confirm against the file.
n_amr_deg = amr_deg.index.nunique()
# An empty `phenotypes` would otherwise raise ZeroDivisionError.
amr_deg_pct = n_amr_deg / len(phenotypes) * 100 if len(phenotypes) else float('nan')
print("AMR differentially expressed genes: ", n_amr_deg)
print("Proportion of AMR differentially expressed genes: ", amr_deg_pct)

## P. aeruginosa

In [None]:
# Gene table read from 'PA14_genes.txt', indexed by protein_id.
g = pd.read_csv(
    'PA14_genes.txt',
    sep='\t',
    index_col='protein_id',
)
g.head(3)

In [None]:
# Table read from 'DEGs_merged.tsv'; its first column becomes the index.
d = pd.read_csv(
    'DEGs_merged.tsv',
    sep='\t',
    index_col=0,
)
d.head(3)

In [None]:
# Directory holding the per-strain differential expression files.
path = 'DGE/'
# os.listdir returns entries in arbitrary order; sorting makes every later
# loop over `strains` (and the resulting column order) reproducible.
strains = sorted(os.listdir(path))

In [None]:
# Combine the significant z-statistics of every '.csv' file into one table:
# rows are gene ids, one column per file.
full_table = None

# sorted(): the file list may be in arbitrary order, which would otherwise
# make the column order of full_table differ between runs.
for strain in sorted(strains):
    if not strain.endswith('.csv'):
        continue
    df = pd.read_csv(path + strain)
    # keep rows with |z.stats| >= 2 and q.value < 0.0001
    df = df[(df['z.stats'].abs() >= 2) & (df['q.value'] < 0.0001)]
    # the unnamed first column becomes 'gene_id'; the z.stats column is named
    # after the file name without its '.csv' suffix
    df = df.rename(columns={'Unnamed: 0': 'gene_id', 'z.stats': strain[:-4]})
    # drop all irrelevant columns
    df = df.set_index('gene_id').drop(columns=['mu.x', 'mu.y', 'pooled.std.dev', 'p.value', 'q.value'])
    # outer merge on gene_id so a gene significant in any file is kept
    if full_table is None:
        full_table = df
    else:
        full_table = pd.merge(full_table, df, left_index=True, right_index=True, how='outer')

if full_table is None:
    # Fail here with a clear message instead of an AttributeError below.
    raise FileNotFoundError("no '.csv' files found in " + repr(path))

# Reduce every identifier to 'PA14_' plus the text that follows '_PA14_'.
ids = []
for identifier in full_table.index:
    pieces = identifier.split('_PA14_')
    # Identifiers without the '_PA14_' marker are kept unchanged instead of
    # raising IndexError.
    gene_id = 'PA14_' + pieces[1] if len(pieces) > 1 else identifier
    ids.append(gene_id)
full_table.index = ids
full_table

In [None]:
# Work on a transposed copy: rows are strains, columns are genes.
deg = full_table.copy()
deg = deg.T

# number of strains that should meet the criteria (5% of total strains)
# NOTE(review): int() rounds down, so the cut-off is 0 when there are fewer
# than 20 strains and every gene then passes — confirm this is intended.
min_strains = int(0.05 * len(deg.index))

# Filter genes whose absolute value is >= 2 in at least min_strains strains.
# The values are the 'z.stats' columns collected into full_table, not log2
# fold changes.
strain_hits = (deg.abs() >= 2).sum()
deg_filtered = deg.loc[:, strain_hits >= min_strains].copy()

# Add the Count row showing the number of strains meeting the criteria for
# each gene. DataFrame.append was removed in pandas 2.0, so the row is
# attached with pd.concat instead.
counts = (deg_filtered.abs() >= 2).sum().rename('Count')
deg_filtered = pd.concat([deg_filtered, counts.to_frame().T])

# Display the resulting DataFrame
deg_filtered.head(3)

In [None]:
# Read the GPA file
gpa = pd.read_csv('gene_presence_absence.Rtab', sep='\t', index_col=0)

# Modify column names: keep the text between the first and second '-'
gpa.columns = [c.split('-')[1] for c in gpa.columns]

# Transpose the DataFrame
gpa = gpa.T

# Merge with the PA14 gene table `g` (indexed by protein_id) and set
# 'locus_tag' as index. The E. coli table `genes` was used here before; it
# belongs to the previous section of the notebook.
gpa = pd.merge(gpa, g['locus_tag'], left_index=True, right_index=True, how='left').set_index('locus_tag')

# Add a 'total' row holding the sum of every column.
# NOTE(review): gpa.sum() adds up each column, and len(gpa.index) below counts
# rows including the 'total' row itself. Whether the columns are strains or
# genes at this point depends on the orientation of the .Rtab file — TODO
# confirm that the filter below really selects genes present in at least 10%
# of strains.
gpa.loc['total'] = gpa.sum()

# Calculate the minimum number of strains needed to meet the set threshold
min_strains = int(0.10 * len(gpa.index))

# Keep the columns whose 'total' reaches the threshold
gpa_filtered = gpa.loc[:, gpa.loc['total'] >= min_strains].copy()

# Transpose the filtered DataFrame back to its original orientation
gpa_filtered = gpa_filtered.T

# Display the resulting DataFrame
gpa_filtered.head(4)

In [None]:
# Column labels shared by deg_filtered and gpa_filtered.
selected_genes = set(deg_filtered.columns).intersection(gpa_filtered.columns)

In [None]:
# Select the P. aeruginosa rows for the selected genes from `d`
# (DEGs_merged.tsv). `deg_all` was used here before, but that table is built
# from the E. coli DESeq2 output in the previous section.
# NOTE(review): assumes `d` is indexed by the same 'PA14_...' ids as
# selected_genes — TODO confirm.
phenotypes = d[d.index.isin(list(selected_genes))]
phenotypes.head(3)

In [None]:
# PA14 gene table re-keyed by locus_tag. It is built from `g`
# (PA14_genes.txt); `genes` was used here before, which is the E. coli table
# from the previous section.
genes2 = g.copy().reset_index().set_index('locus_tag')
genes2.head(3)

In [None]:
# Look up the protein_id of each gene by locus_tag and use it as the new index.
phenotypes2 = (
    phenotypes
    .merge(genes2['protein_id'], left_index=True, right_index=True, how='left')
    .set_index('protein_id')
)
# Prefix every protein_id with 'cds-'.
phenotypes2.index = 'cds-' + phenotypes2.index

### Differentially expressed *P. aeruginosa* genes annotated as virulence factors or antibiotic resistance genes

In [None]:
# Join the columns of tags.txt onto the selected genes by index and re-index
# the result by its 'old_tag' column.
tags = pd.read_csv('tags.txt', sep='\t', index_col=0)
pheno = (
    phenotypes
    .merge(tags, left_index=True, right_index=True, how='left')
    .set_index('old_tag')
)
pheno

In [None]:
# Table read from 'BVBRC_sp_gene_pa.csv', indexed by its
# 'RefSeq Locus Tag' column.
patric = pd.read_csv(
    'BVBRC_sp_gene_pa.csv',
    index_col='RefSeq Locus Tag',
)
patric.head(3)

In [None]:
# Rows annotated as 'Virulence Factor' ...
vir_patric = patric.loc[patric['Property'] == 'Virulence Factor']
# ... restricted to locus tags that appear in the index of pheno.
vir_deg = vir_patric.loc[vir_patric.index.isin(pheno.index)]
vir_deg

In [None]:
# Count distinct locus tags rather than rows, so a gene with several
# annotation rows is counted once. This equals len(vir_deg) when no locus tag
# repeats. NOTE(review): presumably the BV-BRC export can list a gene once per
# evidence source — confirm against the file.
n_vir_deg = vir_deg.index.nunique()
# An empty `phenotypes` would otherwise raise ZeroDivisionError.
vir_deg_pct = n_vir_deg / len(phenotypes) * 100 if len(phenotypes) else float('nan')
print("Virulence differentially expressed genes: ", n_vir_deg)
print("Proportion Virulence differentially expressed genes: ", vir_deg_pct)

In [None]:
# Rows annotated as 'Antibiotic Resistance' ...
amr_patric = patric.loc[patric['Property'] == 'Antibiotic Resistance']
# ... restricted to locus tags that appear in the index of pheno.
amr_deg = amr_patric.loc[amr_patric.index.isin(pheno.index)]

In [None]:
# Count distinct locus tags rather than rows, so a gene with several
# annotation rows is counted once. This equals len(amr_deg) when no locus tag
# repeats. NOTE(review): presumably the BV-BRC export can list a gene once per
# evidence source — confirm against the file.
n_amr_deg = amr_deg.index.nunique()
# An empty `phenotypes` would otherwise raise ZeroDivisionError.
amr_deg_pct = n_amr_deg / len(phenotypes) * 100 if len(phenotypes) else float('nan')
print("AMR differentially expressed genes: ", n_amr_deg)
print("Proportion of AMR differentially expressed genes: ", amr_deg_pct)