In [1]:
import pandas as pd
import os 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp 
import numpy as np
from scipy import stats

In [2]:
strains = pd.read_csv('mystrains.tsv', sep = '\t')
strains.head(2)

Unnamed: 0,Strain Identifier,Strain name,Evolution Experiment?,Parental Strain,Alternative Strain Name,Full Strain Name After Genome Analysis,ENA Accession,Alternative Genome Source,Reference (PMID),Isolation origin,Year,Phenotype,German BSL level,Notes
0,NT12447,B367,0,,KC329,Escherichia coli B367,,Broad institute Antibiotic resistance collection,,Australian magpie Coffs,2001,Commensal strain,1.0,
1,NT12619,S17,0,,,Escherichia coli S17,PRJNA189395,,23516222.0,Broiler chick with septicemia,2013,Pathogenic strain,2.0,APEC


In [None]:
deg = pd.read_csv('deseqoutput_protein-ids.tsv', sep = '\t', index_col = 0).T
deg.head(3)

gene-id,cds-NP_414543.1,cds-NP_414544.1,cds-NP_414545.1,cds-NP_414546.1,cds-NP_414547.1,cds-NP_414548.1,cds-NP_414549.1,cds-NP_414550.1,cds-NP_414551.1,cds-NP_414552.1,...,cds-NP_418811.2,cds-NP_418812.1,cds-NP_418813.1,cds-NP_418814.1,cds-NP_418815.1,cds-NP_418816.1,cds-NP_418817.1,cds-NP_418818.1,cds-NP_418819.1,cds-NP_418820.1
NT12189,-1.396155,-1.086905,-1.036221,-1.127845,-0.170443,0.15418,0.743706,0.231421,-1.702605,2.666265,...,-0.026376,0.058862,0.081481,0.324293,0.11135,0.357424,3.099915,0.239958,0.529843,0.288563
NT12177,-2.021794,-1.48757,-1.481607,-0.445211,0.23606,-0.659636,-0.199121,0.661353,-0.481898,0.949621,...,-0.682059,-0.349514,-0.762992,-0.336101,-0.088115,0.407631,0.086031,0.108566,0.537271,0.573327
NT12130,-0.956499,-0.586719,-0.499014,-1.345601,0.244311,-0.234788,0.440232,0.620264,-1.095199,0.79631,...,0.57422,0.06133,-0.116296,0.051911,0.330066,0.430785,-0.603411,0.244223,-0.13819,0.10736


In [6]:
path = 'tested_patterns/'
full_table = None

for gene in os.listdir(path):
    if gene.startswith('cds'):
        name = gene.split('-')[1]
        name = 'cds-' + name
        
        #get hashed-patterns used in association analysis
        association = pd.read_csv(path + gene, sep = '\t', index_col = 0, names = ['variant', 'lrt-pvalue', 'beta', 'k-samples'])
       
        #get the representative kmer for the hased pattern
        hashes = pd.read_csv('panfeed-renamed-fixed/' + name + '/kmers_to_hashes.tsv.gz', compression = 'gzip', sep = '\t', index_col = 2)
    
        kmers = pd.read_csv('panfeed-renamed-fixed/' + name + '/kmers.tsv.gz', compression = 'gzip', sep = '\t').set_index('k-mer')
        kmers['strain'] = [k.split('_')[0] for k in kmers['strain']] # rename strains so they match those in the deg, and strain file for downstream analysis
        
        p = pd.merge(association, hashes, left_index = True, right_index = True, how = 'inner')
        
        p2 = kmers[kmers.index.isin(p['k-mer'])]
        
        #get predicted transcription rates for each gene kmer and strain
        file = name + '.tsv.gz'
        rates_files = 'tx-panfeed_230/'
        if file in os.listdir(rates_files):
            rates = pd.read_csv(rates_files + file , sep = '\t', compression = 'gzip')
            rates['strain'] = [r.split('_')[0] for r in rates.strain]
            pr = pd.merge(p2, rates, on = 'strain', how = 'inner')

            #select kmer that position is closest to the position of the predicted promoter and predicted tx_rate
            pr['diff_start'] = pr['contig_start'] - pr['start']
            pr['diff_end'] = pr['contig_end'] - pr['end']

            #Group by 'strain' and deduce the minimum difference between kmers's start/end position and and that of the predicted promoter's start/end position, and then get the maximum predicted tx_rate
            #this will return only tx_rates for strains that contain the gene kmer
            tx = pr.groupby('strain').agg({ 'diff_start': 'min',
                                           'diff_end': 'min',
                                           'Tx_rate': 'max'}).reset_index()
            tx = tx[['strain', 'Tx_rate']]

            #get gene's predicted transcription rate for strains not present tx

            tx2 = rates[~rates.strain.isin(tx.strain)]
            tx2 = tx2.groupby('strain')['Tx_rate'].max().reset_index()
            tx2 = tx2[['strain', 'Tx_rate']]

            #merge the two dataframes to get one single dataframe
            tx_r = pd.concat([tx, tx2]).rename(columns = {'Tx_rate' : gene.split('.tsv')[0]})

            # if tx_r.empty:
            #     continue

            if full_table is None:
                full_table = tx_r
            else:
                full_table = pd.merge(full_table, tx_r, on='strain', how='outer')
full_table       

Unnamed: 0,strain,cds-NP_418030.1-793,cds-NP_417082.1-613,cds-NP_415117.2-501,cds-NP_416726.1-2615,cds-NP_417082.1-607,cds-NP_416801.2-1017,cds-NP_416770.1-4,cds-YP_026226.4-1448,cds-NP_416607.1-2059,...,cds-NP_415537.1-3644,cds-NP_416607.1-2068,cds-NP_416669.1-882,cds-YP_588463.1-2320,cds-NP_415708.1-3261,cds-NP_415592.1-4200,cds-NP_415129.1-35,cds-NP_415592.1-4214,cds-NP_416770.1-9,cds-NP_416834.1-4027
0,NT12001,2461.616157,13023.185487,12976.030829,1166.672266,13023.185487,2603.708627,3025.464313,1518.174637,1989.463318,...,1929.166732,1989.463318,1789.871250,2922.763810,2472.972260,1056.406243,3780.695182,1056.406243,3025.464313,2472.846772
1,NT12002,2461.616157,13023.185487,12976.030829,1166.672266,13023.185487,2603.708627,3025.464313,1518.174637,1989.463318,...,1929.166732,1989.463318,1789.871250,2922.763810,2472.972260,1056.406243,3780.695182,1056.406243,3025.464313,2472.846772
2,NT12004,2461.616157,13023.185487,12976.030829,1166.672266,13023.185487,2603.708627,3025.464313,1518.174637,1989.463318,...,1929.166732,1989.463318,1789.871250,2922.763810,2472.972260,1056.406243,3780.695182,1056.406243,3025.464313,2472.846772
3,NT12012,2461.616157,3000.928338,12976.030829,1166.672266,3000.928338,2651.387448,3025.464313,1677.181399,1989.463318,...,1929.166732,1989.463318,2078.272431,,2472.972260,1056.406243,3780.695182,1056.406243,3025.464313,2472.846772
4,NT12015,2461.616157,3132.884198,12976.030829,,3132.884198,2699.960566,3025.464313,1518.174637,1898.736786,...,1929.166732,1898.736786,2074.629602,,2472.972260,1056.406243,3780.695182,1056.406243,3025.464313,2472.846772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,NT12460,2283.165598,4223.525815,12976.030829,1166.592948,4223.525815,2743.978833,1613.648944,1518.174637,2806.113701,...,1846.500268,2806.113701,2078.272431,,2512.953531,1171.587858,3441.374804,1171.587858,1613.648944,2472.846772
113,NT12533,3771.499213,3684.816251,12976.030829,1166.672266,3684.816251,2743.978833,1663.957144,1518.174637,2851.480967,...,1846.500268,2851.480967,2078.272431,3673.059652,2575.729013,1171.587858,3538.134647,1171.587858,1663.957144,2472.846772
114,NT12552,3771.499213,4061.321551,12976.030829,1166.672266,4061.321551,2699.960566,1663.957144,1518.174637,2851.480967,...,1846.500268,2851.480967,2078.272431,,2512.953531,1171.587858,4338.663347,1171.587858,1663.957144,2472.846772
115,NT12564,2610.289355,3547.091838,12976.030829,1446.183168,3547.091838,2614.730877,3025.464313,1518.174637,2077.672606,...,2386.595383,2077.672606,2175.242841,4364.725987,2512.635765,1095.092977,3627.096143,1095.092977,3025.464313,4034.390926


In [6]:
#correlation between log2Foldchange and predicted transcription rates 
#tx_rates of each divided by that of k12 and then log2 transformed.
tx_rates_filtered = full_table.copy()
df_new = np.log2(tx_rates_filtered/tx_rates_filtered.loc['NT12001'])
df_new.head(4)

Unnamed: 0_level_0,cds-NP_417082.1-613,cds-NP_417082.1-607,cds-NP_414578.2-75,cds-NP_418204.2-940,cds-NP_414880.2-2266,cds-NP_417420.1-2778,cds-NP_418208.1-940,cds-NP_417420.1-2793,cds-NP_417420.1-2787,cds-NP_416773.1-1254,...,cds-NP_418440.1-1100,cds-NP_418554.1-1855,cds-NP_418618.1-1642,cds-NP_418686.1-718,cds-NP_414578.2-87,cds-NP_414578.2-93,cds-NP_414578.2-78,cds-NP_417082.1-622,cds-NP_417082.1-636,cds-YP_588463.1-2320
strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NT12001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NT12002,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NT12004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NT12012,-2.117602,-2.117602,0.0,0.318827,-1.464291,1.2e-05,0.0,1.2e-05,1.2e-05,0.0,...,0.152345,0.0,-0.960852,0.476826,0.0,0.0,0.0,-2.117602,-2.117602,


In [8]:
import warnings 

pearson = {}
spearman = {}
regress = {}

def safe_correlation(x, y):
    if len(set(x)) == 1 or len(set(y)) == 1:
        return 0, 1  # Return correlation of 0 with p-value of 1 for constant input
    return sp.stats.pearsonr(x, y)

def safe_regression(x, y):
    if len(set(x)) == 1 or len(set(y)) == 1:
        return [0, 0, 0, 1, 0]  # Return dummy values for constant input
    return stats.linregress(x, y)

for var in df_new.columns:
    g_var = 'cds-' + var.split('-')[1]
    var_df = f_df[f_df['gene_name'] == var]
    samples = set(var_df['k-samples'].str.cat(sep=',').split(','))
    
    if g_var in deg.columns:
        # Merge predicted transcription rate and actual expression
        d = pd.merge(df_new[var], deg[g_var], left_index=True, right_index=True, how='inner').dropna()
        
        # Filter to include only strains with the variant
        d_variant = d[d.index.isin(samples)]
        
        if d_variant.empty:
            print(f"No data points with variant for {var}. Skipping.")
            continue
        
        d_variant.columns = ['Predicted transcription rate', 'log2foldchange']
        
        # Calculate correlations and regression
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            p, pp = safe_correlation(d_variant['Predicted transcription rate'], d_variant['log2foldchange'])
            s, ps = safe_correlation(sp.stats.rankdata(d_variant['Predicted transcription rate']), 
                                     sp.stats.rankdata(d_variant['log2foldchange']))
            slope, intercept, r_value, p_value, std_err = safe_regression(d_variant['Predicted transcription rate'], 
                                                                          d_variant['log2foldchange'])
        
        pearson[var] = round(p, 2), pp
        spearman[var] = round(s, 2), ps
        regress[var] = [round(r_value, 2), p_value]
        
        # Create scatter plot
        plt.figure(figsize=(5, 4))
        plt.rcParams.update({'font.size': 14})
        sns.scatterplot(data=d_variant, x='Predicted transcription rate', y='log2foldchange')
        
        if len(set(d_variant['Predicted transcription rate'])) > 1 and len(set(d_variant['log2foldchange'])) > 1:
            sns.regplot(data=d_variant, x='Predicted transcription rate', y='log2foldchange', 
                        scatter=False, 
                        label=f'r={p:.2f}')
        else:
            plt.plot([], [], color='red', label='Constant input')
        
        plt.ylabel( '$\log_2$(FC)\n(Actual Expression)')
        plt.xlabel('Predicted transcription rate\nrelative to $\it{E. coli}$ K-12')
        plt.title(f"{g_var}")
        plt.tight_layout()
        plt.legend()
        plt.savefig(f'corr-plots_var_present/{var}_variant_only.svg', dpi=300)
        plt.close()

print("Analysis complete. Check the 'corr-plots' directory for the generated plots.")

Analysis complete. Check the 'corr-plots' directory for the generated plots.


Comparing predicted expression rates of strains with the variants vs those without the variants

In [8]:

pearson = {}
spearman = {}
regress = {}

for var in df_new.columns:
    g_var = 'cds-' + var.split('-')[1]
    var_df = f_df[f_df['gene_name'] == var]
    samples = set(var_df['k-samples'].str.cat(sep=',').split(','))
    
    if g_var in deg.columns:
        # Merge predicted transcription rate and actual expression
        d = pd.merge(df_new[var], deg[g_var], left_index=True, right_index=True, how='inner').dropna()
        
        # Add a column to indicate variant presence
        d['variant_present'] = d.index.isin(samples).astype(int)
        d.columns = ['Predicted transcription rate', 'log2foldchange', 'variant_present']
        
        # Separate dataframes for variant present and absent
        d_variant = d[d['variant_present'] == 1]
        d_no_variant = d[d['variant_present'] == 0]
        
        # Calculate correlations and regression for both cases
        for case, data in [("Variant", d_variant), ("No Variant", d_no_variant)]:
            if not data.empty:
                p, pp = sp.stats.pearsonr(data['Predicted transcription rate'], data['log2foldchange'])
                s, ps = sp.stats.spearmanr(data['Predicted transcription rate'], data['log2foldchange'])
                slope, intercept, r_value, p_value, std_err = stats.linregress(data['Predicted transcription rate'], data['log2foldchange'])
                
                pearson[f"{var}_{case}"] = round(p, 2), pp
                spearman[f"{var}_{case}"] = round(s, 2), ps
                regress[f"{var}_{case}"] = [round(r_value, 2), p_value]
        
        # Create scatter plot
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=d, x='Predicted transcription rate', y='log2foldchange', 
                        hue='variant_present', palette={0: 'blue', 1: 'red'}, alpha=0.7)
        
        # Add regression lines for both cases
        for case, data, color in [("Variant", d_variant, 'red'), ("No Variant", d_no_variant, 'blue')]:
            if not data.empty:
                sns.regplot(data=data, x='Predicted transcription rate', y='log2foldchange', 
                            scatter=False, color=color, label=f'{case} (r={pearson[f"{var}_{case}"][0]:.2f}, p={pearson[f"{var}_{case}"][1]:.3f})')
        
        plt.ylabel('log2foldchange (Actual Expression)')
        plt.xlabel('Predicted transcription rate')
        plt.title(f"{g_var}")
        plt.legend(title='Variant present')
        plt.tight_layout()
        plt.savefig(f'corr-plots/{var}_comparison.svg', dpi=300)
        plt.close()

print("Analysis complete. Check the 'corr-plots' directory for the generated plots.")

  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
  intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)
  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
  intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)
  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
  intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)
  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
  intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)
  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
  intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)
  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
  intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)
  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
  

Analysis complete. Check the 'corr-plots' directory for the generated plots.


In [9]:
#save computed correlation values to dataframe for downstream analysis
df_p = pd.DataFrame(pearson).T.rename(columns = {0: 'pearson', 1: 'pearson(p-value)'})
df_sp = pd.DataFrame(spearman).T.rename(columns = {0:'spearman', 1:'spearman(p-value)'})
linregress = pd.DataFrame(regress).T.rename(columns = {0: 'r_value', 1: 'regress(p-value)'})    

In [11]:
outcomes = {}
for i in tx_rates.columns:
    name = i + '.tsv'
    a = pd.read_csv('/Users/bamudamaris/ownCloud2/model/ecoli/validation_2024/tested_patterns/' + name,  sep = '\t', index_col = 0, names = ['pattern', 'lrt-pvalue', 'beta', 'k-samples'])
    a['assigned'] = a['lrt-pvalue'].apply(lambda x: 1 if x <= 6.95E-06 else 0)
    outcomes[i] = a['assigned'].values.tolist()
outcome = pd.DataFrame(outcomes).T.rename(columns = {0: 'association'}) 
outcome

Unnamed: 0,association
cds-NP_418030.1-793,0
cds-NP_417082.1-613,0
cds-NP_415117.2-501,0
cds-NP_416726.1-2615,0
cds-NP_417082.1-607,0
...,...
cds-NP_415592.1-4200,0
cds-NP_415129.1-35,0
cds-NP_415592.1-4214,0
cds-NP_416770.1-9,0


In [3]:
#combine dataframes
# prediction = pd.concat([df_p, df_sp, linregress, outcome], axis = 1).dropna()
# prediction['association'] = prediction['association'].astype(int)
prediction = pd.read_csv('asso-correlation-panfeed230-sig.tsv', sep = '\t', index_col = 0)
prediction

Unnamed: 0,pearson,pearson(p-value),spearman,spearman(p-value),r_value,regress(p-value),association,adjusted_p_values
cds-NP_414578.2-75,-0.01,8.981294e-01,0.02,8.151193e-01,-0.01,8.981294e-01,1,9.478283e-01
cds-NP_418204.2-940,-0.04,6.972647e-01,-0.04,6.448540e-01,-0.04,6.972647e-01,1,8.237597e-01
cds-NP_414880.2-2266,0.07,5.270828e-01,0.01,9.302139e-01,0.07,5.270828e-01,1,6.966887e-01
cds-NP_418208.1-940,-0.07,4.455788e-01,-0.06,5.351611e-01,-0.07,4.455788e-01,1,6.387368e-01
cds-NP_416773.1-1254,0.16,1.462598e-01,0.21,5.638126e-02,0.16,1.462598e-01,1,3.137891e-01
...,...,...,...,...,...,...,...,...
cds-NP_418085.2-2294,0.33,4.280707e-02,0.31,5.926000e-02,0.33,4.280707e-02,1,1.465319e-01
cds-NP_417610.1-1604,0.69,9.906809e-18,0.74,4.008327e-21,0.69,9.906809e-18,1,1.889370e-16
cds-NP_418554.1-1855,-0.37,6.530199e-05,-0.21,2.683375e-02,-0.37,6.530199e-05,1,4.712333e-04
cds-NP_418686.1-718,-0.04,7.416543e-01,-0.04,7.280097e-01,-0.04,7.416543e-01,1,8.488176e-01


Focus on significantly associated variants only 

In [4]:
#perform multiple test corrections

import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests

sig = prediction[prediction['association'] == 1]


# Perform multiple test correlation using Benjamini-Hochberg (FDR) correction
sig['adjusted_p_values'] = multipletests(sig['pearson(p-value)'], method='fdr_bh')[1]

#sig.to_csv('asso-correlation-panfeed230-sig.tsv', sep = '\t')

sig


Unnamed: 0,pearson,pearson(p-value),spearman,spearman(p-value),r_value,regress(p-value),association,adjusted_p_values
cds-NP_414578.2-75,-0.01,8.981294e-01,0.02,8.151193e-01,-0.01,8.981294e-01,1,9.478283e-01
cds-NP_418204.2-940,-0.04,6.972647e-01,-0.04,6.448540e-01,-0.04,6.972647e-01,1,8.237597e-01
cds-NP_414880.2-2266,0.07,5.270828e-01,0.01,9.302139e-01,0.07,5.270828e-01,1,6.966887e-01
cds-NP_418208.1-940,-0.07,4.455788e-01,-0.06,5.351611e-01,-0.07,4.455788e-01,1,6.387368e-01
cds-NP_416773.1-1254,0.16,1.462598e-01,0.21,5.638126e-02,0.16,1.462598e-01,1,3.137891e-01
...,...,...,...,...,...,...,...,...
cds-NP_418085.2-2294,0.33,4.280707e-02,0.31,5.926000e-02,0.33,4.280707e-02,1,1.465319e-01
cds-NP_417610.1-1604,0.69,9.906809e-18,0.74,4.008327e-21,0.69,9.906809e-18,1,1.889370e-16
cds-NP_418554.1-1855,-0.37,6.530199e-05,-0.21,2.683375e-02,-0.37,6.530199e-05,1,4.712333e-04
cds-NP_418686.1-718,-0.04,7.416543e-01,-0.04,7.280097e-01,-0.04,7.416543e-01,1,8.488176e-01


In [6]:
thresholds = [0.001,0.002, 0.003, 0.004, 0.005, 0.0001, 0.01, 0.05]  

total_var = len(sig)

for threshold in thresholds:
    significant_correlations = sig[(sig['pearson'] > 0) & (sig['adjusted_p_values'] <= threshold)]
    num_significant = len(significant_correlations)
    percentage = (num_significant / total_var) * 100
    print(f"For threshold {threshold}, number of significant correlations is: {num_significant} ({percentage:.2f}%)")


For threshold 0.001, number of significant correlations is: 22 (8.24%)
For threshold 0.002, number of significant correlations is: 22 (8.24%)
For threshold 0.003, number of significant correlations is: 25 (9.36%)
For threshold 0.004, number of significant correlations is: 25 (9.36%)
For threshold 0.005, number of significant correlations is: 25 (9.36%)
For threshold 0.0001, number of significant correlations is: 21 (7.87%)
For threshold 0.01, number of significant correlations is: 26 (9.74%)
For threshold 0.05, number of significant correlations is: 34 (12.73%)


In [None]:
sig['gene'] = ['cds-' + g.split('-')[1] for g in sig.index]
sig

Unnamed: 0,pearson,pearson(p-value),spearman,spearman(p-value),r_value,regress(p-value),association,adjusted_p_values,gene
cds-NP_414578.2-75,-0.01,8.981294e-01,0.02,8.151193e-01,-0.01,8.981294e-01,1,9.478283e-01,cds-NP_414578.2
cds-NP_418204.2-940,-0.04,6.972647e-01,-0.04,6.448540e-01,-0.04,6.972647e-01,1,8.237597e-01,cds-NP_418204.2
cds-NP_414880.2-2266,0.07,5.270828e-01,0.01,9.302139e-01,0.07,5.270828e-01,1,6.966887e-01,cds-NP_414880.2
cds-NP_418208.1-940,-0.07,4.455788e-01,-0.06,5.351611e-01,-0.07,4.455788e-01,1,6.387368e-01,cds-NP_418208.1
cds-NP_416773.1-1254,0.16,1.462598e-01,0.21,5.638126e-02,0.16,1.462598e-01,1,3.137891e-01,cds-NP_416773.1
...,...,...,...,...,...,...,...,...,...
cds-NP_418085.2-2294,0.33,4.280707e-02,0.31,5.926000e-02,0.33,4.280707e-02,1,1.465319e-01,cds-NP_418085.2
cds-NP_417610.1-1604,0.69,9.906809e-18,0.74,4.008327e-21,0.69,9.906809e-18,1,1.889370e-16,cds-NP_417610.1
cds-NP_418554.1-1855,-0.37,6.530199e-05,-0.21,2.683375e-02,-0.37,6.530199e-05,1,4.712333e-04,cds-NP_418554.1
cds-NP_418686.1-718,-0.04,7.416543e-01,-0.04,7.280097e-01,-0.04,7.416543e-01,1,8.488176e-01,cds-NP_418686.1


In [None]:
data = sig.copy()
data.head(3)

Unnamed: 0,pearson,pearson(p-value),spearman,spearman(p-value),r_value,regress(p-value),association,adjusted_p_values,gene
cds-NP_414578.2-75,-0.01,0.898129,0.02,0.815119,-0.01,0.898129,1,0.947828,cds-NP_414578.2
cds-NP_418204.2-940,-0.04,0.697265,-0.04,0.644854,-0.04,0.697265,1,0.82376,cds-NP_418204.2
cds-NP_414880.2-2266,0.07,0.527083,0.01,0.930214,0.07,0.527083,1,0.696689,cds-NP_414880.2


In [6]:
data = sig.copy()

In [7]:
# Total number of correlations
count = data.shape[0]

# No correlation
no_corr_df = data[data['pearson'] == 0.0]
no_correlation = no_corr_df.shape[0]

# Weak positive (0 < r ≤ 0.2)
weak_pos_df = data[(data['pearson'] > 0.0) & (data['pearson'] <= 0.2)]
weak_pos = weak_pos_df.shape[0]

# Weak negative (-0.2 ≤ r < 0)
weak_neg_df = data[(data['pearson'] < 0.0) & (data['pearson'] >= -0.2)]
weak_neg = weak_neg_df.shape[0]

# Moderate positive (0.2 < r < 0.8)
moderate_pos_df = data[(data['pearson'] > 0.2) & (data['pearson'] < 0.8)]
moderate_pos = moderate_pos_df.shape[0]

# Moderate negative (-0.8 < r < -0.2)
moderate_neg_df = data[(data['pearson'] < -0.2) & (data['pearson'] > -0.8)]
moderate_neg = moderate_neg_df.shape[0]

# Strong positive (r ≥ 0.8)
strong_pos_df = data[data['pearson'] >= 0.8]
strong_pos = strong_pos_df.shape[0]

# Strong negative (r ≤ -0.8)
strong_neg_df = data[data['pearson'] <= -0.8]
strong_neg = strong_neg_df.shape[0]

# Print summary with direction
print(f"No correlation (r = 0.0): {no_correlation} ({no_correlation/count*100:.2f}%)")
print(f"Weak positive (0 < r ≤ 0.2): {weak_pos} ({weak_pos/count*100:.2f}%)")
print(f"Weak negative (-0.2 ≤ r < 0): {weak_neg} ({weak_neg/count*100:.2f}%)")
print(f"Moderate positive (0.2 < r < 0.8): {moderate_pos} ({moderate_pos/count*100:.2f}%)")
print(f"Moderate negative (-0.8 < r < -0.2): {moderate_neg} ({moderate_neg/count*100:.2f}%)")
print(f"Strong positive (r ≥ 0.8): {strong_pos} ({strong_pos/count*100:.2f}%)")
print(f"Strong negative (r ≤ -0.8): {strong_neg} ({strong_neg/count*100:.2f}%)")

# Supervisor’s key interest: total counts >0.2 and < -0.2
pos_corr = data[data['pearson'] > 0.2].shape[0]
neg_corr = data[data['pearson'] < -0.2].shape[0]

print(f"\nCorrelations > 0.2: {pos_corr} ({pos_corr/count*100:.2f}%)")
print(f"Correlations < -0.2: {neg_corr} ({neg_corr/count*100:.2f}%)")


No correlation (r = 0.0): 5 (1.87%)
Weak positive (0 < r ≤ 0.2): 83 (31.09%)
Weak negative (-0.2 ≤ r < 0): 92 (34.46%)
Moderate positive (0.2 < r < 0.8): 39 (14.61%)
Moderate negative (-0.8 < r < -0.2): 41 (15.36%)
Strong positive (r ≥ 0.8): 7 (2.62%)
Strong negative (r ≤ -0.8): 0 (0.00%)

Correlations > 0.2: 46 (17.23%)
Correlations < -0.2: 41 (15.36%)
