# Summary statistics

In [1]:
#Computation
import warnings
import numpy as np
import pandas as pd
import sklearn as skl
import scipy as sp
import re
import math
import os
import time
from datetime import datetime
import gc

from sklearn import metrics
from sklearn.mixture import GaussianMixture
from collections import defaultdict
from tqdm import tqdm

import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.tools import add_constant
from statsmodels.stats.multitest import multipletests
from Bio import SeqIO

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import ScalarFormatter
from adjustText import adjust_text

#Style
sns.set(style="white")
sns.set_palette('Paired')
%matplotlib inline

In [2]:
# Sub-folder of "data_computed" that holds the pre-computed tables used below
folder_name = "data_processing_240525_1828"

# read computed files
full_path = os.path.join("data_computed", folder_name)


def _read_tsv(file_name):
    """Load one tab-separated table from `full_path` into a DataFrame."""
    return pd.read_csv(os.path.join(full_path, file_name), sep="\t")


dat_omics = _read_tsv("omics.txt")
dat_lrt = _read_tsv("lrt.txt")
dat_vep_omics = _read_tsv("vep_omics.txt")
df_batch_counts = _read_tsv("batch_counts.txt")

In [3]:
# function to read parameters used for data processing from file
def read_parameter(parameter, parameters_path=None):
    """Return the numeric value recorded for `parameter` in a parameters file.

    The file is scanned line by line for ``key: value`` entries. A line whose
    key equals `parameter` exactly wins; if no such line exists, the first
    ``key: value`` line that merely contains `parameter` is used instead, which
    mirrors the original substring lookup.

    Parameters
    ----------
    parameter : str
        Name of the parameter to look up.
    parameters_path : str, optional
        File to read. Defaults to "parameters.txt" inside the notebook-level
        `full_path` folder.

    Returns
    -------
    float
        The value stored for the parameter.

    Raises
    ------
    KeyError
        If no line of the file provides the parameter.
    ValueError
        If the stored value cannot be converted to float.
    """
    if parameters_path is None:
        parameters_path = os.path.join(full_path, "parameters.txt")

    substring_match = None
    with open(parameters_path, 'r') as f:
        for line in f:
            if parameter not in line:
                continue
            # Split on the first colon only, so "key:value" and "key: value" both parse
            key, separator, value = line.partition(':')
            if not separator:
                # The name is mentioned, but this is not a "key: value" entry
                continue
            if key.strip() == parameter:
                return float(value.strip())
            if substring_match is None:
                # Remember the first loose match in case no exact key exists
                substring_match = value.strip()

    if substring_match is not None:
        return float(substring_match)
    # Fail loudly instead of handing None to downstream comparisons
    raise KeyError(f"Parameter '{parameter}' not found in {parameters_path}")

In [4]:
# Settings recorded in parameters.txt, read in a fixed order
min_pdockq, correlation_cutoff, fdr_alpha = (
    read_parameter(name)
    for name in ("min_pdockq", "correlation_cutoff", "fdr_alpha")
)

# Echo the values between two separator rules
separator_rule = "-------------------------------------------------"
print(
    separator_rule,
    "Current parameter values:",
    f"Min. pDockQ: {min_pdockq}",
    f"Min. RNA-Px correlation: {correlation_cutoff}",
    f"FDR alpha: {fdr_alpha}",
    separator_rule,
    sep="\n",
)

-------------------------------------------------
Current parameter values:
Min. pDockQ: 0.23
Min. RNA-Px correlation: -1.0
FDR alpha: 0.01
-------------------------------------------------


In [5]:
#-------------------------------------------------------- PARAMETER --------------------------------------------------------#
# re-define correlation_cutoff for analysis
# NOTE: this overrides the correlation_cutoff value read from parameters.txt;
# the cells below filter dat_vep_omics on 'corr_value_tx_value_px' using this value.
correlation_cutoff = 0.3
#---------------------------------------------------------------------------------------------------------------------------#

## STABILITY

In [6]:
# Distinct samples, genes and batches present in the omics table
n_samples, n_genes, n_batches = (
    dat_omics[column].nunique() for column in ('sample', 'gene', 'batch')
)

# Number of dat_vep_omics rows per sample, and the median of those counts
mutations_per_sample = (
    dat_vep_omics
    .groupby('sample')
    .size()
    .reset_index(name='mutations_count')
)
median_ms_mutations_per_sample = mutations_per_sample['mutations_count'].median()

In [7]:
# Report the cohort summary, one statistic per line
print(
    f'Number of multi-omics samples: {n_samples}',
    f'Number of genes with >25% multi-omics coverage: {n_genes}',
    f'Median missense mutations per sample: {int(median_ms_mutations_per_sample)}',
    f'Number of tumor types: {n_batches}',
    sep='\n',
)

Number of multi-omics samples: 644
Number of genes with >25% multi-omics coverage: 8791
Median missense mutations per sample: 17
Number of tumor types: 6


In [8]:
# NOTE: this cell rebinds dat_vep_omics, so running it a second time without
# reloading the data reports 0 discarded genes.
n_genes_before_correlation_cutoff = dat_vep_omics['gene'].nunique()

# Only consider genes with some level of RNA-protein concordance
passes_correlation_cutoff = dat_vep_omics['corr_value_tx_value_px'].gt(correlation_cutoff)
dat_vep_omics = dat_vep_omics.loc[passes_correlation_cutoff]

n_genes_after_correlation_cutoff = dat_vep_omics['gene'].nunique()

# Genes that no longer appear in the table after filtering
n_genes_discarded = n_genes_before_correlation_cutoff - n_genes_after_correlation_cutoff

print(f"Number of genes discarded for VEP/stability analysis due to low RNA-Px correlation: {n_genes_discarded}")

Number of genes discarded for VEP/stability analysis due to low RNA-Px correlation: 1465


In [9]:
# Drop the few "stabilizing" FoldX calls (rows with a missing FoldX_cat are kept here)
not_stabilizing = dat_vep_omics['FoldX_cat'].ne('stabilizing')
dat_vep_omics = dat_vep_omics.loc[not_stabilizing]

# Keep only rows where both FoldX and AlphaMissense categories are present,
# then count rows for every (FoldX_cat, AM_cat) combination
dat_vep_cats = dat_vep_omics[['FoldX_cat', 'AM_cat']].dropna()
vep_counts = (
    dat_vep_cats
    .groupby(['FoldX_cat', 'AM_cat'])
    .size()
    .reset_index(name='count')
)
n_variants_analysed = vep_counts['count'].sum()
print(f"Number of variants analysed: {n_variants_analysed}")

Number of variants analysed: 11802


In [10]:
# Row masks over the (FoldX_cat, AM_cat) count table
is_destabilizing = vep_counts['FoldX_cat'] == 'destabilizing'
is_pathogenic = vep_counts['AM_cat'] == 'pathogenic'

destabilizing_variants = vep_counts.loc[is_destabilizing, 'count'].sum()
pathogenic_variants = vep_counts.loc[is_pathogenic, 'count'].sum()

# Use .sum() rather than .iloc[0]: if the pathogenic/destabilizing combination
# is absent from vep_counts the selection is empty, and .sum() yields 0 where
# .iloc[0] would raise IndexError. With the combination present the value is
# unchanged because the groupby produces at most one row per combination.
pathogenic_and_destabilizing = vep_counts.loc[is_pathogenic & is_destabilizing, 'count'].sum()

# Conditional shares, expressed as percentages
pathogenic_given_destabilizing = 100*pathogenic_and_destabilizing/destabilizing_variants
destabilizing_given_pathogenic = 100*pathogenic_and_destabilizing/pathogenic_variants

In [11]:
# Report both conditional shares rounded to one decimal place
print(
    f'P(pathogenic|destabilizing) = {round(pathogenic_given_destabilizing, 1)}%',
    f'P(destabilizing|pathogenic) = {round(destabilizing_given_pathogenic, 1)}%',
    sep='\n',
)

P(pathogenic|destabilizing) = 69.4%
P(destabilizing|pathogenic) = 42.9%


## pQTLs

In [12]:
# Work on a copy so the loaded dat_lrt table is left untouched
dat_lrt_copy = dat_lrt.copy()

# The four category labels: LRT significance at fdr_alpha x sign of the CNV beta
label_pos_s = f'p < {fdr_alpha} / CNV Beta > 0'
label_pos_ns = 'n.s. / CNV Beta > 0'
label_neg_s = f'p < {fdr_alpha} / CNV Beta < 0'
label_neg_ns = 'n.s. / CNV Beta < 0'

order_list = [label_pos_s, label_pos_ns, label_neg_s, label_neg_ns]

is_significant = (dat_lrt_copy['p_LRT_adj'] < fdr_alpha).to_numpy()
is_positive = (dat_lrt_copy['param_value_cnv_B'] > 0).to_numpy()
is_negative = (dat_lrt_copy['param_value_cnv_B'] < 0).to_numpy()

# np.select takes the first matching condition. Every positive-beta row that is
# not significant is labelled "n.s. / CNV Beta > 0", including rows where
# p_LRT_adj equals fdr_alpha or is NaN; the previous chained conditional tested
# `p > alpha` and let those rows fall through to "n.s. / CNV Beta < 0".
# Rows whose beta is neither > 0 nor < 0 keep the previous fallback label.
dat_lrt_copy['beta_sig'] = np.select(
    [is_significant & is_positive, is_positive, is_significant & is_negative],
    [label_pos_s, label_pos_ns, label_neg_s],
    default=label_neg_ns,
)

# Number of rows in each category (0 when a category does not occur)
beta_sig_counts = dat_lrt_copy['beta_sig'].value_counts()
n_pos_ns = int(beta_sig_counts.get(label_pos_ns, 0))
n_pos_s = int(beta_sig_counts.get(label_pos_s, 0))
n_neg_ns = int(beta_sig_counts.get(label_neg_ns, 0))
n_neg_s = int(beta_sig_counts.get(label_neg_s, 0))

In [13]:
# Share of significant CNV associations whose beta is positive, in percent
pct_positive_beta = round(100*n_pos_s/(n_pos_s+n_neg_s), 1)

print(f"Number of interactions with controlling/controlled behavior at FDR {fdr_alpha} : {n_pos_s}")
print(f"Of all interactions with a significant CNV association, {pct_positive_beta} had a positive Beta")

Number of interactions with controlling/controlled behavior at FDR 0.01 : 727
Of all interactions with a significant CNV association, 82.0 had a positive Beta
