# Description

It lists the most significant LV-trait associations for asthma, COPD and ACO.

# Modules

In [1]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from IPython.display import HTML

import conf

# Settings

In [2]:
INPUT_DIR = conf.PROJECTS["ASTHMA_COPD"]["RESULTS_DIR"] / "gls_phenoplier"
display(INPUT_DIR)
assert INPUT_DIR.exists()

PosixPath('/opt/data/projects/asthma-copd/results/gls_phenoplier')

# Load data

## MultiPLIER summary

In [21]:
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [22]:
multiplier_model_summary.shape

(2157, 5)

In [23]:
multiplier_model_summary.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,KEGG_LYSINE_DEGRADATION,1,0.388059,0.866078,0.956005
2,REACTOME_MRNA_SPLICING,1,0.733057,4.8e-05,0.000582
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628,0.011366
4,KEGG_DNA_REPLICATION,1,0.549473,0.312155,0.539951
5,PID_MYC_ACTIVPATHWAY,1,0.639303,0.021702,0.083739


## GLS associations for Asthma/COPD

In [3]:
input_filepath = INPUT_DIR / "gls-summary.pkl.gz"
assert input_filepath.exists()

In [4]:
gls_asthma_copd = pd.read_pickle(input_filepath)

In [5]:
gls_asthma_copd.shape

(2961, 6)

In [6]:
gls_asthma_copd.head()

Unnamed: 0,lv,beta,beta_se,pvalue,phenotype,fdr
0,LV247,0.667501,0.126759,7.200244e-08,GWAS_COPD_only_GLM_SNPs_info0.7-,0.000107
1,LV300,0.620979,0.125634,3.948175e-07,GWAS_COPD_only_GLM_SNPs_info0.7-,0.000234
2,LV180,0.590544,0.126656,1.592848e-06,GWAS_COPD_only_GLM_SNPs_info0.7-,0.000619
3,LV696,0.49731,0.125236,3.617882e-05,GWAS_COPD_only_GLM_SNPs_info0.7-,0.008927
4,LV504,0.478885,0.126907,8.120896e-05,GWAS_COPD_only_GLM_SNPs_info0.7-,0.017176


## GLS associations for PhenomeXcan

In [7]:
input_filepath = INPUT_DIR / "gls-summary-phenomexcan.pkl.gz"
assert input_filepath.exists()

In [8]:
gls_phenomexcan = pd.read_pickle(input_filepath)

In [9]:
gls_phenomexcan.shape

(4037817, 5)

In [10]:
gls_phenomexcan.head()

Unnamed: 0,phenotype,phenotype_desc,lv,pvalue,fdr
0,AB1_OTHER_VIRAL,Other viral diseases,LV736,0.004725,0.504339
1,AB1_OTHER_VIRAL,Other viral diseases,LV320,0.004848,0.508291
2,AB1_OTHER_VIRAL,Other viral diseases,LV366,0.005306,0.523691
3,AB1_OTHER_VIRAL,Other viral diseases,LV964,0.006106,0.548143
4,AB1_OTHER_VIRAL,Other viral diseases,LV92,0.006565,0.560048


## GLS associations for eMERGE

In [11]:
input_filepath = INPUT_DIR / "gls-summary-emerge.pkl.gz"
assert input_filepath.exists()

In [12]:
gls_emerge = pd.read_pickle(input_filepath)

In [13]:
gls_emerge.shape

(304983, 5)

In [14]:
gls_emerge.head()

Unnamed: 0,phenotype,phenotype_desc,lv,pvalue,fdr
0,EUR_276.14,Hypopotassemia,LV273,0.000191,0.132187
1,EUR_276.14,Hypopotassemia,LV80,0.000616,0.244918
2,EUR_276.14,Hypopotassemia,LV870,0.000752,0.265545
3,EUR_276.14,Hypopotassemia,LV498,0.000914,0.293902
4,EUR_276.14,Hypopotassemia,LV561,0.001037,0.312601


# Top hits

In [15]:
with pd.option_context("display.max_columns", None, "display.max_colwidth", None):
    signif_lv_assocs = gls_asthma_copd.sort_values("fdr")  # .drop(columns="phenotype")
    signif_lv_assocs = signif_lv_assocs[signif_lv_assocs["fdr"] < 0.05]

    # convert back "category" data types into str/object
    signif_lv_assocs["lv"] = signif_lv_assocs["lv"].astype(str)
    signif_lv_assocs["phenotype"] = signif_lv_assocs["phenotype"].astype(str)

    display(signif_lv_assocs.shape)
    display(signif_lv_assocs.head(50))

(29, 6)

Unnamed: 0,lv,beta,beta_se,pvalue,phenotype,fdr
987,LV101,0.725541,0.126975,5.76347e-09,GWAS_Asthma_only_GLM_SNPs_info0.7-,1.7e-05
0,LV247,0.667501,0.126759,7.200244e-08,GWAS_COPD_only_GLM_SNPs_info0.7-,0.000107
989,LV17,0.651945,0.130076,2.764702e-07,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.000205
988,LV444,0.626413,0.124808,2.666571e-07,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.000205
1,LV300,0.620979,0.125634,3.948175e-07,GWAS_COPD_only_GLM_SNPs_info0.7-,0.000234
1974,LV101,0.617122,0.128485,7.98979e-07,GWAS_ACO_GLM_SNPs_info0.7-,0.000394
2,LV180,0.590544,0.126656,1.592848e-06,GWAS_COPD_only_GLM_SNPs_info0.7-,0.000619
991,LV61,0.58048,0.125812,2.014374e-06,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.000619
990,LV948,0.581716,0.125679,1.876877e-06,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.000619
992,LV705,0.573511,0.124509,2.090083e-06,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.000619


## Hits per phenotype

In [16]:
signif_lv_assocs["phenotype"].value_counts()

GWAS_Asthma_only_GLM_SNPs_info0.7-    13
GWAS_COPD_only_GLM_SNPs_info0.7-      11
GWAS_ACO_GLM_SNPs_info0.7-             5
Name: phenotype, dtype: int64

In [17]:
signif_lv_assocs.groupby("phenotype")["fdr"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
phenotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GWAS_ACO_GLM_SNPs_info0.7-,5.0,0.019712,0.015569,0.000394,0.008927,0.018951,0.033884,0.036405
GWAS_Asthma_only_GLM_SNPs_info0.7-,13.0,0.014394,0.015571,1.7e-05,0.000619,0.009896,0.026477,0.039047
GWAS_COPD_only_GLM_SNPs_info0.7-,11.0,0.020067,0.015464,0.000107,0.004773,0.022873,0.034434,0.039047


In [18]:
for idx, grp in signif_lv_assocs.groupby("phenotype"):
    display(HTML(f"<h3>{idx}</h3>"))
    display(grp)

Unnamed: 0,lv,beta,beta_se,pvalue,phenotype,fdr
1974,LV101,0.617122,0.128485,7.98979e-07,GWAS_ACO_GLM_SNPs_info0.7-,0.000394
1975,LV948,0.503273,0.12641,3.464972e-05,GWAS_ACO_GLM_SNPs_info0.7-,0.008927
1976,LV705,0.466074,0.125974,0.0001088037,GWAS_ACO_GLM_SNPs_info0.7-,0.018951
1977,LV504,0.442245,0.126605,0.0002403098,GWAS_ACO_GLM_SNPs_info0.7-,0.033884
1978,LV563,0.441458,0.129181,0.0003181143,GWAS_ACO_GLM_SNPs_info0.7-,0.036405


Unnamed: 0,lv,beta,beta_se,pvalue,phenotype,fdr
987,LV101,0.725541,0.126975,5.76347e-09,GWAS_Asthma_only_GLM_SNPs_info0.7-,1.7e-05
989,LV17,0.651945,0.130076,2.764702e-07,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.000205
988,LV444,0.626413,0.124808,2.666571e-07,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.000205
991,LV61,0.58048,0.125812,2.014374e-06,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.000619
990,LV948,0.581716,0.125679,1.876877e-06,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.000619
992,LV705,0.573511,0.124509,2.090083e-06,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.000619
993,LV96,0.499062,0.127083,4.344623e-05,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.009896
994,LV70,0.467698,0.125898,0.0001025075,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.018951
995,LV844,0.491428,0.13248,0.0001047248,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.018951
996,LV207,0.465095,0.129773,0.000170503,GWAS_Asthma_only_GLM_SNPs_info0.7-,0.026477


Unnamed: 0,lv,beta,beta_se,pvalue,phenotype,fdr
0,LV247,0.667501,0.126759,7.200244e-08,GWAS_COPD_only_GLM_SNPs_info0.7-,0.000107
1,LV300,0.620979,0.125634,3.948175e-07,GWAS_COPD_only_GLM_SNPs_info0.7-,0.000234
2,LV180,0.590544,0.126656,1.592848e-06,GWAS_COPD_only_GLM_SNPs_info0.7-,0.000619
3,LV696,0.49731,0.125236,3.617882e-05,GWAS_COPD_only_GLM_SNPs_info0.7-,0.008927
4,LV504,0.478885,0.126907,8.120896e-05,GWAS_COPD_only_GLM_SNPs_info0.7-,0.017176
5,LV70,0.46243,0.12715,0.0001390485,GWAS_COPD_only_GLM_SNPs_info0.7-,0.022873
6,LV455,0.442878,0.124007,0.0001788404,GWAS_COPD_only_GLM_SNPs_info0.7-,0.026477
8,LV799,0.437309,0.12623,0.0002674683,GWAS_COPD_only_GLM_SNPs_info0.7-,0.034434
7,LV149,0.43346,0.124817,0.0002592485,GWAS_COPD_only_GLM_SNPs_info0.7-,0.034434
9,LV214,0.428885,0.12555,0.000319664,GWAS_COPD_only_GLM_SNPs_info0.7-,0.036405


## Hits per LV

In [19]:
lv_assocs_by_size = signif_lv_assocs.groupby("lv").size().sort_values(ascending=False)
display(lv_assocs_by_size)

lv
LV101    2
LV70     2
LV948    2
LV705    2
LV504    2
LV207    1
LV61     1
LV149    1
LV844    1
LV803    1
LV799    1
LV17     1
LV180    1
LV696    1
LV563    1
LV214    1
LV506    1
LV140    1
LV455    1
LV444    1
LV383    1
LV300    1
LV247    1
LV96     1
dtype: int64

In [20]:
lv_assocs_by_size.shape

(24,)

In [29]:
lv_assocs_ids = set(map(lambda x: x.split("LV")[1], lv_assocs_by_size.index))

In [30]:
len(lv_assocs_ids)

24

In [31]:
display(list(lv_assocs_ids)[:5])

['140', '799', '17', '300', '504']

In [32]:
lv_pathways = multiplier_model_summary[
    multiplier_model_summary["LV index"].isin(lv_assocs_ids)
    & (
        (multiplier_model_summary["FDR"] < 0.05)
        #         | (multiplier_model_summary["AUC"] >= 0.75)
    )
]

In [34]:
lv_pathways.shape

(22, 5)

In [44]:
lv_pathways.sort_values("LV index")

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
441,SVM NK cells activated,101,0.815507,2.739687e-06,4.545773e-05
439,DMAP_NKA3,101,0.824968,4.924234e-05,0.0005967176
438,IRIS_NKcell-IL2stimulated,101,0.768951,0.002282981,0.01478796
591,REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM,140,0.712535,2.407559e-08,6.038493e-07
590,SVM Neutrophils,140,0.732264,0.001615458,0.01131345
592,REACTOME_INTERFERON_GAMMA_SIGNALING,140,0.861331,1.693174e-05,0.000228261
88,DMAP_NKA3,17,0.700227,0.006643279,0.03379611
91,SVM NK cells resting,17,0.805009,6.562786e-06,9.968964e-05
92,KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY,17,0.660631,0.001499019,0.01070657
1032,REACTOME_REGULATION_OF_APOPTOSIS,300,0.741191,0.0007748537,0.006283306


In [46]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    _tmp = lv_pathways.groupby("LV index").apply(lambda x: "; ".join(x.sort_values("p-value")["pathway"].tolist()))
    display(_tmp.to_frame())

Unnamed: 0_level_0,0
LV index,Unnamed: 1_level_1
101,SVM NK cells activated; DMAP_NKA3; IRIS_NKcell-IL2stimulated
140,REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM; REACTOME_INTERFERON_GAMMA_SIGNALING; SVM Neutrophils
17,SVM NK cells resting; KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY; DMAP_NKA3
300,REACTOME_REGULATION_OF_APOPTOSIS
61,SVM Neutrophils; DMAP_ERY5
70,REACTOME_REGULATION_OF_ORNITHINE_DECARBOXYLASE_ODC; MIPS_PA700_20S_PA28_COMPLEX; REACTOME_CDK_MEDIATED_PHOSPHORYLATION_AND_REMOVAL_OF_CDC6; KEGG_PATHOGENIC_ESCHERICHIA_COLI_INFECTION; REACTOME_TRNA_AMINOACYLATION
705,SVM NK cells activated; PID_IL12_2PATHWAY
844,KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION
96,SVM NK cells activated; SVM T cells follicular helper
