In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# the project packages imported below are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from pathlib import Path
import shutil
import configparser
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from pylab import *
import numpy as np

# load environment variables
# find_dotenv() returns the path it located; load_dotenv() reads that file.
from dotenv import load_dotenv, find_dotenv
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Widen the pandas display limits so large hit tables render without truncation.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

#from sysmex.perturbations import Perturbation
from blood_response.utils import lz4_load
from blood_response.fujiplot import prepare_fuji_input

# read in top pval files

In [None]:
# Root directory of this GWAS analysis run; most paths read and written below
# are built from it. It already ends in '/', so the f-strings that append
# another '/' produce a harmless '//' in the resulting paths.
analyses_root = '/mnt/obi0/phi/gwas/gwas_analyses/sysmex_custom_gates_v9-obi2020_10_28/'

In [None]:
# # read in the top pvalues for all perturbations and channels
# pval_cutoff = '1e-7'
# all_pvals_concat = list()
# for ptb_path in tqdm(list((Path(analyses_root) / 'gwas/').glob('*/*/*'))):
#     channel = ptb_path.parent.parent.name
#     dataset = ptb_path.parent.name
#     ptb_name = ptb_path.name
#     for phenotype_pvals in tqdm(list(ptb_path.glob(f'*toppvals.{pval_cutoff}.tsv')), leave=False):
#         covariates, trait = phenotype_pvals.name.split('.')[0:2]
#         pvals = pd.read_csv(phenotype_pvals, sep='\t')
#         pvals['channel'] = channel
#         pvals['dataset'] = dataset
#         pvals['ptb_name'] = ptb_name
#         pvals['gwas_dataset'] = 'obi2020_10_28'
#         pvals['covariates'] = covariates
#         pvals['trait'] = trait
#         all_pvals_concat.append(pvals)

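# NOTE: the step that combines the `all_pvals_concat` list into `all_pvals_concat_df`
# is not shown in this commented-out code; it has to run before the lines below.
#all_pvals_concat_df.to_parquet(f'{analyses_root}/toppvals.with_nigericin_ret.1e-7.parquet')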

# # drop the RET channel results for Nigericin, since they had poor gate alignment
# # 004, 005, 006, 021, 035, 037
# all_pvals_concat_df = all_pvals_concat_df.loc[~(
#     (all_pvals_concat_df.channel == 'ret') &
#     (all_pvals_concat_df.trait.str.contains('RBC1|RBC2|RET1|RET2')) &
#     (all_pvals_concat_df.ptb_name.str.contains('Nigericin'))
# )]
# all_pvals_concat_df.to_parquet(f'{analyses_root}/toppvals.1e-7.parquet')

In [None]:
# Load the cached table of top hits for all perturbations and channels.
all_pvals_concat_df = pd.read_parquet(f'{analyses_root}/toppvals.1e-7.parquet')

# Columns that together identify one analysis/trait combination.
trait_keys = ['dataset', 'channel', 'ptb_name', 'trait', 'covariates']

# Number of distinct chromosomes with a P <= 1e-7 hit, per combination.
chr_counts = (
    all_pvals_concat_df
    .loc[all_pvals_concat_df.P <= 1e-7]
    .groupby(trait_keys)['#CHROM']
    .nunique()
    .reset_index(name='chr_ct')
)

# a lot of DB, NRBC, UK hits scattered across chromosomes - not using these
#chr_counts.sort_values('chr_ct').tail(n=50)

# Attach chr_ct to every hit; merge() joins on the columns both frames share.
all_pvals_concat_df = all_pvals_concat_df.merge(chr_counts)
all_pvals_concat_df['projected'] = all_pvals_concat_df.trait.str.contains('umap|pica|pca')

# Row masks reused by the subsets below.
is_quantile = all_pvals_concat_df.trait.str.endswith('-q')
is_projected = all_pvals_concat_df.projected

all_pvals_concat_q = all_pvals_concat_df.loc[is_quantile]
gated_pvals_concat = all_pvals_concat_df.loc[~is_projected]
gated_pvals_concat_q = all_pvals_concat_df.loc[is_quantile & ~is_projected]
projected_pvals_concat = all_pvals_concat_df.loc[is_projected]
projected_pvals_concat_q = all_pvals_concat_df.loc[is_quantile & is_projected]

# keep only associations computed on at least 250 observations
gated_pvals_concat_q = gated_pvals_concat_q.loc[gated_pvals_concat_q.OBS_CT >= 250]

# count the hits with P <= 5e-8 per analysis/trait combination
hits = (
    all_pvals_concat_df
    .loc[all_pvals_concat_df.P <= 5e-8]
    .groupby(trait_keys)
    .size()
    .reset_index(name='ct')
)

### clump across trait files

In [None]:

# Create the clumping output directory; the last path component must match the
# `name` argument passed to clump_pvals below.
!mkdir -p {analyses_root}/clumping/gated_quantile_2307/

In [None]:
def clump_pvals(pval_concat, name):
    # remove traits that are very often only noise (Ghost/debris)
    pval_concat_filtered_df = pval_concat.loc[~pval_concat.trait.str.match('Ghost|DB1|DB2|NRBC')]

    # this clumps and annotates all hits (quantile and normal traits)
    for trait in tqdm(pval_concat.trait.unique()):
        pval_concat.loc[pval_concat.trait == trait].\
        rename(columns={
            '#CHROM': 'CHR',
            'POS': 'BP',
            'ID': 'SNP'
        }). \
        to_csv(f'{analyses_root}/clumping/{name}/{trait}.assoc', sep='\t', index=False)

    trait_files = ','.join([x + '.assoc' for x in pval_concat_filtered_df.trait.unique()])

    # clump all results separately
    ! cd {analyses_root}/clumping/{name}; plink --bfile /mnt/obi0/phi/gwas/obi_gwas/runs/2020_10_28/imputed_plus_biobank/imputed_merged --clump-range /mnt/obi0/phi/gwas/misc_data/plink_annotations/glist-hg19 --clump-p1 0.00000005 --clump-p2 0.00000005 --clump-replicate --clump {trait_files} --out clumped_5e8
    clumped = pd.read_csv(f'{analyses_root}/clumping/{name}/clumped_5e8.clumped', delim_whitespace=True).rename(columns={'P': 'P_clump'})
    clumped_ranges = pd.read_csv(f'{analyses_root}/clumping/{name}/clumped_5e8.clumped.ranges', delim_whitespace=True).\
    rename(columns={'P': 'P_range', 'N': 'N_range', 'POS': 'POS_range'})
    clumped = clumped.merge(clumped_ranges)
    annotated_clumps = pval_concat_filtered_df. \
    merge(clumped, left_on='ID', right_on='SNP')
    annotated_clumps.to_excel(f'{analyses_root}/clumps_{name}_long.xlsx')
    
    snp_traits = pval_concat_filtered_df.groupby('ID').traitname.apply(list).reset_index()
    clumps_df = clumped.merge(snp_traits, left_on='SNP', right_on='ID')
    obs_cts = gated_pvals_concat_q.sort_values('P')[['ID', 'OBS_CT']].groupby('ID').head(1).reset_index()
    clumps_df = clumps_df.merge(obs_cts)
    clumps_df.sort_values(['CHR', 'BP']).to_csv(f'{analyses_root}/clumps_{name}.tsv', sep='\t', index=False)
    clumps_df.sort_values(['CHR', 'BP']).to_excel(f'{analyses_root}/clumps_{name}.xlsx')
    
    return clumps_df, annotated_clumps

In [None]:
# Clump the gated quantile traits; results are written under the run name
# 'gated_quantile_2307'.
clumps_gated_quantile, annotated_clumps_gated_quantile = clump_pvals(gated_pvals_concat_q, 'gated_quantile_2307')

In [None]:
# plink log excerpt from a clump_pvals run, kept for reference:
# Total genotyping rate is 0.977969.
# 8658943 variants and 3283 people pass filters and QC.
# Note: No phenotypes present.
# --clump: 119 clumps formed from 3502 top variants.
# Results written to clumped_5e8.clumped .
# --clump-range: Clump/region overlaps reported in clumped_5e8.clumped.ranges .

In [None]:
# Directory for the fujiplot input files.
# NOTE(review): the Rscript cells below write to circos/output/, which this
# command does not create - confirm that directory exists before running them.
! mkdir -p {analyses_root}/circos/

In [None]:
# Relabelling table passed to prepare_fuji_input as `rename_genes`.
# Keys are gene labels as they come out of the clump annotation (a single gene
# or a comma-separated list); values are the labels to show instead.
# replace genes to define ranges (DEFA1-DEFA10P), or merge separate labels (RNASE2,RNASE3)
# also add | characters when long label sets should be split into separate annotations
# then use the split_genes='|' parameter to add additional rows
rename_genes = {
    'SLC38A3': 'GNAT1,SEMA3F,SLC38A3',
    'CACNA2D3': 'CACNA2D3,ARHGEF3',
    'ARHGEF3': 'CACNA2D3,ARHGEF3',
    'TLR10': 'TLR1,TLR6,TLR10',
    'FAM114A1,MIR574,TLR6': 'FAM114A1,MIR574',
    'FAM114A1,MIR574,TLR1,TLR6,TLR10': 'FAM114A1,MIR574|TLR1,TLR6,TLR10',
    'FAM114A1': 'FAM114A1,MIR574',
    'CASP3': 'CASP3,CENPU,PRIMPOL',
    'ACSL1': 'ACSL1,SLED1',
    'DFNA5': 'DFNA5,OSBPL3',
    'OSBPL3': 'DFNA5,OSBPL3',
    'RNASE2': 'RNASE2,RNASE3',
    'RNASE3': 'RNASE2,RNASE3',
    # long histone cluster: collapsed to a range and split into three labels
    'HFE,HIST1H1A,HIST1H1C,HIST1H2AB,HIST1H2BB,HIST1H3A,HIST1H3B,HIST1H3C,HIST1H4A,HIST1H4B,SLC17A2,SLC17A3,TRIM38':
    'HFE|HIST1H1A-HIST1H4B|SLC17A2,SLC17A3,TRIM38',
    'DEFA1,DEFA1B,DEFA8P,DEFA9P,DEFA10P': 'DEFA1-DEFA10P',
    'DEFA8P': 'DEFA1-DEFA10P',
    'DEFA10P': 'DEFA1-DEFA10P',
    # long gene list: shortened and split into four labels
    'CYB561D2,GNAI2,HYAL1,HYAL2,HYAL3,IFRD2,LSMEM2,MIR6872,NAT6,NPRL2,RASSF1,RASSF1-AS1,SEMA3B,SEMA3B-AS1,TUSC2,ZMYND10':
    'CYB561D2,GNAI2,HYAL1-3|IFRD2,LSMEM2,MIR6872|NAT6,NPRL2,RASSF1|SEMA3B,TUSC2,ZMYND10',
    'CDHR4,FAM212A,IP6K1,MIR5193,UBA7': 'CDHR4,FAM212A|IP6K1,MIR5193,UBA7'
}

In [None]:
# Fuji plot input grouped by cell type, with traits sorted by p-value
# (top hits per locus and cell_type / condition). The third return value is
# not used here.
(
    cell_gated_quantile_traitlist_p,
    cell_gated_quantile_input_p,
    _,
) = prepare_fuji_input(
    plot_clumps=annotated_clumps_gated_quantile,
    category='cell_type',
    filename='gated_quantile_all_joint_pvalue_2307',
    analyses_root=analyses_root,
    empty_ranges=True,
    rename_genes=rename_genes,
    split_genes='|',
    sort_traits='pvalue',
)

In [None]:
# Render the cell-type plot with fujiplot.R; arguments are the input file, the
# trait list file and the output prefix (files presumably written by the
# prepare_fuji_input call above - confirm the naming in blood_response.fujiplot).
!cd ~/projects/fujiplot; /usr/bin/Rscript fujiplot.R {analyses_root}/circos/input_cell_type_gated_quantile_all_joint_pvalue_2307.txt {analyses_root}/circos/traitlist_cell_type_gated_quantile_all_joint_pvalue_2307.txt {analyses_root}/circos/output/gated_quantile_all_joint_pvalue_2307

In [None]:
# Same plot input as for cell types, but grouped by perturbation name.
(
    ptb_gated_quantile_traitlist_p,
    ptb_gated_quantile_input_p,
    _,
) = prepare_fuji_input(
    plot_clumps=annotated_clumps_gated_quantile,
    category='ptb_name',
    filename='gated_quantile_all_joint_pvalue_2307',
    analyses_root=analyses_root,
    empty_ranges=True,
    split_genes='|',
    rename_genes=rename_genes,
    sort_traits='pvalue',
)
# show the third return value as the cell output
_

In [None]:
# Render the per-perturbation plot; arguments are the input file, the trait
# list file and the output prefix.
# NOTE(review): this runs fujiplot.R from ~/projects/fujiplot_rev, while the
# cell-type plot uses ~/projects/fujiplot - confirm the different checkout is
# intended.
!cd ~/projects/fujiplot_rev; /usr/bin/Rscript fujiplot.R {analyses_root}/circos/input_ptb_name_gated_quantile_all_joint_pvalue_2307.txt {analyses_root}/circos/traitlist_ptb_name_gated_quantile_all_joint_pvalue_2307.txt {analyses_root}/circos/output/ptb_name_gated_quantile_all_joint_pvalue_2307

In [None]:
## annotate clumps with information from CADD/SnpEff

In [None]:
# Clump table (one row per clump), ordered by genomic position.
# NOTE(review): this reads the `2206` clump table, while the clumping cell
# above writes `clumps_gated_quantile_2307.tsv` - confirm which run is intended.
clumps_df = pd.read_csv(
    '/mnt/obi0/phi/gwas/gwas_analyses/sysmex_custom_gates_v9-obi2020_10_28/clumps_gated_quantile_2206.tsv',
    sep='\t'
).sort_values(['CHR', 'BP'])

# for each SNP, summarize the top BETA coefficients and p-value
# .assign() builds a new frame instead of writing a column into a .loc slice,
# which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
gated_pvals_concat_q = gated_pvals_concat_q.assign(
    ABS_BETA=gated_pvals_concat_q.BETA.abs()
)

# keep the top 5 hits per SNP, ranked by |BETA| (largest first), then by P
top_gated_pvals_concat_q = (
    gated_pvals_concat_q
    .sort_values(by=['ABS_BETA', 'P'], ascending=[False, True])
    .groupby('ID')
    .head(n=5)
)

# top_gated_pvals_concat_q.\
# loc[~top_gated_pvals_concat_q.trait.str.startswith('NRBC')]\
# .head(n=20)

In [None]:
from biothings_client import get_client
mv = get_client("variant")

from functools import lru_cache


@lru_cache(maxsize=1024)
def _lookup_variant(hgvs_id):
    """Fetch the annotation for one HGVS id via the `mv` client, memoised.

    The cache is keyed on the id string because the row objects handed to
    get_var_info are not hashable. Repeated lookups return the same cached
    object, so callers should treat the result as read-only.
    """
    return mv.getvariant(hgvs_id)


def get_var_info(row):
    """Return the variant annotation for one clump row, or None if unresolved.

    Parameters
    ----------
    row
        Object exposing ``SNP``, ``CHR`` and ``BP`` attributes. The last two
        ':'-separated fields of ``SNP`` are taken as ref and alt allele
        (presumably ids look like ``chr:pos:ref:alt`` - confirm).

    Returns
    -------
    The annotation returned by ``mv.getvariant`` for ``chr<CHR>:g.<BP><ref>><alt>``.
    If that lookup yields None, the alleles are swapped and the lookup is
    retried. If both fail, a message is printed and None is returned.
    """
    ref, alt = row.SNP.split(':')[-2:]
    var_str = f'chr{row.CHR}:g.{row.BP}{ref}>{alt}'
    var_annot = _lookup_variant(var_str)
    if var_annot is None:
        # ref/alt may be recorded in the opposite order in the annotation source
        var_annot = _lookup_variant(f'chr{row.CHR}:g.{row.BP}{alt}>{ref}')
    if var_annot is None:
        print(f'Failed to resolve: {var_str}')
    return var_annot
    

# Annotation per lead variant, keyed by the clump's variant ID.
var_annotations = {}

# iterrows() has no length, so pass total= for tqdm to show real progress
for i, row in tqdm(clumps_df.iterrows(), total=len(clumps_df)):
    var_annotations[row.ID] = get_var_info(row)

# Top-level annotation fields per variant. get_var_info returns None for
# variants it could not resolve; those have no keys and are skipped.
key_lists = [list(v.keys()) for v in var_annotations.values() if v is not None]

In [None]:
# Import only the names used here. A wildcard import pulls every public glom
# name into the notebook namespace (glom exposes its own `Path`, which would
# shadow pathlib.Path - verify against the installed glom version).
from glom import glom, Coalesce

# subset annotations to fields of interest
# Every field falls back to None when its path is missing, including rsid.
annotation_spec = {
    'rsid': Coalesce('dbsnp.rsid', default=None),
    'cadd_cons': Coalesce('cadd.consdetail', default=None),
    'cadd_gene': Coalesce('cadd.gene', default=None),
    'snpeff': Coalesce('snpeff.ann', default=None),
    'clinvar': Coalesce('clinvar.rcv', default=None),
    'grasp': Coalesce('grasp.publication', default=None),
    #'gwassnps': Coalesce('gwassnps', default=None),
}

annot_dict = {}
for variant, annotation in var_annotations.items():
    # Unresolved variants are stored as None; use an empty mapping so they
    # still get a row (all fields None) instead of raising.
    target = annotation if annotation is not None else {}
    annot_dict[variant] = glom(target, annotation_spec)

annotation_df = pd.DataFrame.from_dict(annot_dict, orient='index')

In [None]:
# display the per-variant annotation table
annotation_df

In [None]:
import requests
import json


@lru_cache(maxsize=1024)
def _search_open_targets(variant_id):
    """Return the Open Targets variant ids found for `variant_id`, as a tuple.

    Sends the GraphQL `search` query below to the Open Targets Genetics API.
    Results are memoised per id string. Raises ``requests.HTTPError`` on a
    non-2xx response instead of trying to parse an error page.
    """
    query_string = \
    """
    query useSearchToConvertRSIDIntoIDFormat($variantId: String!) {
      search(queryString:$variantId){
        totalVariants
        variants{
          id
        }
      }
    }
    """

    variables = {"variantId": variant_id}
    base_url = "https://api.genetics.opentargets.org/graphql"
    # timeout: without one, a stalled connection blocks the whole apply()
    r = requests.post(base_url, json={"query": query_string, "variables": variables}, timeout=30)
    r.raise_for_status()
    api_response = json.loads(r.text)
    # each level may be absent or null in the response, e.g. on a query error
    search = (api_response.get('data') or {}).get('search') or {}
    return tuple(v['id'] for v in (search.get('variants') or []))


def get_open_targets_variant(x):
    """Map one annotation row to its Open Targets variant id.

    Parameters
    ----------
    x
        Row with an ``rsid`` entry (as produced for ``annotation_df``).

    Returns
    -------
    str or None
        The first variant id returned by the search, or None when the row has
        no usable rsid or the search returns no variant. A warning is printed
        when more than one variant matches.
    """
    variant_id = x['rsid']
    if not isinstance(variant_id, str) or not variant_id:
        # missing rsid (None/NaN): nothing to look up
        return None

    matches = _search_open_targets(variant_id)
    if not matches:
        print(f'No Open Targets match: {variant_id}')
        return None
    if len(matches) > 1:
        print(f'Warning: {variant_id}')
    return matches[0]


annotation_df['otid'] = annotation_df.apply(get_open_targets_variant, axis=1)

In [None]:
# Attach the per-variant annotations (indexed by variant ID) to the clump table.
clumps_df_annot = clumps_df.merge(annotation_df, left_on='ID', right_index=True)

# Spreadsheet hyperlink to the variant's Open Targets page. Rows without an
# Open Targets id get an empty cell instead of a link ending in "None"/"nan".
clumps_df_annot['open_targets'] = clumps_df_annot.otid.apply(
    lambda x: (
        f'=HYPERLINK("https://genetics.opentargets.org/variant/{x}", "{x}")'
        if pd.notna(x) else None
    )
)


In [None]:
# Write the annotated clump table to Excel: leave out the columns listed in
# `export_drop_cols` and apply the column names in `export_col_names`.
export_drop_cols = [
    'F', 'CHR', 'BP', 'N_range', 'NSIG', 'S05', 'S01', 'S001', 'S0001', 'P_range',
    'ID', 'otid',
]
export_col_names = {
    'TOTAL': 'Count',
    'RANGES': 'Genes',
    'traitname': 'Traits',
    'SNP': 'Lead_SNP',
    'SP2': 'SNPs',
}
(
    clumps_df_annot
    .drop(export_drop_cols, axis=1)
    .rename(columns=export_col_names)
    .to_excel(
        '/mnt/obi0/phi/gwas/gwas_analyses/sysmex_custom_gates_v9-obi2020_10_28/clumps_gated_quantile_2206_annotated.xlsx',
        index=False,
    )
)

In [None]:
# In-memory version of the exported table: the columns in `export_drop_cols`
# removed and the remaining ones renamed per `export_col_names`.
export_drop_cols = [
    'F', 'CHR', 'BP', 'N_range', 'NSIG', 'S05', 'S01', 'S001', 'S0001', 'P_range',
    'ID', 'otid',
]
export_col_names = {
    'TOTAL': 'Count',
    'RANGES': 'Genes',
    'traitname': 'Traits',
    'SNP': 'Lead_SNP',
    'SP2': 'SNPs',
}
clumps_annotated_df = (
    clumps_df_annot
    .drop(export_drop_cols, axis=1)
    .rename(columns=export_col_names)
)

In [None]:
# Stricter export: only clumps whose P_clump is at most 5e-10.
strict_clumps = clumps_annotated_df.loc[clumps_annotated_df.P_clump <= 5e-10]
strict_clumps.to_excel(
    '/mnt/obi0/phi/gwas/gwas_analyses/sysmex_custom_gates_v9-obi2020_10_28/clumps_gated_quantile_2206_annotated_5e10.xlsx',
    index=False,
)