## Notebook to browse the detected eQTL results for items of interest

In [None]:
!date

#### import libraries and set notebook variables

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# parameters
# NOTE(review): presumably a papermill-style parameters cell, so the values are
# kept as plain literal assignments — confirm before restructuring
cohort = 'pdbp'  # cohort label; part of the working directory and file name prefixes
version = 'amppdv1'  # joined with cohort as '{cohort}.{version}' for genotype file names
visit = 0  # joined with tissue as '{tissue}{visit}' in result file names
tissue = 'wb'  # tissue label; presumably whole blood — confirm

In [None]:
# file name prefixes derived from the notebook parameters
cohort_version = f'{cohort}.{version}'
cohort_build = f'{cohort}.{tissue}{visit}'

# directory layout, everything hangs off the notebook home directory
home_dir = '/home/jupyter'
gwas_dir = f'{home_dir}/gwas'
wrk_dir = f'{home_dir}/{cohort}'
results_dir = f'{wrk_dir}/results'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
genos_dir = f'{wrk_dir}/genotypes'

# files read by this notebook
gencode_pkl = f'{home_dir}/amppd/expression/gencode.v29.primary_assembly.annotation.pkl'
eqtl_results_file = f'{results_dir}/{cohort_build}.cis.indep.csv'
meta5_st2_clean_file = f'{gwas_dir}/pd.table_s2.clean.txt'

# files written by this notebook (none are defined)

# fixed values
alpha_value = 0.05
# chromosome labels '1' through '22' as strings
autosomes = list(map(str, range(1, 23)))

# gene names to browse; more names may be added further down the notebook
genes = [
    'SNCA', 'LRRK2', 'GBA', 'CHURC1', 'RAB29',
    'FBXL5', 'GCH1', 'STX4', 'TTC3', 'GPR65',
]

#### load the gencode annotations

In [None]:
%%time
# read the pickled gencode annotation table
gencode_df = pd.read_pickle(gencode_pkl)

# the 'ont:' and 'tag:' prefixed columns are not used here, so remove them
is_ont_col = gencode_df.columns.str.startswith('ont:')
is_tag_col = gencode_df.columns.str.startswith('tag:')
discard_cols = gencode_df.columns[is_ont_col | is_tag_col]
gencode_df = gencode_df.drop(columns=discard_cols)

# results should only be autosomal, but annotation quirks sometimes let other
# sequences through, so keep only 'chr' names that are not chrX, chrY or chrM
has_chr_prefix = gencode_df['seqname'].str.startswith('chr')
is_excluded_chrom = gencode_df['seqname'].isin(['chrX', 'chrY', 'chrM'])
gencode_df = gencode_df.loc[has_chr_prefix & ~is_excluded_chrom]

# misc_RNA genes occasionally map to every chromosome, so drop that gene type
is_misc_rna = gencode_df['gene_type'].isin(['misc_RNA'])
gencode_df = gencode_df.loc[~is_misc_rna]

print(gencode_df.shape)
display(gencode_df.head())

#### load the detected eQTL

In [None]:
# read the eQTL results table for this cohort build
eqtl_df = pd.read_csv(eqtl_results_file)
print(eqtl_df.shape)
# each distinct phenotype_id is counted as one gene with a detected eQTL
number_genes = eqtl_df['phenotype_id'].unique().size
print(f'detected eqtl for {number_genes} genes')
eqtl_df.head()

#### grab the max pval_nominal and max pval_beta

In [None]:
# largest pval_beta and pval_nominal among the detected eQTL; max_pval_beta is
# used further down as the cutoff when scanning the full per-chromosome results
max_pval_beta = eqtl_df['pval_beta'].max()
max_pvalue = eqtl_df['pval_nominal'].max()
print(f'max p-value: {max_pvalue} and p-values(beta): {max_pval_beta}')

In [None]:
# annotation rows whose gene name is in the list of genes of interest
genes_oi_df = gencode_df[gencode_df['gene_name'].isin(genes)]
print(genes_oi_df.shape)

# distinct ids and names actually found in the annotation
gene_ids = genes_oi_df['gene_id'].unique()
gene_names = genes_oi_df['gene_name'].unique()
print(gene_ids)
print(gene_names)
genes_oi_df.head()

In [None]:
# detected eQTL rows whose phenotype is one of the genes of interest
eqtl_genes_oi_df = eqtl_df[eqtl_df['phenotype_id'].isin(genes_oi_df['gene_id'])]
print(eqtl_genes_oi_df.shape)
print(eqtl_genes_oi_df['phenotype_id'].unique())

# names of the genes of interest that have at least one detected eQTL
has_detected_eqtl = genes_oi_df['gene_id'].isin(eqtl_genes_oi_df['phenotype_id'])
print(genes_oi_df.loc[has_detected_eqtl, 'gene_name'].unique())

#### load the variants of interest

In [None]:
# tab-delimited table of variants of interest
variants_oi_df = pd.read_csv(meta5_st2_clean_file, delimiter='\t')
print(variants_oi_df.shape)
variants_oi_df.head()

#### see if any of the independent significant eQTL variants happen to be independent risk variants

In [None]:
def check_eqtls_for_variants(eqtl_df, variants_df, gencode_df):
    """Report eQTL rows that involve the listed variants and name their genes.

    Keeps the rows of ``eqtl_df`` whose ``variant_id`` occurs in
    ``variants_df['SNP']`` and prints the shape of that subset, the number and
    ids of its distinct variants and the number of its distinct
    ``phenotype_id`` values. The matched ``phenotype_id`` values are then
    looked up against ``gencode_df['gene_id']``.

    Returns the unique ``gene_name`` values of the matching ``gencode_df``
    rows (also printed); empty when nothing matches.
    """
    is_listed_variant = eqtl_df['variant_id'].isin(variants_df['SNP'])
    hits_df = eqtl_df.loc[is_listed_variant]
    print(hits_df.shape)

    hit_variants = hits_df['variant_id'].unique()
    print(f'variants {len(hit_variants)}')
    print(hit_variants)

    hit_gene_ids = hits_df['phenotype_id'].unique()
    print(f'genes {len(hit_gene_ids)}')

    # translate the matched phenotype ids into gene names via the annotation
    is_hit_gene = gencode_df['gene_id'].isin(hits_df['phenotype_id'])
    oi_genes = gencode_df.loc[is_hit_gene, 'gene_name'].unique()
    print(oi_genes)
    return oi_genes

In [None]:
# genes whose detected eQTL variant is itself one of the variants of interest
these_genes = check_eqtls_for_variants(eqtl_df, variants_oi_df, gencode_df)
if len(these_genes) != 0:
    # add them to the genes of interest; note genes becomes a set here
    genes = set(genes).union(these_genes)
    print(genes)

#### now load rest of results and see if risk index variants are eQTL

In [None]:
for chrom in autosomes:
    print(f'checking chromosome {chrom}')
    pairs_file = f'{tensorqtl_dir}/{cohort_build}.cis_qtl_pairs.chr{chrom}.parquet'
    chrom_eqtl_df = pd.read_parquet(pairs_file)
    # keep pairs whose nominal p-value is below the largest pval_beta seen in
    # the detected eQTL (max_pvalue is the alternative cutoff, not used here)
    below_cutoff = chrom_eqtl_df['pval_nominal'] < max_pval_beta
    oi_chrom_eqtl_df = chrom_eqtl_df.loc[below_cutoff]
    # report only; the genes found here are deliberately not added to genes
    these_genes = check_eqtls_for_variants(oi_chrom_eqtl_df, variants_oi_df, gencode_df)

print(genes)

#### check all the genes of interest regardless of detectable eQTL

In [None]:
#plot local manhattan for gene eQTL
def plot_eqtl_manhattan(gene_id, gene_name, gene_chrom, gene_start, gene_stop, eqtl_df):
    """Draw a local Manhattan-style scatter plot of the eQTL results for one gene.

    Rows of ``eqtl_df`` whose ``phenotype_id`` equals ``gene_id`` are plotted
    with ``pos`` on the x axis and ``-log10(pval_nominal)`` on the y axis;
    point size is the absolute value of ``slope / slope_se``. A horizontal
    segment from ``gene_start`` to ``gene_stop``, labelled with ``gene_name``,
    marks the gene at the bottom of the plot.

    Parameters:
        gene_id: value matched against ``eqtl_df['phenotype_id']``.
        gene_name: label used in the printed summary, plot title and annotation.
        gene_chrom: chromosome label, only used in the printed summary.
        gene_start, gene_stop: x coordinates of the gene segment.
        eqtl_df: table with ``phenotype_id``, ``pval_nominal``, ``slope``,
            ``slope_se`` and ``pos`` columns; it is not modified.

    Returns None. Nothing is plotted when no row matches ``gene_id``.
    """
    print(f'{gene_name} {gene_id}')
    print(f'gene {gene_name} is on {gene_chrom} from {gene_start} to {gene_stop}')

    # explicit copy: plotting columns are added below, and assigning them onto
    # a filtered slice of the caller's frame triggers SettingWithCopyWarning
    gene_results_df = eqtl_df.loc[eqtl_df['phenotype_id'] == gene_id].copy()
    print(gene_results_df.shape)

    if gene_results_df.shape[0] == 0:
        return

    # derived columns used only for plotting
    gene_results_df['log10_pvalue'] = -np.log10(gene_results_df['pval_nominal'])
    gene_results_df['z_score'] = gene_results_df['slope'] / gene_results_df['slope_se']
    gene_results_df['z_score_abs'] = gene_results_df['z_score'].abs()

    sns.set(style='darkgrid')
    # no palette argument: without a hue variable seaborn does not use it
    sns.relplot(x='pos', y='log10_pvalue', size='z_score_abs',
                alpha=.5, height=12, data=gene_results_df)

    # draw the gene span at the lowest plotted -log10(p) value, rounded
    min_y = round(gene_results_df['log10_pvalue'].min())
    plt.plot([gene_start, gene_stop], [min_y, min_y], linewidth=3)
    # label sits just right of the gene end (offset of 10000 x-axis units)
    plt.text(gene_stop + 10000, min_y, gene_name, fontsize='small')

    plt.title(f'{gene_name} eQTL', fontsize='large')
    plt.show()

In [None]:
# rebuild the genes-of-interest annotation now that genes may have grown
genes_oi_df = gencode_df[gencode_df['gene_name'].isin(genes)]
print(genes_oi_df.shape)
# distinct gene ids to plot
gene_ids = genes_oi_df['gene_id'].unique()
print(gene_ids)

In [None]:
%%time

for gene_id in gene_ids:
    # annotation rows for this gene; its span runs from the smallest start to
    # the largest end over those rows
    gene_df = genes_oi_df.loc[genes_oi_df['gene_id'] == gene_id]
    chrom = gene_df['seqname'].unique()[0]
    print(chrom)
    gene_name = gene_df['gene_name'].unique()[0]
    gene_start = gene_df['start'].min()
    gene_stop = gene_df['end'].max()
    # now load the chromosome eqtl results and extract specific gene results
    chrom_eqtl_df = pd.read_parquet(f'{tensorqtl_dir}/{cohort_build}.cis_qtl_pairs.{chrom}.parquet')
    # whitespace-delimited, header-less bim file; the separator is a regex, so
    # use a raw string ('\s' in a normal string is an invalid escape sequence)
    chrom_bim_df = pd.read_csv(f'{genos_dir}/{cohort_version}.{chrom}.bfile.bim',
                               header=None, sep=r'\s+')
    chrom_bim_df.columns = ['chr', 'name', 'cm', 'pos', 'a1', 'a2']
    # attach the bim columns (including 'pos', the plot x axis) to each pair;
    # the inner join drops pairs whose variant_id is not in the bim file
    chrom_eqtl_df = chrom_eqtl_df.merge(chrom_bim_df, how='inner',
                                        left_on='variant_id', right_on='name')
    print(chrom_eqtl_df.shape)
    gene_eqtl_df = chrom_eqtl_df.loc[chrom_eqtl_df['phenotype_id'] == gene_id]
    print(gene_eqtl_df.shape)
    plot_eqtl_manhattan(gene_id, gene_name, chrom, gene_start, gene_stop, gene_eqtl_df)