## Notebook to run *cis*-eQTL interaction (ieQTL) analysis using [tensorQTL](https://github.com/broadinstitute/tensorqtl)

[Taylor-Weiner, Aguet, et al., Genome Biol. 20:228, 2019.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1836-7)

In [None]:
# shell out to `date` so the run's start time is recorded in the cell output
!date

#### import libraries and set notebook variables

In [None]:
# data-frame / numerical stack
import pandas as pd
import numpy as np
import torch
# tensorQTL: the top-level module provides read_phenotype_bed; genotypeio the
# plink reader; cis the nominal mapper (trans is imported but not used in the
# cells visible here)
import tensorqtl.tensorqtl as tensorqtl
from tensorqtl.tensorqtl import genotypeio, cis, trans
# log the library versions this run was executed with
print('PyTorch {}'.format(torch.__version__))
print('Pandas {}'.format(pd.__version__))

import os

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas deprecation / chained-assignment warnings -- confirm intended
warnings.filterwarnings('ignore')

# import random
# import numpy as np
# import threading


In [None]:
# parameters
# cohort identifier; used for the working directory and in file names
cohort = 'biofind'
# release label; combined with the cohort to form the plink bfile prefix
version = 'amppdv1'
# visit number; appended to the tissue code when building `cohort_build`
visit = 1
# visit label matched against the 'visit' column of the covariates table
visit_name = 'SVM0_5T1'
# tissue code used in `cohort_build` (presumably whole blood -- TODO confirm)
tissue = 'wb'

In [None]:
# naming
# prefix of the genotype fileset, e.g. '<cohort>.<version>'
cohort_version = f'{cohort}.{version}'
# prefix of the expression input and of every output, e.g. '<cohort>.<tissue><visit>'
cohort_build = f'{cohort}.{tissue}{visit}'
# NOTE(review): not referenced by any cell visible in this notebook
quant_type = 'genes'

# directories
# NOTE(review): absolute, machine-specific root; everything below hangs off it
wrk_dir = f'/home/jupyter/{cohort}'
geno_dir = f'{wrk_dir}/genotypes'
expr_dir = f'{wrk_dir}/expression'
info_dir = f'{wrk_dir}/sample_info'
# raw tensorQTL output
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
# filtered (significant) results
results_dir = f'{wrk_dir}/results'

# input files
# expression matrix in bed format, read with tensorqtl.read_phenotype_bed
expr_bed_file = f'{expr_dir}/{cohort_build}.norm.adj.bed.gz'
# plink fileset prefix handed to genotypeio.PlinkReader
bfile_prefix_path = f'{geno_dir}/{cohort_version}.bfile'
# per-sample covariates; the cells below use its 'visit', 'wgsid' and
# predicted cell-fraction columns
assay_covs_files = f'{info_dir}/{cohort}_rna_sample_info.csv'
# table holding the interaction terms listed in `terms_list`
grs_file = f'{info_dir}/{cohort}.grs.scaled.csv'

# output files

# constant values
# threshold applied to the 'pval_adj_bh' column when saving significant results
alpha_value = 0.05

# maps the predicted cell-fraction column names to the short names used as
# covariate columns
cell_types_dict = {'predicted_Lymphocytes': 'Lymphocytes', 
                   'predicted_Neutrophils': 'Neutrophils',
                   'predicted_Monocytes': 'Monocytes', 
                   'predicted_Basophils': 'Basophils',
                   'predicted_Eosinophils': 'Eosinophils'}

# interaction terms to test; each one gets its own mapping run
terms_list = ['DX', 'GRS']

In [None]:
# create the output directories up front; existing directories are left alone
for out_dir in (tensorqtl_dir, results_dir):
    os.makedirs(out_dir, exist_ok=True)

#### load phenotypes and covariates (if needed)

In [None]:
%%time

phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expr_bed_file)
covs_df = pd.read_csv(assay_covs_files, index_col=0)
grs_df = pd.read_csv(grs_file, index_col=0)
print(f'phenotype_df {phenotype_df.shape}')
print(f'phenotype_pos_df {phenotype_pos_df.shape}')
print(f'covariates_df {covs_df.shape}')
print(f'grs_df {grs_df.shape}')
# display(phenotype_df.head())
# display(phenotype_pos_df.head())
# display(covs_df.head())
# display(grs_df.head())

#### load plink bfiles

In [None]:
%%time

# pr = genotypeio.PlinkReader(bfile_prefix_path, select_samples=phenotype_df.columns)
pr = genotypeio.PlinkReader(bfile_prefix_path)
genotype_df = pr.load_genotypes()
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

In [None]:
# dimensions of the genotype matrix, then of the variant annotation table
for loaded_frame in (genotype_df, variant_df):
    print(loaded_frame.shape)

In [None]:
# tensorQTL takes plink bfiles, but the variant chromosome labels need the
# 'chr' prefix so they can be matched against the expression positions.
# Only labels that lack the prefix are changed, so re-running this cell
# cannot produce 'chrchr1'-style labels.
chrom_labels = variant_df['chrom'].astype(str)
variant_df['chrom'] = chrom_labels.where(chrom_labels.str.startswith('chr'),
                                         'chr' + chrom_labels)
print(variant_df.shape)

#### make sure the pheno and genos have same samples

In [None]:
# compare the sample ids (columns) of the genotype and expression matrices
geno_sample_ids = set(genotype_df.columns)
expr_sample_ids = set(phenotype_df.columns)

assay_intersect_samples = geno_sample_ids & expr_sample_ids
print(f'intersect {len(assay_intersect_samples)}')
extra_geno_samples = geno_sample_ids - expr_sample_ids
print(f'number of genotypes samples not in expression {len(extra_geno_samples)}')
extra_expr_samples = expr_sample_ids - geno_sample_ids
# this count must come from the expression-only set, not the genotype-only one
print(f'number of expression samples not in genotypes {len(extra_expr_samples)}')

#### drop the non-matched samples

In [None]:
# Keep only the samples present in both matrices. Selecting by membership
# (instead of dropping named columns in place) makes the cell safe to re-run
# and independent of helper sets that later cells may have overwritten.
geno_keep = genotype_df.columns.isin(phenotype_df.columns)
if not geno_keep.all():
    genotype_df = genotype_df.loc[:, geno_keep]
expr_keep = phenotype_df.columns.isin(genotype_df.columns)
if not expr_keep.all():
    phenotype_df = phenotype_df.loc[:, expr_keep]

print(genotype_df.shape)
print(phenotype_df.shape)

#### need to make sure phenos and genos have matched chromosomes; ie just autosomes

In [None]:
# chromosome labels seen on each side; features on chromosomes the other side
# lacks (e.g. non-autosomal genes) are removed in the next cell
expr_chroms = set(phenotype_pos_df['chr'])
geno_chroms = set(variant_df['chrom'])

assay_intersect_chroms = expr_chroms & geno_chroms
extra_geno_chroms = geno_chroms - expr_chroms
extra_expr_chroms = expr_chroms - geno_chroms

print(f'intersect {len(assay_intersect_chroms)}')
print(f'number of genotypes chroms not in expression {len(extra_geno_chroms)}')
print(extra_geno_chroms)
print(f'number of expression chroms not in genotypes {len(extra_expr_chroms)}')
print(extra_expr_chroms)

In [None]:
# discard variants on chromosomes without expression features, then keep the
# genotype matrix rows in step with the surviving variants
if extra_geno_chroms:
    on_shared_chrom = ~variant_df['chrom'].isin(extra_geno_chroms)
    variant_df = variant_df.loc[on_shared_chrom]
    genotype_df = genotype_df.loc[genotype_df.index.isin(variant_df.index)]
# same in the other direction: discard features on chromosomes without
# variants, then keep the expression matrix rows in step with them
if extra_expr_chroms:
    on_shared_chrom = ~phenotype_pos_df['chr'].isin(extra_expr_chroms)
    phenotype_pos_df = phenotype_pos_df.loc[on_shared_chrom]
    phenotype_df = phenotype_df.loc[phenotype_df.index.isin(phenotype_pos_df.index)]

# dimensions after filtering: genotypes, variants, expression, positions
for filtered_frame in (genotype_df, variant_df, phenotype_df, phenotype_pos_df):
    print(filtered_frame.shape)

#### make sure covariates match geno and pheno samples

In [None]:
# subset the covariates to this visit (the file holds rows for other visits
# as well) and to the samples that have expression data
covs_df = covs_df.loc[(covs_df['visit'] == visit_name) & (covs_df['wgsid'].isin(phenotype_df.columns))]
print(covs_df.shape)

expr_ids_for_covs = set(phenotype_df.columns)
cov_sample_ids = set(covs_df['wgsid'])

cov_intersect_samples = expr_ids_for_covs & cov_sample_ids
print(f'intersect {len(cov_intersect_samples)}')
# dedicated name: `extra_expr_samples` belongs to the genotype/expression
# comparison and must not be overwritten here
expr_samples_missing_covs = expr_ids_for_covs - cov_sample_ids
print(f'number of expression samples not in covariates {len(expr_samples_missing_covs)}')
extra_cov_samples = cov_sample_ids - expr_ids_for_covs
# the comparison is against the expression samples, so label it accordingly
print(f'number of covariate samples not in expression {len(extra_cov_samples)}')

#### make sure interaction terms match geno and pheno samples

In [None]:
# keep only the interaction-term rows for samples that have expression data
grs_df = grs_df.loc[grs_df.index.isin(phenotype_df.columns)]
print(grs_df.shape)

expr_ids_for_terms = set(phenotype_df.columns)
term_sample_ids = set(grs_df.index)

terms_intersect_samples = expr_ids_for_terms & term_sample_ids
print(f'intersect {len(terms_intersect_samples)}')
# dedicated name: `extra_expr_samples` belongs to the genotype/expression
# comparison and must not be overwritten here
expr_samples_missing_terms = expr_ids_for_terms - term_sample_ids
print(f'number of expression samples not in interaction terms {len(expr_samples_missing_terms)}')
extra_terms_samples = term_sample_ids - expr_ids_for_terms
# the comparison is against the expression samples, so label it accordingly
print(f'number of interaction terms samples not in expression {len(extra_terms_samples)}')

#### subset covariates from full cohort to cohort visit
also rename the cell-fraction columns

In [None]:
# give the predicted cell-fraction columns their short names
covs_df = covs_df.rename(columns=cell_types_dict)
print(covs_df.shape)

In [None]:
# covariates are the renamed cell-fraction columns, indexed by sample id
covs_columns_to_use = ['wgsid'] + list(cell_types_dict.values())
# Build a new frame instead of mutating a slice in place, and order the rows
# to match the expression columns with a single label lookup (a missing
# sample raises KeyError). This also keeps each column's dtype, which a
# transpose round-trip does not guarantee.
covs_to_use = (
    covs_df[covs_columns_to_use]
    .set_index('wgsid')
    .loc[phenotype_df.columns]
)
print(covs_to_use.shape)

#### subset interaction terms from full cohort to cohort visit

In [None]:
# dimensions of the interaction-term table after restricting it to the
# expression samples
print(grs_df.shape)
# display(grs_df.head())

In [None]:
# Select the interaction-term columns with rows ordered like the expression
# columns (a missing sample raises KeyError). A single label lookup keeps each
# column's own dtype; transposing mixed string/float columns back and forth
# would turn every column into object dtype. The explicit copy makes the
# column assignments in the following cells act on an independent frame.
terms_to_use = grs_df.loc[phenotype_df.columns, terms_list].copy()
print(terms_to_use.shape)

#### interaction term for 'DX' has to be converted from categorical into binary
just set 'PD' to one and 'Other or Control' to zero

In [None]:
# encode diagnosis as 1 for 'PD' and 0 for everything else; skip the recoding
# when the column is already numeric so that re-running this cell does not
# reset every sample to 0
if not pd.api.types.is_numeric_dtype(terms_to_use['DX']):
    terms_to_use['DX'] = np.where(terms_to_use['DX'] == 'PD', 1, 0)

In [None]:
# make sure GRS is float32 and replace missing values with the column mean.
# The filled series is assigned back to the frame: calling
# fillna(inplace=True) on a column selection is chained assignment and is not
# guaranteed to modify `terms_to_use`.
grs_values = terms_to_use['GRS'].astype('float32')
terms_to_use['GRS'] = grs_values.fillna(grs_values.mean())

#### run the term interactions; ie the ieQTL

In [None]:
%%time
for term in terms_list:
    print(term)
    cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, 
                    covariates_df=covs_to_use, 
                    interaction_s=terms_to_use[term],
                    prefix=f'{cohort_build}.{term}', output_dir=tensorqtl_dir,
                    run_eigenmt=True, write_top=True, write_stats=True)

#### load term interaction top results and save significant results

In [None]:
%%time
for term in terms_list:
    print(term)
    top_file = f'{tensorqtl_dir}/{cohort_build}.{term}.cis_qtl_top_assoc.txt.gz'
    top_results_file = f'{results_dir}/{cohort_build}.{term}.cis_qtl_top_assoc.txt.gz'
    top_ieqtl = pd.read_csv(top_file, sep='\t')
    print(top_ieqtl.shape)
    top_sig_ieqtl = top_ieqtl.loc[top_ieqtl['pval_adj_bh'] <= alpha_value]
    print(top_sig_ieqtl.shape)
    top_sig_ieqtl.to_csv(top_results_file)

#### load all the interaction results for one of the terms

In [None]:
import dask.dataframe as dd

# interaction term whose full set of variant-phenotype results is inspected
this_term = 'DX'

# glob over the per-chromosome pair files; dask reads them as one lazy frame
term_files = f'{tensorqtl_dir}/{cohort_build}.{this_term}.cis_qtl_pairs.chr*.parquet'
ieqtl_df = dd.read_parquet(term_files)

In [None]:
# `.shape` on a dask frame carries a lazy (uncomputed) row count, so printing
# it shows a placeholder; compute the number of rows explicitly instead
print((len(ieqtl_df), len(ieqtl_df.columns)))
display(ieqtl_df.head())

In [None]:
# top-association table written by the mapping run for `this_term`
top_file = f'{tensorqtl_dir}/{cohort_build}.{this_term}.cis_qtl_top_assoc.txt.gz'
# file names for the two interaction terms with the current parameters:
# biofind.wb1.DX.cis_qtl_top_assoc.txt.gz
# biofind.wb1.GRS.cis_qtl_top_assoc.txt.gz
top_ieqtl = pd.read_csv(top_file, sep='\t')
print(top_ieqtl.shape)
display(top_ieqtl.head())

In [None]:
# use the notebook-wide threshold and the same inclusive comparison that is
# applied when the significant results are saved, and filter only once
sig_top_ieqtl = top_ieqtl.loc[top_ieqtl['pval_adj_bh'] <= alpha_value]
print(sig_top_ieqtl.shape)
display(sig_top_ieqtl.head())