## Notebook to run *cis*-eQTL interaction (ieQTL) analysis for the cell fractions using [tensorQTL](https://github.com/broadinstitute/tensorqtl)

[Taylor-Weiner, Aguet, et al., Genome Biol. 20:228, 2019.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1836-7)

In [1]:
!date

Mon Jun 21 20:45:10 UTC 2021


#### import libraries and set notebook variables

In [2]:
import pandas as pd
import numpy as np
import torch
import tensorqtl.tensorqtl as tensorqtl
from tensorqtl.tensorqtl import genotypeio, cis, trans
print('PyTorch {}'.format(torch.__version__))
print('Pandas {}'.format(pd.__version__))

import os

import warnings
warnings.filterwarnings('ignore')

# import random
# import numpy as np
# import threading


PyTorch 1.8.0
Pandas 1.2.4


In [3]:
# parameters
# cohort identifier; used to build the working directory and file name prefixes
cohort = 'biofind'
# release label; joined with the cohort to name the genotype bfile prefix
version = 'amppdv1'
# visit number; appended to the tissue code in the expression build name
visit = 1
# visit label matched against the 'visit' column of the covariates table
visit_name = 'SVM0_5T1'
# tissue code; used in the expression build name
tissue = 'wb'

In [5]:
# naming: prefixes shared by the input and output file names
cohort_version = '.'.join([cohort, version])
cohort_build = f'{cohort}.{tissue}{visit}'
quant_type = 'genes'

# directories: everything lives under the per-cohort working directory
wrk_dir = f'/home/jupyter/{cohort}'
geno_dir, expr_dir, info_dir, tensorqtl_dir, results_dir = (
    f'{wrk_dir}/{sub_dir}'
    for sub_dir in ('genotypes', 'expression', 'sample_info', 'tensorqtl', 'results')
)

# input files
expr_bed_file = f'{expr_dir}/{cohort_build}.norm.adj.bed.gz'
bfile_prefix_path = f'{geno_dir}/{cohort_version}.bfile'
assay_covs_files = f'{info_dir}/{cohort}_rna_sample_info.csv'

# output files

# constant values
# threshold applied to the BH-adjusted p-values when keeping top results
alpha_value = 0.05

# maps the 'predicted_<cell type>' covariate column names to plain cell-type names
cell_types_dict = {
    f'predicted_{cell_type}': cell_type
    for cell_type in ('Lymphocytes', 'Neutrophils', 'Monocytes',
                      'Basophils', 'Eosinophils')
}

# cell-fraction terms to test as interaction terms, one mapping run per term
terms_list = [
    'Basophils',
    'Eosinophils',
    'Lymphocytes',
    'Monocytes',
    'Neutrophils',
]

In [6]:
# create the output directories up front; existing directories are left alone
for out_dir in (tensorqtl_dir, results_dir):
    os.makedirs(out_dir, exist_ok=True)

#### load phenotypes and covariates (if needed)

In [8]:
%%time

# expression matrix plus per-phenotype positions, read from the BED file
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expr_bed_file)
# sample-level covariates; the first CSV column becomes the index
covs_df = pd.read_csv(assay_covs_files, index_col=0)

# report the dimensions of everything that was loaded
print(f'phenotype_df {phenotype_df.shape}')
print(f'phenotype_pos_df {phenotype_pos_df.shape}')
print(f'covariates_df {covs_df.shape}')
# display(phenotype_df.head())
# display(phenotype_pos_df.head())
# display(covs_df.head())

phenotype_df (14700, 1383)
phenotype_pos_df (14700, 2)
covariates_df (3538, 54)
CPU times: user 6.65 s, sys: 171 ms, total: 6.82 s
Wall time: 6.82 s


#### load plink bfiles

In [9]:
%%time

# read the plink bfile set for all samples; samples are matched to the
# phenotypes in a later step
# (alternative: genotypeio.PlinkReader(bfile_prefix_path, select_samples=phenotype_df.columns))
pr = genotypeio.PlinkReader(bfile_prefix_path)
genotype_df = pr.load_genotypes()
# variant lookup table: variant id -> chromosome and position
variant_df = pr.bim.set_index('snp').loc[:, ['chrom', 'pos']]

Mapping files: 100%|██████████| 3/3 [00:15<00:00,  5.14s/it]


CPU times: user 1min 54s, sys: 28.9 s, total: 2min 23s
Wall time: 1min 20s


In [10]:
# genotype matrix shape, then variant table shape, one per line
print(genotype_df.shape, variant_df.shape, sep='\n')
# display(genotype_df.head())
# display(variant_df.head())

(8442414, 1383)
(8442414, 2)


In [11]:
# The bim chromosome labels need a 'chr' prefix so they can be matched against
# the 'chr'-prefixed chromosome names in the phenotype positions.
# Only add the prefix where it is missing: this keeps the cell idempotent, so
# re-running it does not produce labels like 'chrchr1'.
chrom_labels = variant_df['chrom'].astype(str)
already_prefixed = chrom_labels.str.startswith('chr')
variant_df['chrom'] = chrom_labels.where(already_prefixed, 'chr' + chrom_labels)
print(variant_df.shape)
# display(variant_df.head())

(8442414, 2)


#### make sure the pheno and genos have same samples

In [12]:
# compare the sample IDs (column labels) of the genotype and expression tables
geno_samples = set(genotype_df.columns)
expr_samples = set(phenotype_df.columns)

assay_intersect_samples = geno_samples & expr_samples
print(f'intersect {len(assay_intersect_samples)}')

# samples that have genotypes but no expression data
extra_geno_samples = geno_samples - expr_samples
print(f'number of genotypes samples not in expression {len(extra_geno_samples)}')

# samples that have expression data but no genotypes
extra_expr_samples = expr_samples - geno_samples
# report the expression-side count (previously the genotype-side count was printed here)
print(f'number of expression samples not in genotypes {len(extra_expr_samples)}')

intersect 1184
number of genotypes samples not in expression 199
number of expression samples not in genotypes 199


#### drop the non-matched samples

In [13]:
# keep only the samples present in both tables by removing the unmatched columns
genotype_df = genotype_df.drop(columns=list(extra_geno_samples))
phenotype_df = phenotype_df.drop(columns=list(extra_expr_samples))

# genotype shape first, then phenotype shape
print(genotype_df.shape, phenotype_df.shape, sep='\n')
# display(genotype_df.head())
# display(phenotype_df.head())

(8442414, 1184)
(14700, 1184)


#### need to make sure phenos and genos have matched chromosomes; ie just autosomes

In [14]:
# find chromosomes present in only one of the two tables so they can be removed
pheno_chroms = set(phenotype_pos_df['chr'])
geno_chroms = set(variant_df['chrom'])

assay_intersect_chroms = pheno_chroms & geno_chroms
print(f'intersect {len(assay_intersect_chroms)}')

# chromosomes with variants but no phenotypes
extra_geno_chroms = geno_chroms - pheno_chroms
print(f'number of genotypes chroms not in expression {len(extra_geno_chroms)}')
print(extra_geno_chroms)

# chromosomes with phenotypes but no variants
extra_expr_chroms = pheno_chroms - geno_chroms
print(f'number of expression chroms not in genotypes {len(extra_expr_chroms)}')
print(extra_expr_chroms)

intersect 22
number of genotypes chroms not in expression 0
set()
number of expression chroms not in genotypes 3
{'chrY', 'chrX', 'chrM'}


In [15]:
# remove variants located on chromosomes that have no phenotypes
if extra_geno_chroms:
    on_shared_chrom = ~variant_df['chrom'].isin(extra_geno_chroms)
    variant_df = variant_df.loc[on_shared_chrom]
    # the genotype rows have to follow the variants that were kept
    kept_variants = genotype_df.index.isin(variant_df.index)
    genotype_df = genotype_df.loc[kept_variants]

# remove phenotypes located on chromosomes that have no variants
if extra_expr_chroms:
    on_shared_chrom = ~phenotype_pos_df['chr'].isin(extra_expr_chroms)
    phenotype_pos_df = phenotype_pos_df.loc[on_shared_chrom]
    # the phenotype rows have to follow the positions that were kept
    kept_phenotypes = phenotype_df.index.isin(phenotype_pos_df.index)
    phenotype_df = phenotype_df.loc[kept_phenotypes]

# shapes in order: genotypes, variants, phenotypes, phenotype positions
print(genotype_df.shape)
print(variant_df.shape)
print(phenotype_df.shape)
print(phenotype_pos_df.shape)
# display(genotype_df.head())
# display(variant_df.head())
# display(phenotype_df.head())
# display(phenotype_pos_df.head())

(8442414, 1184)
(8442414, 2)
(14241, 1184)
(14241, 2)


#### make sure covariates match geno and pheno samples

In [16]:
# subset the covariates to just this visit (the file holds rows for other
# visits as well) and to the samples that have expression data
is_this_visit = covs_df['visit'] == visit_name
has_expression = covs_df['wgsid'].isin(phenotype_df.columns)
covs_df = covs_df.loc[is_this_visit & has_expression]
print(covs_df.shape)

# check that the covariate sample IDs line up with the expression samples
expr_sample_ids = set(phenotype_df.columns)
cov_sample_ids = set(covs_df['wgsid'])

cov_intersect_samples = expr_sample_ids & cov_sample_ids
print(f'intersect {len(cov_intersect_samples)}')
extra_expr_samples = expr_sample_ids - cov_sample_ids
print(f'number of expression samples not in covariates {len(extra_expr_samples)}')
extra_cov_samples = cov_sample_ids - expr_sample_ids
# NOTE(review): the label says 'genotypes' but the comparison is against the
# phenotype_df columns
print(f'number of covariate samples not in genotypes {len(extra_cov_samples)}')

(1184, 54)
intersect 1184
number of expression samples not in covariates 0
number of covariate samples not in genotypes 0


#### subset covariates from full cohort to cohort visit
also rename the cell fraction columns (drop the `predicted_` prefix)

In [17]:
# replace the 'predicted_<cell type>' column names with the plain cell-type names
covs_df = covs_df.rename(columns=cell_types_dict)
print(covs_df.shape)
# display(covs_df.head())

(1184, 54)


In [18]:
# keep the sample id plus the cell-fraction terms that will be tested
covs_columns_to_use = ['wgsid'] + terms_list

# index by sample id and put the rows in the same order as the phenotype
# columns; .loc raises a KeyError if a phenotype sample has no covariate row
covs_to_use = (
    covs_df[covs_columns_to_use]
    .set_index('wgsid')
    .loc[phenotype_df.columns]
)

# fill missing cell fractions with the per-term mean. This is assigned back to
# the frame rather than calling fillna(inplace=True) on a selected column,
# because a chained in-place call may act on a temporary copy and leave the
# frame unchanged.
covs_to_use = covs_to_use.fillna(covs_to_use[terms_list].mean())

print(covs_to_use.shape)
# display(covs_to_use.head())

(1184, 1)


#### run the term interactions; ie the ieQTL

In [20]:
%%time
# one nominal cis mapping run per cell-fraction term, with that term passed as
# the interaction variable; results are written under tensorqtl_dir
for term in terms_list:
    print(term)
    # a covariates frame with the sample index but no columns is passed,
    # i.e. no additional covariates
    no_covariates_df = pd.DataFrame(index=phenotype_df.columns)
    mapping_options = dict(
        covariates_df=no_covariates_df,
        interaction_s=covs_to_use[term],
        prefix=f'{cohort_build}.{term}',
        output_dir=tensorqtl_dir,
        run_eigenmt=True,
        write_top=True,
        write_stats=True,
    )
    cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
                    **mapping_options)

Neutrophils
cis-QTL mapping: nominal associations for all variant-phenotype pairs
  * 1184 samples
  * 14241 phenotypes
  * 0 covariates
  * 8442414 variants
  * including interaction term
    * using 0.05 MAF threshold
  * checking phenotypes: 14241/14241
    ** dropping 7 phenotypes without variants in cis-window
  * Computing associations
    Mapping chromosome chr12
    processing phenotype 833/14234    time elapsed: 1.95 min
    * writing output
    Mapping chromosome chr16
    processing phenotype 660/14234    time elapsed: 3.48 min
    * writing output
    Mapping chromosome chr11
    processing phenotype 789/14234    time elapsed: 5.35 min
    * writing output
    Mapping chromosome chr1
    processing phenotype 1437/14234    time elapsed: 8.41 min
    * writing output
    Mapping chromosome chr3
    processing phenotype 833/14234    time elapsed: 10.44 min
    * writing output
    Mapping chromosome chr13
    processing phenotype 285/14234    time elapsed: 11.28 min
    * writ

#### load term interaction top results and save significant results

In [21]:
%%time
# for each term: read the top-association table written by the mapping step,
# keep rows whose BH-adjusted p-value passes alpha_value, and save them
for term in terms_list:
    print(term)
    top_name = f'{cohort_build}.{term}.cis_qtl_top_assoc.txt.gz'
    top_file = f'{tensorqtl_dir}/{top_name}'
    top_results_file = f'{results_dir}/{top_name}'

    top_ieqtl = pd.read_csv(top_file, sep='\t')
    print(top_ieqtl.shape)

    passes_alpha = top_ieqtl['pval_adj_bh'] <= alpha_value
    top_sig_ieqtl = top_ieqtl.loc[passes_alpha]
    print(top_sig_ieqtl.shape)

    # NOTE(review): to_csv defaults are used, so this file is comma-separated
    # and includes the row index, whereas the input with the same '.txt.gz'
    # name is tab-separated; confirm downstream readers expect this
    top_sig_ieqtl.to_csv(top_results_file)

Neutrophils
(14233, 18)
(231, 18)
CPU times: user 64.3 ms, sys: 229 µs, total: 64.6 ms
Wall time: 62.3 ms
