In [None]:
### This notebook follows the tensorQTL tutorial (https://github.com/broadinstitute/tensorqtl).

In [None]:
import pandas as pd
import tensorqtl
from tensorqtl import pgen, cis, trans, post

In [None]:
import torch

# Pick the compute device: use CUDA when torch reports it as available,
# otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Record the library versions used for this run.
print(f"torch: {torch.__version__} (CUDA {torch.version.cuda}), device: {device}")
print(f"pandas {pd.__version__}")

In [None]:
# Label of the dataset being analysed; it is interpolated into the input
# paths and into every output file name written by this notebook.
prefix = "CD4-T"

In [None]:
# Phenotype (expression) table for the selected prefix. The file is
# tab-separated despite its .csv extension; the first column becomes the row
# index, which later cells use as the phenotype identifier.
phenotype_path = f"/path/of/expression/{prefix}/{prefix}_herv.csv"
phenotype_df = pd.read_csv(phenotype_path, sep="\t", index_col=0)
phenotype_df

In [None]:
# PEER-factor covariates for the selected prefix (tab-separated, first column
# as index). The table is transposed right after loading.
# NOTE(review): presumably the file stores factors in rows and samples in
# columns, and the transpose puts samples in rows for tensorQTL - confirm.
covariates_path = f"/path/of/peer_factor/{prefix}_peer_factors_2.tsv"
covariates_df = pd.read_csv(covariates_path, sep="\t", index_col=0).T
covariates_df

In [None]:
# Phenotype annotation table: a header-less, comma-separated file whose first
# column is consumed as the index and whose remaining five columns are named
# below.
herv_info = pd.read_csv("/path/of/herv_info.txt", sep=",", index_col=0, header=None)
herv_info.columns = ["gene_id", "chr", "strand", "pos", "end"]

# Key the table by gene_id and keep only the chromosome and position columns.
phenotype_pos_df = herv_info.set_index('gene_id')[['chr', 'pos']]

In [None]:
# Restrict the position table to phenotypes present in the expression table,
# then put its rows in exactly the same order as phenotype_df. Any phenotype
# missing from the annotation ends up as an all-NaN row after the reindex.
has_expression = phenotype_pos_df.index.isin(phenotype_df.index)
phenotype_pos_df = phenotype_pos_df.loc[has_expression].reindex(phenotype_df.index)

In [None]:
# Genotype table (tab-separated despite the .csv extension). The first column
# becomes the row index, which later cells treat as the variant identifier.
genotype_path = "/path/of/genotype/genotype_all.csv"
genotype_df = pd.read_csv(genotype_path, sep="\t", index_col=0)
genotype_df

In [None]:
# Variant annotation (comma-separated, first column as the variant index).
snp_info = pd.read_csv("/path/of/snp_info_uniq.txt", sep=",", index_col=0)

# Keep only genotyped variants, rename the coordinate columns to the
# 'chrom'/'pos' names used downstream, and put the rows in the same order as
# genotype_df.
variant_df = (
    snp_info.loc[snp_info.index.isin(genotype_df.index), ['chr', 'start']]
    .rename(columns={'chr': 'chrom', 'start': 'pos'})
    .reindex(genotype_df.index)
)

# Number the variants AFTER aligning to genotype_df so that 'index' is the
# row position of each variant in genotype_df. Numbering before the reindex
# would follow the annotation file's order instead, and would turn into
# float/NaN for any genotyped variant missing from the annotation.
variant_df['index'] = range(len(variant_df))

In [None]:
# Map all nominal cis-associations, one autosome (1-22) at a time. For each
# chromosome the parquet results are loaded back and re-exported as CSV.
for chrom in range(1, 23):
    # Phenotypes located on this chromosome (computed once per iteration).
    chrom_pos_df = phenotype_pos_df[phenotype_pos_df['chr'] == chrom]
    if chrom_pos_df.empty:
        # Nothing to map on this chromosome, so no result file would exist
        # to read back below.
        continue

    cis.map_nominal(genotype_df, variant_df,
                    phenotype_df.loc[chrom_pos_df.index],
                    chrom_pos_df,
                    prefix=prefix, covariates_df=covariates_df)

    # Load the results and write them out as CSV.
    # NOTE(review): the 'chr{chrom}' part of the file name assumes
    # map_nominal labels its output that way; if it uses the raw values of
    # phenotype_pos_df['chr'] (integers here), the name would differ - confirm.
    pairs_df = pd.read_parquet(f'{prefix}.cis_qtl_pairs.chr{chrom}.parquet')
    pairs_df.to_csv(f'{prefix}_{chrom}.csv')

In [None]:
# Run cis-QTL mapping over every phenotype, adjusting for the covariates.
# The seed is fixed so repeated runs give the same result.
cis_df = cis.map_cis(
    genotype_df,
    variant_df,
    phenotype_df,
    phenotype_pos_df,
    covariates_df,
    seed=123456,
)

In [None]:
# Compute q-values for the cis-QTL results. This has to be run on the full
# set of genes rather than a subset.
# NOTE(review): the return value is not captured, so the call presumably
# annotates cis_df in place - confirm.
post.calculate_qvalues(
    cis_df,
    fdr=0.05,
    qvalue_lambda=0.85,
)

In [None]:
# Map conditionally independent cis-QTLs, starting from the q-value-annotated
# cis_df. The same seed as the map_cis call is passed so that this step is
# reproducible across runs as well.
indep_df = cis.map_independent(genotype_df, variant_df, cis_df,
                               phenotype_df, phenotype_pos_df, covariates_df,
                               seed=123456)

In [None]:
# Build a combined "<phenotype_id>@<variant_id>" key for each row.
indep_df['gene_snp'] = indep_df['phenotype_id'].str.cat(indep_df['variant_id'], sep="@")

In [None]:
# Save the independent-QTL table as CSV in the working directory.
# The file name (including its spelling) is kept as-is.
indep_csv_path = f"{prefix}_indepent.csv"
indep_df.to_csv(indep_csv_path)