#### Notebook to format genotypes for use with tensorQTL

WGS genotypes are typically stored per chromosome, as VCF files or plink2 pfiles.
tensorQTL uses plink1 bfiles, so the pfiles are converted to bfiles here; since this is a small cohort, the per-chromosome bfiles are also merged into a single genome-wide bfile.

In [None]:
# print the current date and time via the shell
!date

#### import libraries and set notebook variables

In [None]:
import concurrent.futures
import os
import pandas as pd

In [None]:
# parameters
# cohort name; used below to build the working directory path and the file name prefix
cohort = 'biofind'
# not referenced elsewhere in this notebook -- presumably the cohort's AMP abbreviation; TODO confirm
amp_abbr = 'BF'
# version label; joined with cohort to form cohort_version
version = 'amppdv1'

In [None]:
# naming
cohort_version = f'{cohort}.{version}'

# directories
wrk_dir = f'/home/jupyter/{cohort}'
geno_dir = f'{wrk_dir}/genotypes'
expr_dir = f'{wrk_dir}/expression'
info_dir = f'{wrk_dir}/sample_info'

# input files
pfiles = '{genodir}/{cohortversion}.chr{chr}'

# output files
genome_bfile = f'{geno_dir}/{cohort_version}.bfile'
risk_bfile = f'{geno_dir}/{cohort_version}.risk.bfile'
chr_detected_out_file = '{exprdir}/{cohortbuild}.detected.genes.chr{chr}'

# constant values
autosomes = [str(x) for x in list(range(1,23))]
max_dist = 1000000
capture_out = !(nproc)
max_threads = int(capture_out[0])
alpha_value = 0.05
max_feature_cnt_parallel_load = 20000

#### utility functions

In [None]:
def run_bash_cmd(this_cmd):
    """Run a shell command string through the IPython `!` shell escape.

    Wrapping the escape in a function lets a command be handed to an
    executor as a callable (see the `ppe.submit(run_bash_cmd, ...)` calls).

    Args:
        this_cmd: the complete command line to execute, as a string.

    Returns:
        None; nothing is captured or returned from the command.
    """
    !{this_cmd}

#### convert from plink2 pfiles to plink bfiles

In [None]:
# submit one plink2 pfile -> bfile conversion per autosome to a process pool;
# leaving the `with` block waits for all submitted jobs to finish
with concurrent.futures.ProcessPoolExecutor() as ppe:
    for chrom in autosomes:
        this_pfile = pfiles.format(genodir=geno_dir, cohortversion=cohort_version, chr=chrom)
        this_cmd = 'plink2 --pfile {prefix} --make-bed --out {prefix}.bfile --silent'.format(
            prefix=this_pfile)
        ppe.submit(run_bash_cmd, this_cmd)

In [None]:
# merge the files into a single plink binary set

def frmt_merge_list_file(geno_dir, cohort_version, autosomes, pfile_template=None):
    """Write the merge-list file naming each per-chromosome bfile prefix.

    One line is written per entry of `autosomes`, each being the formatted
    per-chromosome file prefix with a `.bfile` suffix appended.

    Args:
        geno_dir: directory holding the genotype files; the merge-list file
            is written here as `bfile_merge-list.txt`.
        cohort_version: cohort/version prefix substituted into the template.
        autosomes: iterable of chromosome labels to include, in output order.
        pfile_template: optional format string with `{genodir}`,
            `{cohortversion}` and `{chr}` fields. When None (the default) the
            notebook-level `pfiles` template is used, which preserves the
            original behavior for existing callers.

    Returns:
        Path of the merge-list file that was written.
    """
    if pfile_template is None:
        # fall back to the notebook-level template so existing calls are unchanged
        pfile_template = pfiles
    merge_file_set = f'{geno_dir}/bfile_merge-list.txt'
    with open(merge_file_set, 'w') as file_handler:
        for chrom in autosomes:
            this_pfile = pfile_template.format(genodir=geno_dir,
                                               cohortversion=cohort_version,
                                               chr=chrom)
            file_handler.write(f'{this_pfile}.bfile\n')
    return merge_file_set

def run_plink_bfile_merge(merge_file_set, genome_bfile):
    this_cmd = f'plink --merge-list {merge_file_set} --make-bed --allow-no-sex \
    --silent --out {genome_bfile} --maf 0.01 --geno 0.05 --hwe 0.000001'
    !{this_cmd}

# merge the per chrom bfiles into a genome bfile
missnp_file = f'{genome_bfile}-merge.missnp'

# The presence of the missnp file is used below as the signal that the merge hit
# problem variants. Remove any copy left over from an earlier run first, so the
# check reflects the merge attempted now rather than stale output.
if os.path.exists(missnp_file):
    os.remove(missnp_file)

merge_file_set = frmt_merge_list_file(geno_dir, cohort_version, autosomes)
run_plink_bfile_merge(merge_file_set, genome_bfile)

# if there was a missnp problem, remove those variants and re-attempt the merge
if os.path.exists(missnp_file):
    print('removing problem variants and retrying merge')
    # regenerate every per-chromosome bfile with the listed variants excluded;
    # leaving the `with` block waits for all conversions to finish
    with concurrent.futures.ProcessPoolExecutor() as ppe:
        for chrom in autosomes:
            this_pfile = pfiles.format(genodir=geno_dir, cohortversion=cohort_version, chr=chrom)
            this_cmd = (f'plink2 --pfile {this_pfile} --make-bed --out {this_pfile}.bfile '
                        f'--silent --exclude {missnp_file}')
            ppe.submit(run_bash_cmd, this_cmd)

    # try the merge again
    merge_file_set = frmt_merge_list_file(geno_dir, cohort_version, autosomes)
    run_plink_bfile_merge(merge_file_set, genome_bfile)

In [None]:
# list every file whose name starts with the merged genome bfile prefix
!ls {genome_bfile}*
# show the first and last lines of the .log file at that prefix
!head {genome_bfile}.log
!tail {genome_bfile}.log