## Notebook to split larger genotype callset into individual AMP-PD cohorts

#### import libraries and set notebook variables

In [1]:
import pandas as pd
import os
import threading

In [2]:
# input locations: the joint-called genomes and the combined AMP-PD sample sheet
genomes_dir = '/labseq/projects/neurod_wgs/genotypes'
amppd_psam = '/labseq/projects/neurod_wgs/sample_info/amppd.psam'

# callset build label; used when naming the per-cohort outputs
build = 'freeze9'
source_pfiles_prefix = f'{genomes_dir}/neurod.freeze9'

# output locations under the working directory
wrk_dir = '/labshare/raph/datasets/amppd'
genos_dir = f'{wrk_dir}/genotypes'
info_dir = f'{wrk_dir}/sample_info'

# chromosomes to process: autosomes 1-22 followed by X
autosomes = list(map(str, range(1, 23)))
sexomes = ['X']
chromosomes = [*autosomes, *sexomes]

# cohort name -> prefix that the cohort's sample IDs start with
cohort_prefix = {
    'ppmi': 'PP-',
    'pdbp': 'PD-',
    'biofind': 'BF-',
    'hbs': 'HB-',
}

#### threading related functions

In [3]:
#### threading related function

# run a command line process with the bash magic
# kept as a small function so it can be the target function for threading
def run_bash_cmd(this_cmd):
    # `!{...}` is notebook shell-escape syntax, not plain Python; the braces
    # substitute the value of this_cmd into the shell command line
    !{this_cmd}

# run every bash command in the list on its own thread and wait until all are done
def run_bash_cmds_threaded(cmd_list):
    """Run each command in cmd_list concurrently and block until all finish.

    One thread is created per command, with run_bash_cmd as its target and
    the command as its only argument. Nothing is returned.
    """
    workers = [threading.Thread(target=run_bash_cmd, args=(cmd,))
               for cmd in cmd_list]

    # start everything first so the commands overlap, then wait on each in turn
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

#### make sure the needed output directories exist

In [4]:
# create the output directories; exist_ok makes this a no-op when they already exist
for out_dir in (genos_dir, info_dir):
    os.makedirs(out_dir, exist_ok=True)

#### generate per cohort psams from larger cohort

In [5]:
# read the psam, splitting columns on runs of whitespace
# the separator is a raw string so `\s` reaches the regex engine untouched;
# in a plain string literal `\s` is an invalid escape sequence and triggers a warning
psam_df = pd.read_csv(amppd_psam, sep=r'\s+')
print(psam_df.shape)
psam_df.head()

(4298, 4)


Unnamed: 0,#FID,IID,SEX,DX
0,PP-41564,PP-41564,1,GR-unaffected
1,PD-PDZV843ATF,PD-PDZV843ATF,1,Control
2,PD-PDCK871NBR,PD-PDCK871NBR,1,Control
3,PP-51718,PP-51718,1,GR-unaffected
4,PP-56267,PP-56267,1,GR-unaffected


In [6]:
# write one tab-separated psam per cohort, keeping the rows whose IID starts
# with that cohort's prefix
for cohort, prefix in cohort_prefix.items():
    print(f'{cohort} {prefix}')
    in_cohort = psam_df['IID'].str.startswith(prefix)
    cohort_psam_df = psam_df[in_cohort]
    print(cohort_psam_df.shape)
    cohort_psam_file = f'{info_dir}/{cohort}.psam'
    cohort_psam_df.to_csv(cohort_psam_file, sep='\t', index=False)

ppmi PP-
(1610, 4)
pdbp PD-
(1599, 4)
biofind BF-
(213, 4)
hbs HB-
(876, 4)


#### for each cohort, subset the plink2 pfile set

In [8]:
def frmt_plink2_subset(in_pfiles_prefix, genos_dir, out_name, chrom,
                       keep_file, min_mac=1):
    """Format the plink2 command that subsets one chromosome's pfile set.

    Args:
        in_pfiles_prefix: prefix of the source pfiles; `.chr{chrom}` is appended.
        genos_dir: directory the subset pfiles are written into.
        out_name: base name of the output pfiles; `.chr{chrom}` is appended.
        chrom: chromosome label used in both the input and output names.
        keep_file: file passed to plink2 `--keep`.
        min_mac: value passed to plink2 `--mac` (default 1).

    Returns:
        The plink2 command line as a string, or the string '#error' when the
        input and output pfile prefixes are identical.
    """
    in_pfiles = f'{in_pfiles_prefix}.chr{chrom}'
    out_pfiles = f'{genos_dir}/{out_name}.chr{chrom}'
    if in_pfiles == out_pfiles:
        print('in name cannot be same of out name, here')
        # return right away: falling through would build a real command that
        # writes its output on top of its own input files
        return '#error'

    return (f'plink2 --pfile {in_pfiles} --keep {keep_file} '
            f'--mac {min_mac} --silent --make-pgen --out {out_pfiles}')

# for each cohort, build one plink2 subset command per chromosome and run them
for cohort in cohort_prefix:
    print(cohort)
    cohort_build = f'{cohort}.{build}'
    keep_file = f'{info_dir}/{cohort}.psam'
    cmds = [
        frmt_plink2_subset(
            in_pfiles_prefix=source_pfiles_prefix,
            genos_dir=genos_dir,
            out_name=cohort_build,
            chrom=chrom,
            keep_file=keep_file,
        )
        for chrom in chromosomes
    ]

    # run this cohort's commands concurrently; returns once all have finished
    run_bash_cmds_threaded(cmds)

ppmi
pdbp
biofind
hbs
