## Notebook for prepping the ADRD Aging genotypes for NIMH Human Brain Collection Core
The original ped/map files for the 3 Illumina chip types are from [here](gs://nihnialng-aging-brain/genotypes): H1M, H5M4, H650K

- H1M=Human1M-Duov3_B
- H650K=HumanHap650Yv3.0
- H5M4=HumanOmni5-Quad

basically need to:
- merge genotypes from the different Illumina platforms; there are two sets of these, one from phase1 and a separate set for just the additional phase2 subjects
- liftover from hg19 to hg38
- re-order the chromosomes from the typical numeric order to 10X's lexicographical order

In [1]:
# print the current date and time; first cell of the notebook, so this marks when the run began
!date

Fri Aug 11 16:02:18 EDT 2023


#### import libraries

In [2]:
from pandas import read_csv, DataFrame
from os.path import exists
from os import sched_getaffinity
from numpy import append

#### set notebook variables

In [3]:
# naming: prefix used for the files this notebook writes
proj_name = 'aging_phase2'

# directories, all derived from the project working directory
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
genos_dir = f'{wrk_dir}/genotypes'
info_dir = f'{wrk_dir}/sample_info'
public_dir = f'{wrk_dir}/public'
src_dir = f'{wrk_dir}/src_data'

# in files
info_file = f'{info_dir}/{proj_name}.sample_info.csv'
# NOTE: despite sitting under 'in files', this VCF is written by this notebook
# (bcftools concat output) and then split per pool
demuxlet_vcf_file = f'{genos_dir}/{proj_name}.hg38.demuxlet.vcf.gz'
# trailing '*' is a glob, so the fasta and any sibling files sharing the prefix are copied
fasta_index_bucket_path = 'gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta*'
fasta_dict_bucket_path = 'gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict'

# variables
# DEBUG also controls whether run_bash_command echoes each command line
DEBUG = False
# number of CPUs this process is allowed to run on
cpu_count = len(sched_getaffinity(0))
# second field of the MemTotal line of /proc/meminfo; later passed to java as -Xmx{max_mem}k
capture_out = !grep MemTotal /proc/meminfo | awk '{print $2}'
max_mem = int(capture_out[0])

# file name prefixes of the per-chip ped/map files
hbcc_ped_prefixes = ['H1M', 'H5M4', 'H650K']
# when combining the final vcf for demuxlet the chromosomes need to be
# sorted to match 10x's lexicographical order
autosomes = [1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 2, 20, 21, 22, 3, 4, 5, 6, 7, 8, 9]
# candidate lane numbers (1 through 8) probed when looking for per-pool source data
lane_range = range(1, 9)

#### utility functions

In [4]:
def run_bash_command(cmd_line: str, verbose: bool=False):
    """Run a shell command line through IPython's `!` shell escape.

    Args:
        cmd_line: the complete command line to execute.
        verbose: when True, print the command line before running it.

    Nothing is returned and the command's exit status is not checked, so a
    failing command does not stop the notebook.
    """
    if verbose:
        print(cmd_line)
    !{cmd_line}

### load sample info

In [5]:
# read the sample info table and report its dimensions
info_df = read_csv(info_file)
print(f'shape of sample info: {info_df.shape}')
if DEBUG:
    display(info_df.head())

shape of sample info: (36, 13)


#### drop the non-pooled samples and make sure pool nums are ints

In [6]:
# keep only samples assigned to both a GEX pool and an ATAC pool; dropna()
# returns a new frame, so the cast below never writes into a slice of the
# loaded table (avoids pandas' SettingWithCopyWarning)
pool_cols = ['gex_pool', 'atac_pool']
info_df = info_df.dropna(subset=pool_cols)
print(f'shape of info {info_df.shape}')
# make sure pool nums are ints and not floats
info_df = info_df.astype({pool_col: 'int' for pool_col in pool_cols})
if DEBUG:
    display(info_df.head())
    display(info_df.gex_pool.value_counts())
    display(info_df.atac_pool.value_counts())

shape of info (33, 13)


### convert pedmaps to bfiles

In [7]:
# convert every phase/chip ped+map pair into a plink binary fileset that is
# written next to its source files
for phase in ['phase1', 'phase2']:
    phase_dir = f'{genos_dir}/{phase}'
    for chip_type in hbcc_ped_prefixes:
        pedmap_prefix = f'{phase_dir}/{chip_type}'
        this_cmd = (f'plink --ped {pedmap_prefix}.ped '
                    f'--map {pedmap_prefix}.map '
                    f'--make-bed --out {phase_dir}/{proj_name}_{chip_type} --silent')
        run_bash_command(this_cmd, DEBUG)

/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/phase1/aging_phase2_H1M.hh
); many commands treat these as missing.
treat these as missing.
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/phase1/aging_phase2_H5M4.hh
); many commands treat these as missing.
treat these as missing.
treat these as missing.
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/phase2/aging_phase2_H1M.hh
); many commands treat these as missing.
treat these as missing.
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/phase2/aging_phase2_H5M4.hh
); many commands treat these as missing.
treat these as missing.
treat these as missing.


### merge the project phases per geno chip platform

In [8]:
# for each chip type, merge the phase1 and phase2 filesets into one fileset
# written directly under genos_dir
for chip_type in hbcc_ped_prefixes:
    print(chip_type)
    chip_fileset = f'{proj_name}_{chip_type}'
    this_cmd = ' '.join([
        'plink',
        f'--bfile {genos_dir}/phase1/{chip_fileset}',
        f'--bmerge {genos_dir}/phase2/{chip_fileset}',
        f'--out {genos_dir}/{chip_fileset}',
        '--silent',
    ])
    run_bash_command(this_cmd, DEBUG)

H1M
H5M4
H650K


In [9]:
# list the genotype directory to confirm the per-chip merged filesets were written
!ls -lh {genos_dir}

total 284M
-rw-rw-r--. 1 gibbsr gibbsr 4.6M Aug 11 16:02 aging_phase2_H1M.bed
-rw-rw-r--. 1 gibbsr gibbsr  39M Aug 11 16:02 aging_phase2_H1M.bim
-rw-rw-r--. 1 gibbsr gibbsr  402 Aug 11 16:02 aging_phase2_H1M.fam
-rw-rw-r--. 1 gibbsr gibbsr  13K Aug 11 16:02 aging_phase2_H1M.log
-rw-rw-r--. 1 gibbsr gibbsr  13M Aug 11 16:02 aging_phase2_H5M4.bed
-rw-rw-r--. 1 gibbsr gibbsr 146M Aug 11 16:02 aging_phase2_H5M4.bim
-rw-rw-r--. 1 gibbsr gibbsr  378 Aug 11 16:02 aging_phase2_H5M4.fam
-rw-rw-r--. 1 gibbsr gibbsr 5.1M Aug 11 16:02 aging_phase2_H5M4.log
-rw-rw-r--. 1 gibbsr gibbsr 1.9M Aug 11 16:02 aging_phase2_H650K.bed
-rw-rw-r--. 1 gibbsr gibbsr  22M Aug 11 16:02 aging_phase2_H650K.bim
-rw-rw-r--. 1 gibbsr gibbsr  252 Aug 11 16:02 aging_phase2_H650K.fam
-rw-rw-r--. 1 gibbsr gibbsr 1.6K Aug 11 16:02 aging_phase2_H650K.log
drwxrwxr-x. 2 gibbsr gibbsr 4.0K Aug 11 16:02 phase1
drwxrwxr-x. 2 gibbsr gibbsr 8.0K Aug 11 16:02 phase2


#### see what variants are shared between platforms provided, by variant name

In [10]:
# running intersection of variant names across the per-chip bim files
shared_variants = None
for chip_type in hbcc_ped_prefixes:
    # raw string: '\s' is not a valid Python string escape, so the regex
    # separator must not go through normal string-escape processing
    bim_df = read_csv(f'{genos_dir}/{proj_name}_{chip_type}.bim', sep=r'\s+', header=None)
    # in the bim file the variant name is the 2nd column (index 1)
    chip_variants = set(bim_df[1])
    if shared_variants is None:
        shared_variants = chip_variants
    else:
        shared_variants = shared_variants & chip_variants
    print(f'{chip_type} shape={bim_df.shape} shared variant size = {len(shared_variants)}')

H1M shape=(1192666, 6) shared variant size = 1192666
H5M4 shape=(4437269, 6) shared variant size = 971362
H650K shape=(660918, 6) shared variant size = 561035


#### looks like a decent number of variants are shared, proceed to merging

#### merge the per Illumina chip type for the plink bfiles

In [11]:
# merge the files into a single plink binary set
merge_file_set = f'{genos_dir}/merge-list.txt'
bfile_set = f'{genos_dir}/{proj_name}'
# list of problem variants; its presence after the merge triggers the retry below
missnp_file = f'{bfile_set}-merge.missnp'

# drop any .missnp left over from an earlier run so the retry decision below
# reflects only the merge attempted in this run
run_bash_command(f'rm -f {missnp_file}', DEBUG)

# one per-chip fileset prefix per line, the input for plink --merge-list
with open(merge_file_set, 'w') as file_handler:
    for chip_type in hbcc_ped_prefixes:
        prefix_file_set = f'{genos_dir}/{proj_name}_{chip_type}'
        file_handler.write(f'{prefix_file_set}\n')

# merge the per platform bfiles into a merged bfile
this_cmd = f'plink --merge-list {merge_file_set} --make-bed --allow-no-sex \
--keep-allele-order --silent --out {bfile_set}'
run_bash_command(this_cmd, DEBUG)

# if there was a missnp problem remove those variants and re-attempt the merge
if exists(missnp_file):
    # write '.temp' copies of every per-chip fileset without the listed variants
    for chip_type in hbcc_ped_prefixes:
        this_cmd = f'plink --bfile {genos_dir}/{proj_name}_{chip_type} \
--silent --exclude {missnp_file} \
--keep-allele-order --make-bed \
--out {genos_dir}/{proj_name}_{chip_type}.temp'
        run_bash_command(this_cmd, DEBUG)

    # point the merge list at the filtered '.temp' filesets
    with open(merge_file_set, 'w') as file_handler:
        for chip_type in hbcc_ped_prefixes:
            prefix_file_set = f'{genos_dir}/{proj_name}_{chip_type}.temp'
            file_handler.write(f'{prefix_file_set}\n')

    # NOTE(review): --geno 0.05 is applied only on this retry path; a merge that
    # succeeds on the first attempt is not filtered on missingness. Confirm
    # whether the first attempt should carry the same filter.
    this_cmd = f'plink --merge-list {merge_file_set} --make-bed --allow-no-sex \
--keep-allele-order --silent --out {bfile_set} --geno 0.05'
    run_bash_command(this_cmd, DEBUG)

Error: 1 variant with 3+ alleles present.
* If you believe this is due to strand inconsistency, try --flip with
  /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2-merge.missnp.
  alleles probably remain in your data.  If LD between nearby SNPs is high,
  --flip-scan should detect them.)
* If you are dealing with genuine multiallelic variants, we recommend exporting
  that subset of the data to VCF (via e.g. '--recode vcf'), merging with
  another tool/script, and then importing the result; PLINK is not yet suited
  to handling them.
See https://www.cog-genomics.org/plink/1.9/data#merge3 for more discussion.
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2_H1M.temp.hh
); many commands treat these as missing.
treat these as missing.
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2_H5M4.temp.hh
); many commands treat these as missing.
treat these as missing.
treat these as missing.
/labshare/raph/datasets/

In [12]:
# list the merged fileset outputs, oldest first
!ls -lhtr {genos_dir}/{proj_name}.*

-rw-rw-r--. 1 gibbsr gibbsr  162 Aug 11 16:03 /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.nosex
-rw-rw-r--. 1 gibbsr gibbsr 446K Aug 11 16:03 /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hh
-rw-rw-r--. 1 gibbsr gibbsr 4.5M Aug 11 16:03 /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.bed
-rw-rw-r--. 1 gibbsr gibbsr 1.1K Aug 11 16:03 /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.fam
-rw-rw-r--. 1 gibbsr gibbsr  17M Aug 11 16:03 /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.bim
-rw-rw-r--. 1 gibbsr gibbsr  51M Aug 11 16:03 /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.log


In [13]:
# show the last 25 lines of the merge log
!tail -n 25 {genos_dir}/{proj_name}.log

35 people (15 males, 10 females, 10 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.nosex
.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 35 founders and 0 nonfounders present.
Calculating allele frequencies... done.
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hh
); many commands treat these as missing.
treat these as missing.
Total genotyping rate is 0.446112.
4141901 variants removed due to missing genotype data (--geno).
523522 variants and 35 people pass filters and QC.
Note: No phenotypes present.
--make-bed to
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.bed
+
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.bim
+
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.fam
... done.

End time: Fri Aug 11 16:03:32 2023


#### load the merged bim file and see the chrom counts

In [14]:
# load the merged bim file; raw string because '\s' is not a valid Python
# string escape and the regex separator must reach pandas unmodified
bim_df = read_csv(f'{bfile_set}.bim', sep=r'\s+', header=None)
print(f'shape of bim file {bim_df.shape}')
if DEBUG:
    display(bim_df.sample(10))
    # per-chromosome variant counts (chromosome is the 1st bim column)
    display(bim_df[0].value_counts())

shape of bim file (523522, 6)


#### find variants to exclude that aren't SNVs

In [15]:
# tally the allele codes found in each of the two allele columns of the bim
for allele_col in (4, 5):
    display(bim_df[allele_col].value_counts())

nucleotides = ['A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n', 0]
# NOTE(review): the integer 0 in this list does not appear to match the '0'
# allele code; in the recorded run the number of variants dropped here equals
# the count of '0' codes in column 4. Confirm whether those variants are meant
# to be excluded.
has_base_alleles = bim_df[4].isin(nucleotides) & bim_df[5].isin(nucleotides)
vars_to_include = bim_df.loc[has_base_alleles]
print(f'vars to include shape {vars_to_include.shape}')
if DEBUG:
    display(vars_to_include.head())

# one variant name per line (bim column index 1), no header
vars_to_include[1].to_csv(f'{genos_dir}/variants_to_keep.txt', header=False, index=False)

A    278976
G    193036
C     45887
0      5619
T         4
Name: 4, dtype: int64

A    241467
G    230607
C     51430
T        18
Name: 5, dtype: int64

vars to include shape (517903, 6)


### convert to vcf, exclude InDels

In [16]:
# export the merged fileset as a bgzipped VCF 4.2, restricted to the variants
# listed in variants_to_keep.txt
plink2_args = [
    f'plink2 --bfile {bfile_set} --silent',
    f'--export vcf-4.2 bgz id-paste=iid --out {bfile_set}',
    '--output-chr chrM --not-chr 0 --snps-only',
    f'--extract {genos_dir}/variants_to_keep.txt',
]
this_cmd = ' '.join(plink2_args)

run_bash_command(this_cmd, DEBUG)

### re-ID the NIMH HBCC samples using the provided mapping

In [17]:
# write the rename file for bcftools: tab-separated, no header, one row per
# sample that has a genotype ID, pairing geno_IID with sample_id
temp_df = info_df.loc[info_df.geno_IID.notna()]
rename_columns = ['geno_IID', 'sample_id']
temp_df[rename_columns].to_csv(f'{bfile_set}.rename.sample.list',
                               sep='\t', header=False, index=False)

In [18]:
# rewrite the sample names in the exported VCF header using the rename list
in_vcf = f'{bfile_set}.vcf.gz'
out_vcf = f'{bfile_set}.renamed.vcf.gz'
this_cmd = ' '.join([
    f'bcftools reheader --sample {bfile_set}.rename.sample.list',
    f'--output {out_vcf} --threads 2 {in_vcf}',
])
run_bash_command(this_cmd, DEBUG)

#### check which subjects are present and properly renamed

In [19]:
# dump the sample names carried by the renamed VCF and load them back
temp_sample_list_file = f'{bfile_set}.sample.list'
this_cmd = f'bcftools query --list-samples {out_vcf} > {temp_sample_list_file}'
run_bash_command(this_cmd, DEBUG)
ids_present_df = read_csv(temp_sample_list_file, header=None, names=['sample_id'])
print(f'shape of IDs present {ids_present_df.shape}')

expected_ids = set(info_df.sample_id)
present_ids = set(ids_present_df.sample_id)
# which expected aren't present
temp_result = expected_ids.difference(present_ids)
print(f'expected but not found {temp_result}')
# and then found but not expected
temp_result = present_ids.difference(expected_ids)
print(f'found but not expected {temp_result}')

shape of IDs present (35, 1)
expected but not found {'Aging134', 'Aging142'}
found but not expected {'4040296074_A', '3999495136_R02C01', '4463344122_R01C02', '4572348740_R01C02'}


#### need to liftover from hg19 to hg38
use Picard, slower but better than CrossMap

#### get Picard

In [20]:
# grab the picard jar (release 2.27.5) into the public dir
# NOTE(review): with --quiet a failed download prints nothing; a bad jar would only surface when java runs
!wget --quiet  https://github.com/broadinstitute/picard/releases/download/2.27.5/picard.jar \
-O {public_dir}/picard.jar

#### grab necessary ref files

In [21]:
# copy the hg38 reference fasta files and sequence dictionary into the public dir
for bucket_path in (fasta_index_bucket_path, fasta_dict_bucket_path):
    this_cmd = f'gsutil -mq cp -P {bucket_path} {public_dir}/'
    run_bash_command(this_cmd, DEBUG)

In [22]:
# also need the hg19 files: the fasta files and the sequence dictionary
hg19_bucket_paths = (
    'gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta*',
    'gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict',
)
for bucket_path in hg19_bucket_paths:
    this_cmd = f'gsutil -mq cp -P {bucket_path} {public_dir}/'
    run_bash_command(this_cmd, DEBUG)

In [23]:
# grab the UCSC hg19 -> hg38 liftover chain file into the public dir
!wget --quiet  http://hgdownload.cse.ucsc.edu/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
-O {public_dir}/hg19ToHg38.over.chain.gz

#### run the liftover

In [24]:
# input is the renamed VCF; lifted records and rejected records go to separate VCFs
vcf = f'{bfile_set}.renamed.vcf.gz'
out_vcf_name = f'{bfile_set}.hg38.vcf.gz'
out_vcf_unmapped_name = f'{bfile_set}.hg38unmapped.vcf.gz'

picard_args = [
    f'INPUT={vcf}',
    f'OUTPUT={out_vcf_name}',
    f'CHAIN={public_dir}/hg19ToHg38.over.chain.gz',
    f'REJECT={out_vcf_unmapped_name}',
    f'REFERENCE_SEQUENCE={public_dir}/Homo_sapiens_assembly38.fasta',
    'MAX_RECORDS_IN_RAM=5000000 QUIET=true RECOVER_SWAPPED_REF_ALT=true',
]
# the JVM heap ceiling is the machine's MemTotal value with a 'k' suffix
java_prefix = f'java -Xmx{max_mem}k -jar {public_dir}/picard.jar LiftoverVcf'
this_cmd = ' '.join([java_prefix] + picard_args)

run_bash_command(this_cmd, DEBUG)

INFO	2023-08-11 16:06:41	LiftoverVcf	

********** NOTE: Picard's command line syntax is changing.
**********
********** For more information, please see:
********** 
https://github.com/broadinstitute/picard/wiki/Command-Line-Syntax-Transition-For-Users-(Pre-Transition)
**********
********** The command line looks like this in the new syntax:
**********
**********    LiftoverVcf -INPUT /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.renamed.vcf.gz -OUTPUT /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hg38.vcf.gz -CHAIN /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/public/hg19ToHg38.over.chain.gz -REJECT /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hg38unmapped.vcf.gz -REFERENCE_SEQUENCE /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/public/Homo_sapiens_assembly38.fasta -MAX_RECORDS_IN_RAM 5000000 -QUIET true -RECOVER_SWAPPED_REF_ALT true
**********


16:06:41.386 INFO  NativeLibrar

#### split vcf by chromosome so can be recombined in specified chromosome order

In [25]:
%%time
vcf_files = []
for chrom in autosomes:
    in_vcf = f'{bfile_set}.hg38.vcf.gz'
    out_vcf = f'{bfile_set}.hg38.chr{chrom}'
    # use plink2 instead of bcftools so header is reduced specifically for contigs
    this_cmd = f'plink2 --vcf {in_vcf} --silent --chr {chrom} --not-chr 0 \
--export vcf-4.2 bgz id-paste=iid --out {out_vcf} --output-chr chrM --allow-extra-chr'
    print(chrom, end='.')
    run_bash_command(this_cmd, DEBUG)

1.10.11.12.13.14.15.16.17.18.19.2.20.21.22.3.4.5.6.7.8.9.CPU times: user 67 ms, sys: 250 ms, total: 317 ms
Wall time: 5.72 s


#### concat chromosome vcfs (in order) into genome vcf

In [26]:
# per-chromosome VCF paths, listed in the same order as `autosomes` so the
# concatenated file keeps that chromosome order
vcf_files = [f'{bfile_set}.hg38.chr{chrom}.vcf.gz' for chrom in autosomes]

vcf_files_arg = ' '.join(vcf_files)
this_cmd = (f'bcftools concat --output-type z --output {demuxlet_vcf_file} '
            f'--threads {cpu_count} --no-version {vcf_files_arg}')

run_bash_command(this_cmd, DEBUG)

Checking the headers and starting positions of 22 files
Concatenating /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hg38.chr1.vcf.gz	0.023344 seconds
Concatenating /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hg38.chr10.vcf.gz	0.009232 seconds
Concatenating /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hg38.chr11.vcf.gz	0.006755 seconds
Concatenating /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hg38.chr12.vcf.gz	0.005781 seconds
Concatenating /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hg38.chr13.vcf.gz	0.005124 seconds
Concatenating /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hg38.chr14.vcf.gz	0.006616 seconds
Concatenating /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/genotypes/aging_phase2.hg38.chr15.vcf.gz	0.004019 seconds
Concatenating /labshare/raph/datasets/adrd_neuro/brain_aging/ph

#### index the final vcf

In [27]:
!tabix --preset vcf {demuxlet_vcf_file}

### split vcfs per pool
i.e. split the vcf by the samples in each pool; with many extra samples in the vcf, demuxlet tends to end up under-assigning singlets, in my experience

smallish number of samples per pool, six, so can specify them on bcftools cmdline

need to force an ID update for sample Aging134: both Aging104 and Aging134 are from the same donor, so they are duplicates, but the genotypes are currently labelled as Aging104. Since they are separate samples from the same donor, the sample ID needs to be corrected; this affects Pool6 for GEX and ATAC

#### need to force an ID update for sample Aging134: both Aging104 and Aging134 are from the same donor, so they are duplicates, but the genotypes are currently labelled as Aging104. Since they are separate samples from the same donor, the sample ID needs to be corrected; this affects Pool6 for GEX and ATAC

#### format expected pool names

In [28]:
# build one row per pool/lane whose source data exists on disk, GEX pools
# first and then ATAC pools
pool_names, pools, lanes = [], [], []
assay_pool_ids = (
    ('GEX', info_df.gex_pool.unique()),
    ('ATAC', info_df.atac_pool.unique()),
)
for assay, pool_ids in assay_pool_ids:
    for pool in pool_ids:
        for lane in lane_range:
            pool_name = f'{assay}_P{pool}_{lane}'
            # source data lives under the lower-cased assay directory
            if not exists(f'{src_dir}/{assay.lower()}/sample_ec_{pool_name}'):
                continue
            pool_names.append(pool_name)
            pools.append(pool)
            lanes.append(lane)
this_data = {'name': pool_names, 'pool': pools, 'lane': lanes}
pools_df = DataFrame(data=this_data)
print(f'shape of expected pools {pools_df.shape}')
if DEBUG:
    display(pools_df.head())

shape of expected pools (53, 3)


In [29]:
print(f'number of pools defined = {pools_df.shape}')
# pool 6 entries also need the genotypes labelled Aging104, which are
# relabelled as Aging134 for those pools afterwards
affected_pools = pools_df.loc[(pools_df.pool == 6)]
for row in pools_df.itertuples():
    # the samples of a pool come from the column matching the pool's assay
    if row.name.startswith('GEX'):
        pool_samples = info_df.loc[info_df.gex_pool == row.pool].sample_id.values
    elif row.name.startswith('ATAC'):
        pool_samples = info_df.loc[info_df.atac_pool == row.pool].sample_id.values
    else:
        # fail loudly instead of silently reusing the previous pool's samples
        raise ValueError(f'unrecognized pool name {row.name}')
    if row.name in affected_pools.name.values:
        pool_samples = append(pool_samples, 'Aging104')
    fmt_samples_arg = ','.join(pool_samples)
    out_vcf = f'{genos_dir}/{proj_name}.hg38.demuxlet.{row.name}.vcf.gz'
    # --force-samples: requested samples missing from the VCF are skipped with a warning
    this_cmd = f'bcftools view --samples {fmt_samples_arg} --force-samples \
--output {out_vcf} --output-type z {demuxlet_vcf_file} --threads {cpu_count}'
    run_bash_command(this_cmd, DEBUG)
    # index the pool vcf; -f overwrites an index left by an earlier run
    run_bash_command(f'tabix -f --preset vcf {out_vcf}', DEBUG)

number of pools defined = (53, 3)
['Aging130' 'Aging131' 'Aging132' 'Aging133' 'Aging134' 'Aging135'
 'Aging138' 'Aging139' 'Aging104']
Warn: subset called for sample that does not exist in header: "Aging134"... skipping
['Aging130' 'Aging131' 'Aging132' 'Aging133' 'Aging134' 'Aging135'
 'Aging138' 'Aging139' 'Aging104']
Warn: subset called for sample that does not exist in header: "Aging134"... skipping
['Aging130' 'Aging131' 'Aging132' 'Aging133' 'Aging134' 'Aging135'
 'Aging138' 'Aging139' 'Aging104']
Warn: subset called for sample that does not exist in header: "Aging134"... skipping
['Aging130' 'Aging131' 'Aging132' 'Aging133' 'Aging134' 'Aging135'
 'Aging138' 'Aging139' 'Aging104']
Warn: subset called for sample that does not exist in header: "Aging134"... skipping
['Aging130' 'Aging131' 'Aging132' 'Aging133' 'Aging134' 'Aging135'
 'Aging138' 'Aging139' 'Aging104']
Warn: subset called for sample that does not exist in header: "Aging134"... skipping
['Aging130' 'Aging131' 'Aging13

In [30]:
# create a rename file for bcftools to use: a single line mapping the current
# sample name Aging104 to the new name Aging134
pool6_rename_file = f'{genos_dir}/P6.duplicate.rename.list'
# the \t is expanded by Python, so the shell command already contains a real tab character
this_cmd = f'echo "Aging104\tAging134" > {pool6_rename_file}'
!{this_cmd}

In [31]:

# relabel Aging104 as Aging134 in the VCF of every affected pool
for pool in affected_pools.name:
    in_vcf = f'{genos_dir}/{proj_name}.hg38.demuxlet.{pool}.vcf.gz'
    temp_vcf = f'{genos_dir}/{proj_name}.hg38.demuxlet.{pool}.temp.vcf.gz'
    # move the pool vcf aside so the reheadered copy can take its name
    this_cmd = f'mv {in_vcf} {temp_vcf}'
    run_bash_command(this_cmd, DEBUG)
    this_cmd = f'bcftools reheader --sample {pool6_rename_file} \
--output {in_vcf} --threads 2 {temp_vcf}'
    run_bash_command(this_cmd, DEBUG)
    # index the reheadered pool vcf (the file written just above, not the
    # out_vcf left over from the per-pool split); -f replaces the index built
    # before the rename
    run_bash_command(f'tabix -f --preset vcf {in_vcf}', DEBUG)
    # remove temp file
    run_bash_command(f'rm {temp_vcf}', DEBUG)

In [32]:
# print the current date and time; last cell of the notebook, so this marks when the run finished
!date

Fri Aug 11 16:08:35 EDT 2023
