In [1]:
import numpy as np 
import pandas as pd
import SAGEnet.data 
import SAGEnet.tools
from SAGEnet.models import pSAGEnet
from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import glob
import os
import pysam

  warn(


#### In this notebook, we go through the process of initializing a PersonalGenomeDataset and using this to train a p-SAGE-net model with the GEUVADIS dataset (publicly available).

#### Before you run this notebook: 
-- follow the steps under "installation" in the main READme to install dependencies and the SAGEnet package.

-- download the pre-processed GEUVADIS expression data 'tpm_pca_annot.csv.gz' from Rastogi et al.: https://github.com/ni-lab/finetuning-enformer/tree/main/process_geuvadis_data/tpm  

-- download the GEUVADIS VCF for chromosome 21 (to use as an example) from here: https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEUV-1. The file is called 'GEUVADIS.chr21.PH1PH2_465.IMPFRQFILT_BIALLELIC_PH.annotv2.genotypes.vcf.gz'  

-- run the following lines of code to be able to use pysam with this VCF file:   
```bash
gunzip GEUVADIS.chr21.PH1PH2_465.IMPFRQFILT_BIALLELIC_PH.annotv2.genotypes.vcf.gz # decompress 
bgzip GEUVADIS.chr21.PH1PH2_465.IMPFRQFILT_BIALLELIC_PH.annotv2.genotypes.vcf # recompress using bgzip
tabix -p vcf GEUVADIS.chr21.PH1PH2_465.IMPFRQFILT_BIALLELIC_PH.annotv2.genotypes.vcf.gz # index   
```

-- download the hg19 genome (to be consisent with GEUVADIS variant calls):    
```bash
curl -O https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz  
gunzip hg19.fa.gz
```

#### Change these paths based on your SAGEnet repo locaiton, where you have saved the files downloaded in the previous step, and where you want to save your model results. 

In [2]:
base_repo_path='/homes/gws/aspiro17/SAGEnet/'

hg19_path='/data/mostafavilab/personal_genome_expr/revisions/GEUVADIS/hg19.fa'
expr_path='/data/mostafavilab/personal_genome_expr/revisions/GEUVADIS/corrected_log_tpm.annot.csv.gz'
vcf_file_path='/data/mostafavilab/personal_genome_expr/revisions/GEUVADIS/all_vcf_files/GEUVADIS.chr21.PH1PH2_465.IMPFRQFILT_BIALLELIC_PH.annotv2.genotypes.bgz'

model_save_dir='data/mostafavilab/personal_genome_expr/revisions/GEUVADIS/res/'
os.makedirs(model_save_dir, exist_ok=True)

#### Use the tss_data_path provided in this github. Since the GEUVADIS variant calls are with respect to hg19 (not hg38, as was the case with our ROSMAP and GTEx data), update the tss and chr columns in gene_meta_info to reflect this. 

In [3]:
tss_data_path=f'{base_repo_path}input_data/gene-ids-and-positions.tsv'
gene_meta_info = pd.read_csv(tss_data_path, sep="\t",index_col='region_id')
gene_meta_info['chr'] = gene_meta_info['chr_hg19'].str.replace('chr', '', regex=False)
gene_meta_info['tss'] = pd.to_numeric(gene_meta_info['tss_hg19'], errors='coerce').astype('Int64')
gene_meta_info

Unnamed: 0_level_0,gene_name,gene_id,chr_hg38,start_hg38,end_hg38,strand_hg38,tss_hg38,chr_hg19,tss_hg19,tss,chr,ensg,strand,pos
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ENSG00000223972,DDX11L1,ENSG00000223972,1,11869,14409,+,11869,chr1,11869.0,11869,1,ENSG00000223972,+,11869
ENSG00000227232,WASH7P,ENSG00000227232,1,14404,29570,-,29570,chr1,29570.0,29570,1,ENSG00000227232,-,29570
ENSG00000278267,MIR6859-1,ENSG00000278267,1,17369,17436,-,17436,chr1,17436.0,17436,1,ENSG00000278267,-,17436
ENSG00000243485,MIR1302-2HG,ENSG00000243485,1,29554,31109,+,29554,chr1,29554.0,29554,1,ENSG00000243485,+,29554
ENSG00000284332,MIR1302-2,ENSG00000284332,1,30366,30503,+,30366,chr1,30366.0,30366,1,ENSG00000284332,+,30366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000271254,AC240274.1,ENSG00000271254,KI270711.1,4612,29626,-,29626,,,,,ENSG00000271254,-,29626
ENSG00000275405,U1,ENSG00000275405,KI270713.1,21861,22024,-,22024,,,,,ENSG00000275405,-,22024
ENSG00000275987,U1,ENSG00000275987,KI270713.1,30437,30580,-,30580,,,,,ENSG00000275987,-,30580
ENSG00000277475,AC213203.1,ENSG00000277475,KI270713.1,31698,32528,-,32528,,,,,ENSG00000277475,-,32528


#### Load the preprocessed expression data:

In [4]:
orig_expr_df = pd.read_csv(expr_path)
orig_expr_df

Unnamed: 0,TargetID,Gene_Symbol,Chr,Coord,HG00096,HG00097,HG00099,HG00100,HG00101,HG00102,...,NA20828,stable_id,gencode_v12_gene_name,our_gene_name,EUR_eGene,YRI_eGene,top_EUR_eqtl_rsid,top_YRI_eqtl_rsid,top_EUR_eqtl_distance,top_YRI_eqtl_distance
0,ENSG00000257527.1,ENSG00000257527.1,16,18505708,-0.057361,-0.313160,-0.684395,-1.209085,-0.012644,-0.270612,...,-1.127696,ENSG00000257527,rp11-1212a22.6,,False,False,,,,
1,ENSG00000151503.7,ENSG00000151503.7,11,134095348,3.653703,3.555238,3.969966,3.832266,3.620463,3.682108,...,3.984807,ENSG00000151503,ncapd3,,False,False,,,,
2,ENSG00000254681.2,ENSG00000254681.2,16,18495797,2.088882,2.326419,2.128807,2.199625,2.331783,2.627187,...,1.565265,ENSG00000254681,rp11-1212a22.3,,False,False,,,,
3,ENSG00000228477.1,ENSG00000228477.1,1,40428352,5.579332,5.352685,5.758683,6.045576,5.563191,5.176924,...,5.187391,ENSG00000228477,rp3-342p20.2,,False,False,,,,
4,ENSG00000159733.9,ENSG00000159733.9,4,2420390,-0.984586,-1.124469,-0.433654,-1.025796,-0.705150,-1.333362,...,0.044033,ENSG00000159733,zfyve28,zfyve28,True,False,rs4974687,,9347.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23717,ENSG00000137709.4,ENSG00000137709.4,11,120107349,-1.573193,0.184419,-1.035174,0.130528,-0.713233,0.147750,...,-0.414130,ENSG00000137709,pou2f3,,False,False,,,,
23718,ENSG00000006007.7,ENSG00000006007.7,16,19533467,2.938774,2.976678,2.681771,2.732348,2.782939,3.024868,...,3.017779,ENSG00000006007,gde1,,False,False,,,,
23719,ENSG00000172297.6,ENSG00000172297.6,Y,27600708,-1.760798,-1.955373,-1.859498,-1.676782,-4.246593,-2.011718,...,-2.351444,ENSG00000172297,golga2p3y,,False,False,,,,
23720,ENSG00000125266.5,ENSG00000125266.5,13,107187462,-1.917913,-1.706478,-1.403048,-2.520733,-1.504605,-1.454579,...,-2.262784,ENSG00000125266,efnb2,,False,False,,,,


#### Select some example individuals from the GEUVADIS dataset to use as a train, validaiton, and test set (make sure these sample names also exist in the expression data): 

In [5]:
with pysam.VariantFile(vcf_file_path) as vcf:
    all_samps = list(vcf.header.samples)

samps_in_expr_data = [samp for samp in all_samps if samp in orig_expr_df.columns]
print(f'total n samps: {len(samps_in_expr_data)}')

# shuffle and split into train, validation, and test
np.random.seed(12)
shuffled_indices = np.random.permutation(len(samps_in_expr_data))
shuffled_individs = np.array(samps_in_expr_data)[shuffled_indices]  # Convert to numpy array for indexing

# Split data
train_samps = shuffled_individs[:10]
val_samps = shuffled_individs[10:15]
test_samps = shuffled_individs[15:20]
all_samps = np.concatenate((train_samps,val_samps,test_samps))

print(f'train samps: {train_samps}')
print(f'val samps: {val_samps}')
print(f'test samps: {test_samps}')

total n samps: 462
train samps: ['NA20803' 'NA12830' 'HG00308' 'NA11892' 'NA12249' 'NA20804' 'NA12383'
 'NA12005' 'NA18498' 'NA19204']
val samps: ['NA18933' 'NA20536' 'HG00366' 'HG00277' 'NA20514']
test samps: ['HG00325' 'NA12272' 'HG00100' 'HG00332' 'NA20506']


[W::bcf_hdr_check_sanity] GL should be declared as Number=G


#### Select an example gene set to use for model training. Make sure that these genes are on chromosome 21 (this is the example VCF we are using). Usually we would split our gene set into train, validation, and test by chromosome, but since all of these example genes are from chromosome 21, we split randomly. 

In [7]:
chr21_genes = gene_meta_info[gene_meta_info['chr']=='21'].index.values
chr21_genes_in_expr_data = np.intersect1d(orig_expr_df['stable_id'], chr21_genes)

train_genes=chr21_genes_in_expr_data[:10]
val_genes=chr21_genes_in_expr_data[10:15]
test_genes=chr21_genes_in_expr_data[15:20]

print(train_genes)

['ENSG00000141956' 'ENSG00000141959' 'ENSG00000142149' 'ENSG00000142156'
 'ENSG00000142166' 'ENSG00000142168' 'ENSG00000142173' 'ENSG00000142178'
 'ENSG00000142185' 'ENSG00000142188']


#### Put the expression data into the format required by PersonalGenomeDataset (indexed by gene IDs, column values are sampele names):

In [8]:
expr_df = orig_expr_df[all_samps]
expr_df.index=orig_expr_df['stable_id']
expr_df

Unnamed: 0_level_0,NA20803,NA12830,HG00308,NA11892,NA12249,NA20804,NA12383,NA12005,NA18498,NA19204,NA18933,NA20536,HG00366,HG00277,NA20514,HG00325,NA12272,HG00100,HG00332,NA20506
stable_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ENSG00000257527,-0.955159,-1.045982,-0.898474,-1.269318,-0.439611,-1.501674,-0.995744,-1.149879,-0.563737,-4.145182,-0.436156,-0.204412,-0.663628,0.136591,-0.832092,-0.838569,-0.493279,-1.209085,-0.170537,-0.912007
ENSG00000151503,3.577920,3.348796,3.731877,3.543473,2.912565,3.160353,3.774759,3.223603,3.737564,3.546188,3.161365,3.816250,3.732621,3.606528,3.703678,3.824468,3.823922,3.832266,3.642056,3.665946
ENSG00000254681,2.036221,2.381775,1.782837,2.297884,1.203374,2.326845,2.016927,2.308166,2.080930,2.606330,2.723239,2.120876,1.989878,2.151429,1.922922,2.122954,2.603052,2.199625,2.489873,2.179513
ENSG00000228477,5.368516,5.535018,5.634284,5.643026,5.420429,5.404369,5.088399,5.791136,5.688514,5.398632,5.359600,5.697734,5.582426,5.460593,5.795808,5.631186,5.169079,6.045576,5.560037,5.496706
ENSG00000159733,-1.109934,0.777448,-1.476499,0.361167,1.048617,0.379605,0.330535,-0.719827,0.589121,-0.351958,-1.245378,-0.539014,-0.348463,0.695440,0.097511,-0.615162,0.057834,-1.025796,-0.905647,0.471690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000137709,-0.753234,1.269964,1.190510,0.744741,1.454462,-0.052893,-0.243307,0.646964,-0.528754,0.574150,-0.544949,-0.964710,0.262337,-1.183316,0.643111,0.722426,0.865159,0.130528,-0.190743,-0.380437
ENSG00000006007,2.873027,2.954443,2.898562,3.062070,2.956746,2.952475,2.862663,2.795224,2.872930,2.854083,2.841855,3.065036,3.412581,2.847144,2.869451,2.781907,2.691973,2.732348,3.031229,3.016198
ENSG00000172297,-2.161487,-1.467204,-1.569799,-3.318952,-3.600092,-2.635539,-1.343513,-2.140923,-2.417205,-4.356008,-2.056515,-1.888358,-1.292487,-1.651868,-2.030265,-2.172085,-1.980753,-1.676782,-2.083555,-2.157134
ENSG00000125266,-2.262790,-4.316878,-1.681150,-0.973592,-1.541674,-2.422960,-1.382643,-2.681801,-1.539816,-2.595265,-1.127857,-2.148006,-2.290481,-2.639456,-1.717789,-0.977396,-1.692599,-2.520733,-1.749358,-2.967128


#### Select train and validation gene meta information:

In [9]:
train_gene_meta = gene_meta_info.loc[train_genes]
val_gene_meta = gene_meta_info.loc[val_genes]
train_gene_meta

Unnamed: 0_level_0,gene_name,gene_id,chr_hg38,start_hg38,end_hg38,strand_hg38,tss_hg38,chr_hg19,tss_hg19,tss,chr,ensg,strand,pos
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ENSG00000141956,PRDM15,ENSG00000141956,21,41798225,41879482,-,41879482,chr21,43299591.0,43299591,21,ENSG00000141956,-,41879482
ENSG00000141959,PFKL,ENSG00000141959,21,44300051,44327376,+,44300051,chr21,45719934.0,45719934,21,ENSG00000141959,+,44300051
ENSG00000142149,HUNK,ENSG00000142149,21,31873315,32044633,+,31873315,chr21,33245628.0,33245628,21,ENSG00000142149,+,31873315
ENSG00000142156,COL6A1,ENSG00000142156,21,45981737,46005050,+,45981737,chr21,47401651.0,47401651,21,ENSG00000142156,+,45981737
ENSG00000142166,IFNAR1,ENSG00000142166,21,33324477,33359862,+,33324477,chr21,34696782.0,34696782,21,ENSG00000142166,+,33324477
ENSG00000142168,SOD1,ENSG00000142168,21,31659622,31668931,+,31659622,chr21,33031935.0,33031935,21,ENSG00000142168,+,31659622
ENSG00000142173,COL6A2,ENSG00000142173,21,46098097,46132849,+,46098097,chr21,47518011.0,47518011,21,ENSG00000142173,+,46098097
ENSG00000142178,SIK1,ENSG00000142178,21,43414515,43427128,-,43427128,chr21,44847008.0,44847008,21,ENSG00000142178,-,43427128
ENSG00000142185,TRPM2,ENSG00000142185,21,44350163,44443081,+,44350163,chr21,45770046.0,45770046,21,ENSG00000142185,+,44350163
ENSG00000142188,TMEM50B,ENSG00000142188,21,33432485,33480011,-,33480011,chr21,34852318.0,34852318,21,ENSG00000142188,-,33480011


#### Initialize training and validation PersonalGenomeDatasets. For more information on how to adjust the parameters to PersonalGenomeDataset to suit your needs, see https://github.com/mostafavilabuw/SAGEnet/blob/main/example_usage.ipynb and the class documentation. 

In [10]:
train_dataset = SAGEnet.data.PersonalGenomeDataset(metadata=train_gene_meta, vcf_file_path=vcf_file_path, hg38_file_path=hg19_path, sample_list=train_samps, y_data=expr_df,contig_prefix='',verbose=False)
val_subs_dataset = SAGEnet.data.PersonalGenomeDataset(metadata=train_gene_meta, vcf_file_path=vcf_file_path, hg38_file_path=hg19_path, sample_list=train_samps, y_data=expr_df,contig_prefix='',verbose=False)
val_genes_dataset = SAGEnet.data.PersonalGenomeDataset(metadata=val_gene_meta, vcf_file_path=vcf_file_path, hg38_file_path=hg19_path, sample_list=val_samps, y_data=expr_df,contig_prefix='',verbose=False)

train_dataloader = DataLoader(train_dataset,shuffle=True,num_workers=1)
val_subs_dataloader = DataLoader(val_subs_dataset, shuffle=False,num_workers=1)
val_genes_dataloader = DataLoader(val_genes_dataset,shuffle=False,num_workers=1)

acceptable maf range: -1<maf<2
avg is mean
acceptable maf range: -1<maf<2
avg is mean
acceptable maf range: -1<maf<2
avg is mean


#### Initialize a p-SAGE-net model. For more information on how to adjust the parameters to PersonalGenomeDataset to suit your needs, see https://github.com/mostafavilabuw/SAGEnet/blob/main/example_usage.ipynb and the class documentation.

In [11]:
my_model = pSAGEnet(model_save_dir=model_save_dir)

#### Set up for model training:

In [12]:
val_dataloaders=[val_subs_dataloader,val_genes_dataloader]
es = EarlyStopping(monitor="train_gene_val_sub_diff_loss/dataloader_idx_0", patience=5,mode='min')
lr_monitor = LearningRateMonitor(logging_interval='epoch')

# used to save every model epoch 
all_epoch_checkpoint_callback = ModelCheckpoint(
    dirpath=model_save_dir,  
    filename="{epoch}",  
    save_top_k=-1,  
    every_n_epochs=1,
    save_last=False
)

# save last ckpt to be able to resume model training if job is killed 
last_checkpoint_callback = ModelCheckpoint(
    dirpath=model_save_dir,
    filename="last",     
    save_top_k=0,        
    every_n_train_steps=300,  
    save_last=True      
)

ckpt_list = [all_epoch_checkpoint_callback,last_checkpoint_callback]

#### Set up a trainer: 

In [13]:
wandb_logger = WandbLogger(project='test_project_name', name='test_job_name', id='test_job_name', resume="allow") # change these based on your logging preferences 
device=2 # indicates which GPU
num_nodes=1 # single node training 
max_epochs=1 

trainer = pl.Trainer(
    accelerator="gpu", 
    devices=[int(device)] if device else 1, 
    num_nodes=num_nodes, 
    strategy="ddp" if not device else 'auto', 
    callbacks=ckpt_list, 
    max_epochs=max_epochs, 
    benchmark=False, 
    profiler='simple', 
    gradient_clip_val=1, 
    logger=wandb_logger, 
    log_every_n_steps=10)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


#### Train! 

In [14]:
trainer.fit(my_model, train_dataloader, val_dataloaders=val_dataloaders)

You are using a CUDA device ('NVIDIA RTX A4000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /homes/gws/aspiro17/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mspiroannae[0m ([33mspiroannae-university-of-washington[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


/homes/gws/aspiro17/miniconda3/envs/SAGEnet/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:658: Checkpoint directory /homes/gws/aspiro17/SAGEnet/example_notebooks/data/mostafavilab/personal_genome_expr/revisions/GEUVADIS/res exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name               | Type       | Params | Mode 
----------------------------------------------------------
0 | conv0              | Sequential | 36.9 K | train
1 | convlayers         | ModuleList | 2.5 M  | train
2 | dilated_convlayers | ModuleList | 0      | train
3 | fc0                | Sequential | 65.8 K | train
4 | fclayers           | ModuleList | 65.8 K | train
5 | diff_fclayers      | ModuleList | 65.8 K | train
6 | diff_out           | Sequential | 257    | train
7 | ref_out            | Sequential | 257    | train
----------------------------------------------------------
2.7 M     Trainable params
0         Non-trainable params
2.7 M     Total pa

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/homes/gws/aspiro17/miniconda3/envs/SAGEnet/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
[W::vcf_parse_format] FORMAT 'PP' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'PP' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:41859552 is not defined in the header, assuming Type=String
  return F.conv1d(
[W::vcf_parse_format] FORMAT 'PP' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'PP' at 21:41859552

Training: |          | 0/? [00:00<?, ?it/s]

[W::vcf_parse_format] FORMAT 'PP' at 21:31853530 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:31853530 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'PP' at 21:26151131 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:26151131 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'PP' at 21:43407150 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:43407150 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'PP' at 21:43407150 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:43407150 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'PP' at 21:31853530 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:31853530 is not defined in the header, assuming Type=String


Validation: |          | 0/? [00:00<?, ?it/s]

[W::vcf_parse_format] FORMAT 'PP' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'PP' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'PP' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'PP' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'PP' at 21:41859552 is not defined in the header, assuming Type=String
[W::vcf_parse_format] FORMAT 'BD' at 21:41859552 is not defined in the header, assuming Type=String
