# VCF -> VRS ID -> Clinical Evidence

## Setup Dependencies

In [None]:
%%capture
!pip install seqrepo ga4gh.vrs[extras]==2.0.0a3 ga4gh.vrs
%pip install --upgrade --no-cache-dir terra-notebook-utils

In [None]:
%load_ext autoreload
%autoreload 2

from datetime import datetime
from firecloud import api as fapi
from ga4gh.core import ga4gh_identify
from ga4gh.vrs import models
from ga4gh.vrs.extras.vcf_annotation import VCFAnnotator
from pathlib import Path
from pprint import pprint
from terra_notebook_utils import drs
from time import time 

import ast
import datetime
import glob
import io
import json
import logging
import multiprocessing
import os 
import pandas as pd
import pickle
import pysam
import requests
import subprocess
import sys
import vcf
import zipfile

# repo utils
sys.path.append('../../vrs-python-testing')
from utils import get_num_variants, metakb, print_dict, print_percent, truncate, unpickle

In [None]:
# store relevant variables

%env SEQREPO_ROOT=/home/jupyter/seqrepo
%env VCFTOOLS_DIR=/home/jupyter/vcftools
%env BCF_TOOLS_DIR=/home/jupyter/bcftools
%env PERL5LIB=/home/jupyter/vcftools/src/perl/
%env VCFTOOLS=/home/jupyter/vcftools/src/cpp/vcftools
%env OUTPUT=/home/jupyter/output
%env SPLIT_DIR=/home/jupyter/split
%env INPUT_DIR=/home/jupyter/vcf
!mkdir $INPUT_DIR
!mkdir $SPLIT_DIR
!mkdir $OUTPUT

SEQREPO_DIR = os.environ["SEQREPO_ROOT"]+"/latest"

In [None]:
# Run the repository's Terra setup script, which installs vcftools and completes the setup.
# NOTE: the script is expected to print a pyvcf error; per the original author it can be ignored.

!bash ~/vrs-python-testing/terra/setup.sh

## Get [1000G](https://anvil.terra.bio/#workspaces/anvil-datastorage/AnVIL_1000G_PRIMED-data-model/data) VCF Data for NA12878

In [None]:
# specify patient and chromosomes
# range(1, 2) yields only chromosome "1"; widen the range to process more chromosomes.

patient = "NA12878"
chrs_of_interest = list(map(str, range(1, 2)))
chr_set = set(chrs_of_interest)
chrs_of_interest

In [None]:
# get metadata for filepaths
# openly sourced from https://anvil.terra.bio/#workspaces/anvil-datastorage/AnVIL_1000G_PRIMED-data-model/data

# Fetch the "sequencing_file" entity table as TSV text, then parse it with pandas.
entities_response = fapi.get_entities_tsv(
    "anvil-datastorage",
    "AnVIL_1000G_PRIMED-data-model",
    "sequencing_file",
    model="flexible",
)
df = pd.read_csv(io.StringIO(entities_response.text), sep='\t')
df.head()

In [None]:
# get rid of gvcf data
# Keep only VCF / VCF index rows, then only those whose path mentions the 1kGP callset.

is_vcf_or_index = df['file_type'].isin(['VCF', 'VCF index'])
df_vcf = df[is_vcf_or_index]
is_1kgp = df_vcf['file_path'].str.contains('1kGP')
df_1kgp = df_vcf[is_1kgp]

num_vcf_idx_files = int((df_1kgp['file_type'] == 'VCF index').sum())
num_vcf_files = int((df_1kgp['file_type'] == 'VCF').sum())
# Sanity check: exactly 23 VCFs and 23 indexes are expected after filtering.
assert num_vcf_files == 23 and num_vcf_idx_files == 23, \
    f"check number of files, {num_vcf_files} vcfs and {num_vcf_idx_files} index files"

In [None]:
# load 1000G files if doesn't exist
# Select the rows for the chromosomes of interest (a VCF plus its index per chromosome)
# and copy each one into $INPUT_DIR unless a file of that name is already there.

df_chrs = df_1kgp.loc[df_1kgp['chromosome'].isin(chr_set)]
expected_file_count = 2*len(chr_set)
assert len(df_chrs) == expected_file_count, \
    f"Expected 2 files per chr but {len(df_chrs)} files and {len(chr_set)} chrs"

uris = df_chrs['file_path']
file_names = [uri.rsplit("/", 1)[-1] for uri in uris]

for file_name, uri in zip(file_names, uris):
    local_path = f"{os.environ['INPUT_DIR']}/{file_name}"
    if not os.path.exists(local_path):
        # $GOOGLE_PROJECT and $INPUT_DIR are expanded by the shell, not by Python.
        split_vcf_cmd = f"gsutil -u $GOOGLE_PROJECT cp {uri} $INPUT_DIR/"
        output = subprocess.run(split_vcf_cmd, shell=True, check=True)
        continue
    print(f"{truncate(file_name, 35, 10)} already exists, not downloading")

In [None]:
# Local paths of the downloaded VCFs; index files (names containing ".tbi") are left out.
raw_vcfs = []
for fname in file_names:
    if ".tbi" in fname:
        continue
    raw_vcfs.append(f"{os.environ['INPUT_DIR']}/{fname}")

print("all files...")
for input_vcf in raw_vcfs:
    print(input_vcf)
    # every listed VCF must already be on disk before the split step runs
    assert os.path.exists(input_vcf), "file doesn't exist"

In [None]:
# define file names
# One stem per chromosome under $SPLIT_DIR; the unfiltered and filtered VCF paths share it.
patient_stem_by_chr = list(map(
    lambda c: f"{os.environ['SPLIT_DIR']}/{patient}.chr{c}",
    chrs_of_interest,
))
patient_unfiltered_vcfs = [stem + ".recode.vcf" for stem in patient_stem_by_chr]
patient_vcfs = [stem + ".filtered.vcf" for stem in patient_stem_by_chr]

In [None]:
# get patient-level for single chr
# For each chromosome: (1) extract the patient's sample from the raw VCF, then
# (2) drop records whose genotype is 0|0. Both steps are skipped when their output exists.

def _run_shell_to_file(cmd, out_path):
    """Run a shell command that redirects its output into out_path.

    The shell creates out_path before the command runs, so a failing command would
    leave an empty/partial file that the os.path.exists() guards below would treat as
    "already done" on the next run. Remove it on failure and re-raise.
    """
    try:
        return subprocess.run(cmd, shell=True, check=True)
    except subprocess.CalledProcessError:
        if os.path.exists(out_path):
            os.remove(out_path)
        raise

for i, c in enumerate(chrs_of_interest):
    raw_vcf = raw_vcfs[i]
    unfiltered_vcf = patient_unfiltered_vcfs[i]
    filtered_vcf = patient_vcfs[i]

    # split vcf by patient
    if os.path.exists(unfiltered_vcf):
        print(f"already split file: {unfiltered_vcf}")
    else:
        # why is this taking 15 minutes
        split_vcf_cmd = f"bcftools view -s {patient} {raw_vcf}" \
                        f" > {unfiltered_vcf}"

        output = _run_shell_to_file(split_vcf_cmd, unfiltered_vcf)

    # filter to only relevant genotypes
    if os.path.exists(filtered_vcf):
        print(f"already filtered file: {filtered_vcf}")
    else:
        # The space after the quoted expression matters: without it the shell glues the
        # input path onto the -e expression and bcftools receives no input file.
        filter_genotypes_cmd = f"bcftools view -e 'GT=\"0|0\"' " \
                               f"{unfiltered_vcf} > {filtered_vcf}"
        print(filter_genotypes_cmd)
        _run_shell_to_file(filter_genotypes_cmd, filtered_vcf)

In [None]:
# # OPTIONAL
# # filter to first num_lines

# num_lines = 1000
# head_vcf = f"{patient_path_stem}.{num_lines}.vcf"

# head_cmd = f"cat {filtered_patient_vcf_path} | head -n {num_lines} > {head_vcf}"
# output = subprocess.run(head_cmd, shell=True, check=True)

In [None]:
# !find ~ -name *1000.vcf

In [None]:
# checking my work, make sure we've filtered without losing data
# For every unfiltered VCF, the lines matching one of the four phased genotypes plus the
# header lines ("^#") must add up to the number of non-empty lines in the file.

def _grep_count(pattern, vcf_file):
    """Return how many lines of vcf_file grep matches for pattern (counted with wc -l)."""
    completed = subprocess.run(f"grep '{pattern}' {vcf_file} | wc -l", \
                               shell=True, check=True, stdout=subprocess.PIPE, text=True)
    return int(completed.stdout)

for input_vcf, output_vcf, c in zip(patient_unfiltered_vcfs, patient_vcfs, chrs_of_interest):
    print(c,"~")
    rows = 0

    search_terms = ["0|1", "1|0", "1|1", "0|0", "^#"]
    for term in search_terms:
        numRows = _grep_count(term, input_vcf)
        print(numRows)
        rows += numRows

    # '.' matches any non-empty line
    expected_rows = _grep_count('.', input_vcf)
    assert rows == expected_rows, f"chr{c}: rows {rows} not same as expected {expected_rows}"
        

## Get VRS Allele Objects

In [None]:
def annotate_vcf(input_vcf, output_vcf, output_pkl, seqrepo_root_dir, require_validation=True, rle_seq_limit=50):
    """Annotate ``input_vcf`` with a VCFAnnotator and return the two output paths.

    :param input_vcf: path of the VCF to annotate (passed as ``vcf_in``)
    :param output_vcf: path for the annotated VCF (passed as ``vcf_out``)
    :param output_pkl: path for the pickle output (passed as ``vrs_pickle_out``)
    :param seqrepo_root_dir: SeqRepo directory handed to the VCFAnnotator constructor
    :param require_validation: forwarded unchanged to ``VCFAnnotator.annotate``
    :param rle_seq_limit: value assigned to ``tlr.rle_seq_limit`` on the annotator
        before annotating
    :return: ``(output_vcf, output_pkl)`` exactly as given
    """
    annotator = VCFAnnotator(seqrepo_root_dir=seqrepo_root_dir)
    annotator.tlr.rle_seq_limit = rle_seq_limit

    annotate_kwargs = {
        "vcf_in": input_vcf,
        "vcf_out": output_vcf,
        "vrs_pickle_out": output_pkl,
        "require_validation": require_validation,
    }
    annotator.annotate(**annotate_kwargs)

    return output_vcf, output_pkl

# create annotated vcf test file 
# Annotate each filtered patient VCF; a VCF whose pickle already exists is skipped.
for curr_input_vcf in patient_vcfs:
    stem = curr_input_vcf.replace('.vcf', '')
    output_vcf, output_pkl = f"{stem}.vcf.gz", f"{stem}-vrs-objects.pkl"

    if not os.path.exists(output_pkl):
        print("writing to...")
        print(output_vcf)
        print(output_pkl)

        # time the annotation so the cost of this cell is visible
        t = time()
        annotate_vcf(curr_input_vcf, output_vcf, output_pkl, SEQREPO_DIR)
        elapsed_time = time()-t
        print(f"annotation: {(elapsed_time):.2f}s")
    else:
        # only the pickle is checked/reported, not the annotated VCF
        print("output files already exists:")
        print(f" -- {output_pkl}")

## Query for MetaKB Evidence

In [None]:
# load in metakb data locally
# Download and unzip each MetaKB CDM JSON into ~/metakb unless it is already there.

METAKB_DIR = f"{os.environ['HOME']}/metakb"
Path(METAKB_DIR).mkdir(exist_ok=True)

json_files = ["civic_cdm_20240103.json", "moa_cdm_20240103.json"]
json_paths = [f"{METAKB_DIR}/{json_file}" for json_file in json_files]

def download_s3(url, dest_path, chunk_size=1 << 20):
    """Stream the object at ``url`` into ``dest_path``.

    Defined here because no ``download_s3`` is imported or defined elsewhere in this
    notebook. Uses a plain HTTPS GET, so it presumably only works for publicly readable
    objects -- TODO confirm for this bucket.

    :raises requests.HTTPError: if the server answers with an error status
    """
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(dest_path, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                out_file.write(chunk)

for json_file, json_path in zip(json_files, json_paths):
    if os.path.exists(json_path):
        print(f"{json_file} already exists...")
        continue

    url = f"https://vicc-metakb.s3.us-east-2.amazonaws.com/cdm/20240103/{json_file}.zip"
    zip_path = f"{json_path}.zip"

    download_s3(url, zip_path)

    print(zip_path)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(f"{METAKB_DIR}")
    # the archive is only a transport artifact; keep just the extracted JSON
    os.remove(zip_path)

In [None]:
# create metakb cache of vrs ids

metakb_cache = set()

def metakb_ids_from_json(path):
    """Collect ids from one MetaKB CDM JSON file.

    Reads the top-level ``studies``, ``molecular_profiles`` and ``variations`` lists:

    * for each study's ``variant`` and for each molecular profile, every
      ``members[*].id`` and the ``definingContext.id`` are collected; an id that does
      not contain "ga4gh:VA" is printed for inspection but still collected;
    * for ``variations``, only ids (own and ``definingContext``) containing
      "ga4gh:VA" are collected.

    :param path: path of the CDM JSON file
    :return: list of ids in file order (may contain duplicates)
    """
    allele_str = "ga4gh:VA"
    ids = []

    def collect(entity):
        """Append the member ids, then the definingContext id, of one entity."""
        for member in entity.get('members', []) if 'members' in entity else []:
            if allele_str not in member['id']:
                print(member['id'])
            ids.append(member['id'])

        if 'definingContext' in entity:
            context_id = entity['definingContext']['id']
            if allele_str not in context_id:
                print(context_id)
            ids.append(context_id)

    with open(path) as file:
        data = json.load(file)

    # studies
    for study in data['studies']:
        collect(study['variant'])

    # molecular_profiles
    # Each profile's own members are collected (previously this read the members of the
    # last study's variant, which is a leftover from the loop above).
    for profile in data['molecular_profiles']:
        collect(profile)

    # variations
    ids.extend([variation['id'] for variation in data['variations'] \
                if allele_str in variation['id']])

    for variation in data['variations']:
        if 'definingContext' in variation:
            new_id = variation['definingContext']['id']
            if allele_str in new_id:
                ids.append(new_id)

    return ids

# add from each CDM (json)
# Fold the ids of every CDM file into the shared metakb_cache set.
t = time()
for path in json_paths:
    ids = metakb_ids_from_json(path)
    print(f"{path}: {len(ids)} ids")
    metakb_cache |= set(ids)

cache_preview = list(metakb_cache)[:3]
print(f"cache size: {len(metakb_cache)} ids")
print(f"examples: {cache_preview}")

In [None]:
# look for metakb matches
# For each chromosome: load the pickled alleles, compute their VRS ids and keep the ids
# present in metakb_cache. The hits are pickled next to the input as "<stem>-hits.pkl".

# Window of allele indices to process; id_end=None means "through the last allele".
id_start = 0
id_end = None

for i, curr_input_vcf in enumerate(patient_vcfs):
    print(f"trying chr{chrs_of_interest[i]}...")

    # initial file names
    stem = curr_input_vcf.replace('.vcf', '')
    output_vcf = f"{stem}.vcf.gz"
    output_pkl = f"{stem}-vrs-objects.pkl"
    metakb_output_pkl = f"{stem}-hits.pkl"

    # get ratio of alleles to variants
    t = time()
    allele_dicts = unpickle(output_pkl) # TODO: returns generator
    # counted from a second unpickle() call so allele_dicts itself is not consumed
    num_alleles = sum(1 for _ in unpickle(output_pkl))
    num_variants = get_num_variants(curr_input_vcf)
    # Resolve the window per chromosome. Previously id_end was fixed to the first
    # chromosome's allele count and silently reused for every later chromosome.
    chr_id_end = num_alleles if id_end is None else min(id_end, num_alleles)

    print("num_alleles (ref and alt) to num_variants:", end=" ")
    print_percent(num_alleles, num_variants)
    print(f"get percentage: {(time()-t):.2f} s")

    print(f"writing to {metakb_output_pkl}")

    # convert alleles to vrs ids
    t = time()
    vrs_ids = [ga4gh_identify(models.Allele(**ast.literal_eval(allele_dict))) \
                for idx, (_, allele_dict) in enumerate(allele_dicts) \
                if id_start <= idx < chr_id_end]
    print(f"{len(vrs_ids)} ids: {(time()-t):.2f} s")

    # ping metakb if cache hit
    print("pinging metakb...")
    t = time()
    id_hits = [vrs_id for vrs_id in vrs_ids if vrs_id in metakb_cache]
    #     hits = [metakb(vrs_id) for vrs_id in id_hits]
    print(f"metakb: {(time()-t):.2f} s")

    with open(metakb_output_pkl, 'wb') as file:
        pickle.dump(id_hits, file)

    print("id hits", id_hits)
    print("\nhits to ids queried...")
    # report against the number of ids actually queried for this chromosome
    total = len(vrs_ids)
    print_percent(len(id_hits), total)

In [None]:
# get evidence about the id and save all to pickle
# TODO: replace it with metakb results

def find_keys_from_ids(data, target="ga4gh:VA", prefix="", result=None):
    """Return the dotted paths of all string values in ``data`` that contain ``target``.

    Dict keys and list indices are joined with "." to form each path (a list element
    directly under the root gets a leading ".", e.g. ".0").

    :param data: nested structure of dicts/lists; any other root type yields no paths
    :param target: substring looked for in string values
    :param prefix: path of ``data`` itself (used by the recursion)
    :param result: set to add paths to; a new set is created when None
    :return: the set of matching paths
    """
    if result is None:
        result = set()

    if isinstance(data, dict):
        children = ((f"{prefix}.{key}" if prefix else key, value) for key, value in data.items())
    elif isinstance(data, list):
        children = ((f"{prefix}.{index}", value) for index, value in enumerate(data))
    else:
        return result

    for child_path, child in children:
        if isinstance(child, (dict, list)):
            find_keys_from_ids(child, target, child_path, result)
        elif isinstance(child, str) and target in child:
            result.add(child_path)
    return result

# get evidence key drill down for each thing
# For every hit id, record the paths inside each CDM JSON where that id appears.
# NOTE(review): id_hits and metakb_output_pkl are whatever the matching loop left behind,
# i.e. they describe only the last chromosome processed.

# Parse each CDM JSON once up front instead of re-reading both files for every hit id.
cdm_data_by_path = {}
for json_path in json_paths:
    with open(json_path) as file:
        cdm_data_by_path[json_path] = json.load(file)

metakb_dict = {}
for vrs_id in id_hits:
    all_evidence_keys = []
    for json_path, data in cdm_data_by_path.items():
        print(json_path)
        associated_keys = find_keys_from_ids(data, vrs_id)
        all_evidence_keys.extend(associated_keys)
        print(associated_keys, "\n")

    metakb_dict[vrs_id] = all_evidence_keys

# Overwrites the list of hit ids saved earlier at the same path with {id: [paths]}.
with open(metakb_output_pkl, 'wb') as file:
    pickle.dump(metakb_dict, file)

# read the file back to confirm what was written
with open(metakb_output_pkl, 'rb') as file:
    print("final metakb_dict...")
    pprint(pickle.load(file))

In [None]:
# example for using keys with chr
# Print a few hand-picked studies and molecular profiles from the first CDM file.

with open(json_paths[0]) as file:
    data = json.load(file)

# pretty_print_json_tree(data["studies"][45], print_values=True)
study = data["studies"]
for num in (38, 45, 71):
    entry = study[num]
    print(f"{entry['id']}: {entry['description']} \n")

for profile_index in (20, 31):
    print(f'Molecular profile aliases: {data["molecular_profiles"][profile_index]["aliases"]}')

In [None]:
# # initial file names
# curr_input_vcf = filtered_patient_vcf_path
# stem = curr_input_vcf.replace('.vcf', '')
# output_vcf = f"{stem}.vcf.gz"
# output_pkl = f"{stem}-vrs-objects.pkl"

# # get total num_variants
# t = time()
# allele_dicts = unpickle(output_pkl)
# num_variants = get_num_variants(curr_input_vcf)
# print(f'num_vrs_objects to num_variants: {len(allele_dicts)}/{num_variants}={100*(len(allele_dicts)/num_variants):.2f}%')
# print(f"get total: {(time()-t):.2f} s")



# # set number of ids to process
# id_start = 20_000
# id_end = 100_000 # ids to process
# progress_interval = 10_000
# metakb_output_pkl = f"{patient_path_stem}-{id_start}-to-{id_end}-hits.pkl"

# print(f"writing to {metakb_output_pkl}")

# # convert alleles to vrs ids
# t = time()
# vrs_ids = [ga4gh_identify(models.Allele(**allele_dict)) \
#             for i, (_, allele_dict) in enumerate(allele_dicts.items()) \
#             if i >= id_start and i < id_end]
# print(f"{id_end-id_start} ids: {(time()-t):.2f} s")

# # number of workers
# worker_count = 4 * os.cpu_count()

# # ping metakb
# print("pinging metakb...")
# t = time()
# hits = parallelize(metakb, vrs_ids, worker_count=worker_count, \
#     progress_interval=progress_interval)
# print(f"metakb: {(time()-t):.2f} s")

# with open(metakb_output_pkl, 'wb') as file:
#     pickle.dump(hits, file)

# print("\nhits to ids queried...")
# # total = num_ids_limit if num_ids_limit else len(vrs_ids)
# total = id_end - id_start
# print_percent(len(hits), total)

## Misc Ways to look at CDM data

In [None]:
def pretty_print_json_tree(data, max_depth=None, current_depth=0, indent=0, print_values=False):
    """Print the key structure of nested JSON-like data as an indented tree.

    Dict keys are printed one per line, indented two spaces per level. For a list only
    its first element is descended into, at the same depth and indentation as the list
    itself. Leaf values are printed only when ``print_values`` is true.

    :param data: dict / list / scalar to print
    :param max_depth: stop descending once ``current_depth`` exceeds this; None = no limit
    :param current_depth: depth of ``data`` (used by the recursion)
    :param indent: indentation level of ``data`` (used by the recursion)
    :param print_values: also print non-container leaf values
    """
    too_deep = max_depth is not None and current_depth > max_depth
    if too_deep:
        return

    padding = "  " * indent

    if isinstance(data, dict):
        for key in data:
            print(padding + str(key))
            pretty_print_json_tree(data[key], max_depth, current_depth + 1, indent + 1, print_values)
        return

    if isinstance(data, list):
        # an empty list prints nothing; otherwise the first element stands in for all
        if data:
            pretty_print_json_tree(data[0], max_depth, current_depth, indent, print_values)
        return

    if print_values:
        print(padding + str(data))


# Show the key structure of the first CDM file.
data = json.loads(Path(json_paths[0]).read_text())

pretty_print_json_tree(data)

In [None]:
def find_keys_with_partial_match(data, target, prefix="", result=None):
    """Return the dotted key paths whose string values contain ``target``.

    Only dict keys contribute to a path; list elements share the path of the list that
    holds them, so matches at different list positions collapse into one entry.

    :param data: nested structure of dicts/lists; any other root type yields no paths
    :param target: substring looked for in string values
    :param prefix: path of ``data`` itself
    :param result: set to add paths to; a new set is created when None
    :return: the set of matching key paths
    """
    result = set() if result is None else result

    # explicit work list instead of recursion; the result is a set, so order is irrelevant
    pending = [(data, prefix)]
    while pending:
        node, node_path = pending.pop()

        if isinstance(node, dict):
            children = [(f"{node_path}.{key}" if node_path else key, value) for key, value in node.items()]
        elif isinstance(node, list):
            children = [(node_path, value) for value in node]
        else:
            continue

        for child_path, child in children:
            if isinstance(child, (dict, list)):
                pending.append((child, child_path))
            elif isinstance(child, str) and target in child:
                result.add(child_path)

    return result


# For each CDM file, list every key path under which a "ga4gh:VA" id appears.
target_value = "ga4gh:VA"

for path in json_paths:
    with open(path, 'r') as file:
        data = json.loads(file.read())

    keys_with_value = find_keys_with_partial_match(data, target_value)
    for line in (path,):
        print(line)
    print(keys_with_value, "\n")

## Split before annotate

In [None]:
# Split one hard-coded VCF into a recoded VCF per chromosome (chr1..chr22, chrX, chrY):
# the chromosome names are piped to xargs, which substitutes each for "PATH" and, with
# -P 0, runs the vcftools processes in parallel. Outputs land under $SPLIT_DIR/chr<N>.*
! (seq 1 22; echo X; echo Y) | xargs -P 0 -I PATH $VCFTOOLS --recode --vcf "/home/jupyter/vcf/1KGP_haplotype_caller_NA12878.chr10.hc.vcf" --chr chrPATH --out $SPLIT_DIR/chrPATH

In [None]:
vcf_path = drs_vcfs[0]

! rm -r $SPLIT_DIR
split_vcf_cmd = f"(seq 1 22; echo X; echo Y) | \
               xargs -P 0 -I PATH $VCFTOOLS --recode --gzvcf {vcf_path} \
               --chr chrPATH --out $SPLIT_DIR/chrPATH"

output = subprocess.run(split_vcf_cmd, shell=True, check=True)
# output = subprocess.run(split_vcf_cmd, shell=True, check=True, \
#                         capture_output=True, text=True) 

# no chr prefix
# ! (seq 1 22; echo X; echo Y) | xargs -P 0 -I PATH ~/vcftools-vcftools-d511f46/src/cpp/vcftools --recode --vcf $VCF_PATH --chr PATH --out ~/split/chrPATH

In [None]:
# TODO: parse logs to get outputs on how many were filtered out
# get total num_variants

def get_num_variants(path):
    """Return the number of variant records in the VCF file at ``path``.

    NOTE(review): this redefines the ``get_num_variants`` imported from ``utils`` at the
    top of the notebook for every cell that runs after this one.

    :param path: path of the VCF file to read with pysam
    :return: record count (header lines are not records)
    """
    # the context manager closes the underlying file handle once counting is done
    with pysam.VariantFile(path) as vcf_reader:
        return sum(1 for _ in vcf_reader)

# Compare the record count of the input VCF with the summed count of the per-chromosome
# split files to see how many variants survived the split.
split_pattern = f"{os.environ.get('SPLIT_DIR')}/*.recode.vcf"
split_vcf_paths = glob.glob(split_pattern)

# vcf_path[:-3] drops the last three characters (presumably ".gz" -- TODO confirm)
input_num_variants = get_num_variants(vcf_path[:-3])
split_num_variants = sum(map(get_num_variants, split_vcf_paths))

kept_percent = 100*split_num_variants/input_num_variants
print(f"{split_num_variants}/{input_num_variants} = ", \
      f"{kept_percent:.2f}% kept")

In [None]:
ls -l $SPLIT_DIR/*.recode.vcf | wc -l

In [None]:
# annotate each of them
# Every *.recode.vcf under $SPLIT_DIR is passed to the vcf_annotation module via xargs
# (-P 0: in parallel); for an input PATH the outputs are PATH.vcf.gz and PATH.pkl.
# TODO: fix the outputs coming from this
# NOTE(review): the "2>" redirect is expanded by the shell, not by xargs, so "PATH" is not
# substituted there and all stderr goes to one file literally named chrPATH_log.txt.

! (ls -1 $SPLIT_DIR/*.recode.vcf | \
   xargs -P 0 -I PATH python3 -m ga4gh.vrs.extras.vcf_annotation \
   --vcf_in PATH --vcf_out PATH.vcf.gz --vrs_pickle_out PATH.pkl \
   --seqrepo_root_dir $SEQREPO_ROOT/latest \
   2> $SPLIT_DIR/chrPATH_log.txt)

# # GREGoR
# !python3 -m ga4gh.vrs.extras.vcf_annotation --vcf_in 1369747.merged.matefixed.sorted.markeddups.recal.g.vcf  --vcf_out 1369747.merged.matefixed.sorted.markeddups.recal.g.vcf.output.vcf.gz --vrs_pickle_out 1369747.merged.matefixed.sorted.markeddups.recal.g.vcf.vrs_objects.pkl  --seqrepo_root_dir ~/seqrepo/latest/

In [None]:
# Count the annotated VCFs (*.vcf.vcf.gz) and pickles (*.vcf.pkl) produced under $SPLIT_DIR.
# The commented asserts below record the expected count of 24 each.
!ls -l $SPLIT_DIR/*.vcf.vcf.gz | wc -l
!ls -l $SPLIT_DIR/*.vcf.pkl | wc -l

# assert (!ls -l ~/split/*.vcf.vcf.gz | wc -l) == 24, "incorrect number of output vcf.gz files created"
# assert (!ls -l ~/split/*.vcf.pkl | wc -l) == 24, "incorrect number of outputted pickle files"

In [None]:
# join the files
# Feed every annotated per-chromosome VCF to the vcf-concat script found under $PERL5LIB
# and write the concatenated result to $OUTPUT/merged_output.vcf.
!ls -1 $SPLIT_DIR/*.vcf.vcf.gz | xargs $PERL5LIB/vcf-concat > $OUTPUT/merged_output.vcf

In [None]:
!ls $OUTPUT_DIR

In [None]:
# TODO: remove the pair of them

### Random python annotate

In [None]:
import logging

logger = logging.getLogger("ga4gh.vrs.extras.vcf_annotation")
logger.setLevel(level=logging.INFO)

# create annotated vcf test file 
def annotate_vcf(path):
    """Annotate the VCF at ``path``, writing the outputs next to it.

    Output names are built from ``path`` with every ".vcf" occurrence removed:
    "<stem>.output.vcf.gz" for the annotated VCF and "<stem>-vrs-objects.pkl" for the
    pickle. NOTE: this one-argument definition replaces the multi-argument
    ``annotate_vcf`` defined earlier in the notebook.

    :param path: path of the input VCF file
    """
    stem = path.replace(".vcf", "")

    annotator = VCFAnnotator(seqrepo_root_dir="/home/jupyter/seqrepo/latest")
    annotator.annotate(
        vcf_in=path,
        vcf_out=f"{stem}.output.vcf.gz",
        vrs_pickle_out=f"{stem}-vrs-objects.pkl",
    )

# annotate_vcf("/home/jupyter/split", "chr1.recode")
# Try every VCF, remembering which ones annotate without raising.
successes = set()
for vcf_path in drs_vcfs:
    print("trying...", vcf_path)
    try:
        annotate_vcf(vcf_path)
    except Exception as e:
        print(e)
        print("unsucessful, see logs above \n")
    else:
        print("worked \n")
        successes.add(vcf_path)

print(f"total successes: {len(successes)}/{len(drs_vcfs)} \nList...")
for vcf_path in drs_vcfs:
    mark = '✓' if vcf_path in successes else 'x'
    print(f"{vcf_path}: {mark}")

In [None]:
# annotate w vrs id asking for output vcf
# NOTE: this cell repeats the previous annotate cell (same function, same loop).

import logging

logger = logging.getLogger("ga4gh.vrs.extras.vcf_annotation")
logger.setLevel(level=logging.INFO)

# create annotated vcf test file 
def annotate_vcf(path):
    """Write an annotated VCF and a pickle for the VCF at ``path``.

    Both outputs share a stem derived from ``path`` by removing every ".vcf":
    "<stem>.output.vcf.gz" and "<stem>-vrs-objects.pkl".

    :param path: path of the input VCF file
    """
    input_vcf = path
    stem = input_vcf.replace(".vcf", "")
    output_vcf, output_pkl = f"{stem}.output.vcf.gz", f"{stem}-vrs-objects.pkl"

    VCFAnnotator(seqrepo_root_dir="/home/jupyter/seqrepo/latest").annotate(
        vcf_in=input_vcf, vcf_out=output_vcf, vrs_pickle_out=output_pkl
    )

# annotate_vcf("/home/jupyter/split", "chr1.recode")
def _try_annotate(candidate):
    """Annotate one VCF, printing progress; return True when no exception was raised."""
    try:
        print("trying...", candidate)
        annotate_vcf(candidate)
        print("worked \n")
        return True
    except Exception as e:
        print(e)
        print("unsucessful, see logs above \n")
        return False

successes = set()
for vcf_path in drs_vcfs:
    if _try_annotate(vcf_path):
        successes.add(vcf_path)

print(f"total successes: {len(successes)}/{len(drs_vcfs)} \nList...")
for vcf_path in drs_vcfs:
    status = '✓' if vcf_path in successes else 'x'
    print(f"{vcf_path}: {status}")

In [None]:
# Re-run the annotation only for the VCF(s) whose path names the HG02080vCHM13_20200921 sample.
# (An earlier variant of this cell targeted "chm13_hifi_HG007" without the try/except.)
for vcf_path in drs_vcfs:
    if "HG02080vCHM13_20200921" not in vcf_path:
        continue
    print(vcf_path)

    print("trying...", vcf_path)
    try:
        annotate_vcf(vcf_path)
    except Exception as e:
        print(e)
        print("unsucessful, see logs above \n")
    else:
        print("worked \n")
        successes.add(vcf_path)

In [None]:
# annotate w vrs id only pickle outputted

logger = logging.getLogger("ga4gh.vrs.extras.vcf_annotation")
# logger.setLevel(level=logging.ERROR)
logger.disabled = True

# create annotated vcf test file 
def annotate_vcf_pkl_only(path):
    """Annotate the VCF at ``path``, producing only the pickle (no output VCF).

    :param path: path of the input VCF file
    :return: path of the pickle written, "<stem>-vrs-objects.pkl", where <stem> is
        ``path`` with every ".vcf" occurrence removed
    """
    stem = path.replace(".vcf", "")
    output_pkl = f"{stem}-vrs-objects.pkl"

    vcf_annotator = VCFAnnotator(seqrepo_root_dir="/home/jupyter/seqrepo/latest")
    vcf_annotator.annotate(vcf_in=path, vrs_pickle_out=output_pkl)
    return output_pkl

successes = set()
for i, vcf_path in enumerate(drs_vcfs):
    print("starting... \n")
    # annotate to output pkl
    # Track this file's own pickle path; the loop previously read a global `output_pkl`
    # left over from an unrelated earlier cell.
    output_pkl = None
    try:
        print("trying...", vcf_path)
        output_pkl = annotate_vcf_pkl_only(vcf_path)
        print("worked \n")
        successes.add(vcf_path)
    except Exception as e:
        print(e)
        print("unsucessful, see logs above \n")

    # get pickle totals (best effort: a failure here is reported, not raised)
    if output_pkl is None:
        print("unable to get pickle totals, file may not exist")
    else:
        try:
            # NOTE: pickle.load must only be used on files this notebook produced itself
            with open(output_pkl, 'rb') as f:
                vrs_objects = pickle.load(f)

            # get total num_variants
            with open(vcf_path, 'r') as vcf_file:
                num_variants = sum(1 for _ in vcf.Reader(vcf_file))

            # view details (ratio scaled by 100 so the "%" label is accurate)
            print(f'num_vrs_objects to num_varaints: {len(vrs_objects)}/{num_variants}={(100*len(vrs_objects)/num_variants):.2f}%')
        except Exception as e:
            print(f"unable to get pickle totals, file may not exist: {e}")
    print()

print(f"total successes: {len(successes)}/{len(drs_vcfs)} \nList...")
for vcf_path in drs_vcfs:
    print(f"{vcf_path}: {'✓' if vcf_path in successes else 'x'}")

In [None]:
import vcf


# Copy a VCF with PyVCF while setting a placeholder VRS_ALLELE_ID INFO value on every
# record (the ids below are dummy strings, not real VRS identifiers).
# for input_vcf_file in ["/home/jupyter/vcf/long_read_sv_jasmine_Trios_IndividualCallsets_CHM13_HG005_Trio_HG006vCHM13_20200921_mm2_PBCCS_sniffles.s2l20.refined.nSVtypes.ism.vcf"]:
for input_vcf_file in ["/home/jupyter/vcf/long_read_minimap2_alignments_HG02080vCHM13_20200921_mm2_ONT_sniffles.s2l20.refined.nSVtypes.ism.vcf"]:
    output_vcf_file = "/home/jupyter/vcf/long_read.test.vcf"

    # the input handle is closed by the with-block; the writer is closed per input file
    # (it used to be closed once after the loop, and the input was never closed)
    with open(input_vcf_file, 'r') as vcf_in:
        vcf_reader = vcf.Reader(vcf_in)
        vcf_writer = vcf.Writer(open(output_vcf_file, 'w'), vcf_reader)
        try:
            for record in vcf_reader:
                record.INFO['VRS_ALLELE_ID'] = 'ga4gh:VA.xksahgfowdfdwofd,ga4gh:VA.xksahgfowdfdwofd'
                vcf_writer.write_record(record)
        finally:
            vcf_writer.close()

### show loaded files

In [None]:
# from pprint import pprint
# import pickle
# import ast
# import requests
# import datetime

# # log progress
# progress_interval = 50000

# # load pickled dict
# with open(output_pkl, 'rb') as f:
#     print(datetime.datetime.now().isoformat(), 'opened pickle')
#     vrs_objects = pickle.load(f)
#     c = 0
#     for k, v in vrs_objects.items():
#         vrs_objects[k] = ast.literal_eval(v)
#         c += 1
#         if c % progress_interval == 0:
#             print(datetime.datetime.now().isoformat(), c)

# # view details        
# print('number of vrs objects', len(vrs_objects))

In [None]:
# Pickles written by the xargs annotation step. Taken from the configured SPLIT_DIR
# (rather than a hard-coded ~/split) with glob, so that "no matches" is an empty list
# instead of the text of an `ls` error message.
pickle_paths = sorted(glob.glob(f"{os.environ['SPLIT_DIR']}/*.vcf.pkl"))
pickle_paths

In [None]:
# get percent of loaded variants

# load pickled dict
# for vcf_path in drs_vcfs:

def unpickle_generator(file_name):
    """Lazily yield ``(key, value)`` pairs from a pickled dict of literal strings.

    Each stored value is a string that is parsed with ``ast.literal_eval`` as it is
    yielded. The file stays open until the generator is exhausted or closed.
    """
    with open(file_name, 'rb') as pickle_file:
        stored = pickle.load(pickle_file)
        yield from ((key, ast.literal_eval(text)) for key, text in stored.items())

def unpickle(file_name):
    """Load a pickled dict of literal strings and return it with every value parsed.

    NOTE: this definition replaces the ``unpickle`` imported from ``utils`` for all
    cells that run afterwards.
    """
    with open(file_name, 'rb') as pickle_file:
        vrs_objects = pickle.load(pickle_file)
        # values are replaced in place; the key set is unchanged
        for key in list(vrs_objects):
            vrs_objects[key] = ast.literal_eval(vrs_objects[key])

    return vrs_objects

# Per pickle: load the VRS objects, count the records of the matching VCF and print the
# ratio. The factor 50 (= 100/2) presumably reflects two VRS objects (ref and alt) per
# variant -- TODO confirm.
vrs_dicts = []

total_num_vrs_objs = 0

for path in pickle_paths:
    vrs_dict = unpickle(path)
    vrs_dicts.append(vrs_dict)

    # get total num_variants
    # TODO: reference the new merged file bc some might have been filtered out
    # path[:-4] strips the ".pkl" suffix to get back to the VCF the pickle was made from;
    # the with-block closes each VCF handle instead of leaking one per pickle
    with open(path[:-4], 'r') as vcf_file:
        num_variants = sum(1 for _ in vcf.Reader(vcf_file))

    num_vrs_objs = len(vrs_dict)
    total_num_vrs_objs += num_vrs_objs

    # view details

    print(path.split("/")[-1], end=" ")
    if num_variants == 0:
        print("no variants")
    else:
        print(f'vrs_objects:variants = {num_vrs_objs}/{num_variants} = {(50*num_vrs_objs/num_variants):.1f}%')

# NOTE(review): vcf_path is whatever an earlier cell last assigned to it.
total_variants = get_num_variants(vcf_path)

# same zero guard as the per-file report above
if total_variants == 0:
    print(f"Totals: {total_num_vrs_objs}/{total_variants}", "= no variants")
else:
    print(f"Totals: {total_num_vrs_objs}/{total_variants}", \
          f"= {(50*total_num_vrs_objs/total_variants):.2f}%")

# TODO on combining: have to think about this more bc large files will have to be held in memory

In [None]:
# error reporting from logs
# Count the lines of $HOME/log.txt that contain the ValidationError raise statement.
# `x = !(...)` captures the shell output as a list of lines; element 0 is the wc count.
# NOTE(review): the xargs annotation cell redirects stderr to a file under $SPLIT_DIR,
# not to $HOME/log.txt -- confirm which log this is meant to read.
num_val_errors = !(grep "raise ValidationError(err_msg)" $HOME/log.txt | wc -l)
num_val_errors = int(num_val_errors[0])
# 50 = 100/2: the percentage assumes two VRS ids per variant, as the message itself says
print(f"validations errors = {num_val_errors}, ie {50*num_val_errors/total_variants:.1f}%", \
      " if 2:1 VRS ID to variant")

# Count "[E::vcf_format] Invalid BCF" messages in the same log.
num_invalid_files = !(grep "\[E::vcf_format\] Invalid BCF" $HOME/log.txt | wc -l)
num_invalid_files = int(num_invalid_files[0])
print(f"num invalid files: {num_invalid_files}")