# Goal

Parse the CSQ field from VEP-annotated VCF files into individual annotation dictionaries for easy downstream search and analysis.

References:
- [Parsing the VEP/LoF VCF](https://github.com/konradjk/loftee)
- [read_vep_vcf.py example](https://github.com/konradjk/loftee/blob/master/src/read_vep_vcf.py)

In [None]:
# Import required libraries
import argparse
import gzip
import re
import sys
import pandas as pd

In [None]:
# Location of the VEP-annotated VCF to parse (edit before running).
# A path ending in '.gz' is treated as gzip-compressed when the file is opened.
vcf = "/path/to/input_vep_annotated.vcf"

In [None]:
# Open VCF file (supports .gz and uncompressed).
# gzip.open() defaults to binary mode and would yield bytes; 'rt' makes the
# compressed path return str lines, matching the plain open() branch.
f = gzip.open(vcf, 'rt') if vcf.endswith('.gz') else open(vcf)

In [None]:
# Read all lines from the VCF file into memory, then release the file handle
# (closed even if the read fails, so the descriptor is not leaked).
try:
    lines = f.readlines()
finally:
    f.close()

In [None]:
# Parse the VCF header and collect the CSQ annotations of every variant line.
#
# Results (module-level names):
#   vep_field_names  -- CSQ sub-field names, taken from the text after
#                       'Format: ' on the header line containing 'ID=CSQ'
#   header           -- column name -> column index, from the '#CHROM ...' line
#   annotations_list -- one list per data line that has a CSQ entry; each list
#                       holds one {field name: value} dict per CSQ record
vep_field_names = None
header = None
annotations_list = []

# INFO entries are ';'-separated; the lookahead only splits where a word
# character follows the ';'. Raw string: '\w' is not a valid escape in a
# plain string literal. Compiled once, outside the loop.
info_entry_sep = re.compile(r';(?=\w)')

for line in lines:
    line = line.strip()
    if not line:
        # A blank line has no columns and would fail the INFO lookup below.
        continue
    if line.startswith('#'):
        line = line.lstrip('#')
        if 'ID=CSQ' in line:
            vep_field_names = line.split('Format: ')[-1].strip('">').split('|')
        if line.startswith('CHROM'):
            columns = line.split()
            header = {name: idx for idx, name in enumerate(columns)}
        continue

    # First data line reached: both header pieces must have been seen by now.
    if vep_field_names is None:
        sys.stderr.write("VCF file does not have a VEP header line. Exiting.\n")
        sys.exit(1)
    if header is None:
        sys.stderr.write("VCF file does not have a header line (CHROM POS etc.). Exiting.\n")
        sys.exit(1)

    # Build {key: value} from the INFO column; an entry without '=' (a flag)
    # is stored under its own name with itself as the value.
    fields = line.split('\t')
    info_field = {}
    for entry in info_entry_sep.split(fields[header['INFO']]):
        key, sep, value = entry.partition('=')
        info_field[key] = value if sep else entry

    # Only process lines with CSQ annotation.
    # NOTE: lines without CSQ are skipped, so annotations_list can be shorter
    # than the number of data lines in the file.
    if 'CSQ' not in info_field:
        continue

    # One dict per comma-separated CSQ record; records whose number of
    # '|'-separated values differs from the header's field count are dropped.
    expected_len = len(vep_field_names)
    annotations = []
    for record in info_field['CSQ'].split(','):
        values = record.split('|')
        if len(values) == expected_len:
            annotations.append(dict(zip(vep_field_names, values)))
    annotations_list.append(annotations)

In [None]:
# Collapse each variant's annotations into one comma-separated string of its
# distinct LoF values. The values are sorted because the iteration order of a
# set of strings can differ between interpreter runs, which would make the
# joined text (and the exported file) non-reproducible.
# An annotation dict without a 'LoF' key contributes the string 'None'.
LoF = [
    ','.join(sorted({str(annotation.get('LoF')) for annotation in a_list}))
    for a_list in annotations_list
]
print(len(LoF))

In [None]:
# Create a DataFrame with chrom, pos, id, ref, alt, and LoF annotation
vcf_pos = pd.read_csv(vcf, comment='#', delimiter='\t', header=None)

# Take an explicit copy of the first five columns so that adding a column
# below does not write into a slice of vcf_pos (avoids SettingWithCopyWarning).
vcf_pos_only = vcf_pos.iloc[:, :5].copy()
vcf_pos_only.columns = ['chrom', 'pos', 'ID', 'ref', 'alt']

# LoF is assigned by position, so it must have exactly one entry per row.
# It is built only from records that carry a CSQ annotation, whereas this
# table has one row per record read by pandas; fail with an explanation
# rather than a bare length error when the two disagree.
if len(LoF) != len(vcf_pos_only):
    raise ValueError(
        "Cannot attach LoF values: %d annotated records but %d rows in the VCF table "
        "(records without a CSQ annotation are not included in LoF)."
        % (len(LoF), len(vcf_pos_only))
    )
vcf_pos_only['LoF_val'] = LoF

# Export the DataFrame to a tab-delimited file
vcf_out = '/path/to/output_vep_annot_cleaned.txt'
vcf_pos_only.to_csv(vcf_out, sep='\t', index=False)