## Collecting data from gnomAD

Variant data were downloaded from [gnomAD](https://gnomad.broadinstitute.org/downloads#v4).

Use the command line for the initial processing.

1. Keep only variants whose FILTER is `PASS` (or unset, `.`)

```zsh
bcftools view -f 'PASS,.' -Oz -o filtered_gnomad22.bgz gnomad.exomes.v4.0.sites.chr22.vcf.bgz
```
For the subsequent parsing we can use either the **command line** or **Python**.

2. Extract the necessary data

For this step we need the **bcftools** utility.

```zsh
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%AC\t%AC_afr\t%AC_amr\t%AC_nfe'\
'\t%AC_asj\t%AC_sas\t%AC_eas\t%AC_mid\t%AC_fin\t%AN\t%AN_afr\t%AN_amr\t%AN_nfe\t%AN_asj'\
'\t%AN_sas\t%AN_eas\t%AN_mid\t%AN_fin\t%AF\t%AF_afr\t%AF_amr\t%AF_nfe\t%AF_asj\t%AF_sas'\
'\t%AF_eas\t%AF_mid\t%AF_fin\t%vep\n' filtered_gnomad22.bgz > processed_gnomad22.vcf.bgz
```


In [2]:
import csv

import cyvcf2
import numpy as np
import pandas as pd

In [7]:
# Flatten a gnomAD sites VCF into a TSV with one row per (variant, vep entry).
#
# NOTE(review): this opens the original download and the loop below never looks
# at FILTER, so the PASS filtering from step 1 is not applied here -- confirm
# whether the filtered file should be read instead.
vcf_path = 'data/gnomad.exomes.v4.0.sites.chr22.vcf.bgz'
vcf = cyvcf2.VCF(vcf_path)

# INFO tags read for every variant. 'vep' is handled separately below because
# it is split into several output columns.
info_fields_to_extract = [
    'AC', 'AC_afr', 'AC_amr', 'AC_nfe', 'AC_asj', 'AC_sas', 'AC_eas', 'AC_mid', 'AC_fin',
    'AN', 'AN_afr', 'AN_amr', 'AN_nfe', 'AN_asj', 'AN_sas', 'AN_eas', 'AN_mid', 'AN_fin',
    'AF', 'AF_afr', 'AF_amr', 'AF_nfe', 'AF_asj', 'AF_sas', 'AF_eas', 'AF_mid', 'AF_fin',
    'vep',
]

# 0-based position inside one '|'-separated vep entry -> output column name.
# Insertion order defines the order of the vep columns in the output.
vep_field_mapping = {
    1: 'Consequence', 3: 'SYMBOL',
    5: 'Feature_Type', 6: 'Feature', 7: 'BIOTYPE', 8: 'EXON', 9: 'INTRON',
    17: 'ALLELE_NUM', 24: 'CANONICAL',
    42: 'LoF', 43: 'LoF_filter', 44: 'LoF_flags', 45: 'LoF_info',
}

# The header is derived from the definitions above so it cannot drift away
# from the row layout produced in the loop.
site_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT']
plain_info_fields = [field for field in info_fields_to_extract if field != 'vep']
column_names = site_columns + plain_info_fields + list(vep_field_mapping.values())

# Placeholder vep columns for variants without any vep annotation, so those
# rows are exactly as wide as the header.
missing_vep_fields = ['.'] * len(vep_field_mapping)

# All output rows are accumulated here before being written.
data = []

# Incremented once per variant; not read anywhere in this cell.
counter = 1
for variant in vcf:
    # Only the first ALT allele is recorded.
    variant_data = [variant.CHROM, variant.POS, variant.ID, variant.REF, variant.ALT[0]]
    info_data = [variant.INFO.get(field, '.') for field in plain_info_fields]
    vep_annotation = variant.INFO.get('vep')

    if vep_annotation:
        # One output row per comma-separated vep entry.
        for transcript in vep_annotation.split(','):
            split_transcript = transcript.split('|')
            # Entries shorter than expected get '.' for the positions they lack.
            vep_fields = [
                split_transcript[index] if index < len(split_transcript) else '.'
                for index in vep_field_mapping
            ]
            data.append(variant_data + info_data + vep_fields)
    else:
        data.append(variant_data + info_data + missing_vep_fields)
    counter += 1

# The third dot-separated component from the end of the path names the output
# ('chr22' for the path above).
output_file_name = f"output_{vcf_path.split('.')[-3]}.tsv"
with open(output_file_name, 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    writer.writerow(column_names)
    writer.writerows(data)

KeyboardInterrupt: 

Check on test data

In [7]:
# Flatten the test VCF into a DataFrame with one row per (variant, vep entry).
vcf_path = 'data/test.bgz'
vcf = cyvcf2.VCF(vcf_path)

# INFO tags read for every variant. 'vep' is handled separately below because
# it is split into several output columns.
info_fields_to_extract = [
    'AC', 'AC_afr', 'AC_amr', 'AC_nfe', 'AC_asj', 'AC_sas', 'AC_eas', 'AC_mid', 'AC_fin',
    'AN', 'AN_afr', 'AN_amr', 'AN_nfe', 'AN_asj', 'AN_sas', 'AN_eas', 'AN_mid', 'AN_fin',
    'AF', 'AF_afr', 'AF_amr', 'AF_nfe', 'AF_asj', 'AF_sas', 'AF_eas', 'AF_mid', 'AF_fin',
    'vep',
]

# 0-based position inside one '|'-separated vep entry -> output column name.
# Insertion order defines the order of the vep columns in the DataFrame.
vep_field_mapping = {
    1: 'Consequence', 3: 'SYMBOL',
    5: 'Feature_Type', 6: 'Feature', 7: 'BIOTYPE', 8: 'EXON', 9: 'INTRON',
    17: 'ALLELE_NUM', 21: 'VARIANT_CLASS', 24: 'CANONICAL',
    42: 'LoF', 43: 'LoF_filter', 44: 'LoF_flags', 45: 'LoF_info',
}

# The column list is derived from the definitions above so it cannot drift
# away from the row layout produced in the loop.
site_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT']
plain_info_fields = [field for field in info_fields_to_extract if field != 'vep']
column_names = site_columns + plain_info_fields + list(vep_field_mapping.values())

# Placeholder vep columns for variants without any vep annotation, so those
# rows are exactly as wide as column_names instead of being padded by pandas.
missing_vep_fields = ['.'] * len(vep_field_mapping)

# All rows are accumulated here before the DataFrame is built.
data = []

# Incremented once per variant; not read anywhere in this cell.
counter = 1
for variant in vcf:
    # Only the first ALT allele is recorded.
    variant_data = [variant.CHROM, variant.POS, variant.ID, variant.REF, variant.ALT[0]]
    info_data = [variant.INFO.get(field, '.') for field in plain_info_fields]
    vep_annotation = variant.INFO.get('vep')

    if vep_annotation:
        # One row per comma-separated vep entry.
        for transcript in vep_annotation.split(','):
            split_transcript = transcript.split('|')
            # Entries shorter than expected get '.' for the positions they lack.
            vep_fields = [
                split_transcript[index] if index < len(split_transcript) else '.'
                for index in vep_field_mapping
            ]
            data.append(variant_data + info_data + vep_fields)
    else:
        data.append(variant_data + info_data + missing_vep_fields)
    counter += 1

vcf_df = pd.DataFrame(data, columns=column_names)

In [8]:
# Show the distinct values present in the VARIANT_CLASS column.
vcf_df.loc[:, 'VARIANT_CLASS'].unique()

array(['SNV', 'insertion', 'deletion', '.'], dtype=object)