## Collecting data from gnomAD

Variant data were downloaded from [gnomAD](https://gnomad.broadinstitute.org/downloads#v4).

Use the command line for the initial processing.

1. Keep only variants whose `FILTER` value is `PASS` or missing (`.`)

```zsh
bcftools view -f 'PASS,.' gnomad.exomes.v4.0.sites.chr22.vcf.bgz > filtered_gnomad22.bgz
```
For the subsequent parsing we can use either the **command line** or **Python**.

2. Extract the necessary data

For this step we need the **bcftools** utility:

```zsh
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%AC\t%AC_afr\t%AC_amr\t%AC_nfe
\t%AC_asj\t%AC_sas\t%AC_eas\t%AC_mid\t%AC_fin\t%AN\t%AN_afr\t%AN_amr\t%AN_nfe\t%AN_asj
\t%AN_sas\t%AN_eas\t%AN_mid\t%AN_fin\t%AF\t%AF_afr\t%AF_amr\t%AF_nfe\t%AF_asj\t%AF_sas
\t%AF_eas\t%AF_mid\t%AF_fin\t%vep=\n' filtered_gnomad22.bgz > processed_gnomad22.vcf.bgz
```


In [2]:
import numpy as np
import pandas as pd
import cyvcf2
import csv

In [18]:
# Input VCF.bgz file path
vcf_path = 'data/test.bgz'
output_csv_path = 'gnomad_data.csv'

# Define columns to extract
columns_to_extract = ['CHROM', 'POS', 'ID', 'REF', 'ALT']

# Define INFO fields to extract
info_fields_to_extract = ['AC', 'AC_afr', 'AC_amr', 'AC_nfe', 'AC_asj', 'AC_sas',
                         'AC_eas', 'AC_mid', 'AC_fin', 'AN', 'AN_afr', 'AN_amr', 'AN_nfe', 'AN_asj', 'AN_sas',
                         'AN_eas', 'AN_mid', 'AN_fin', 'AF', 'AF_afr', 'AF_amr', 'AF_nfe',
                         'AF_asj', 'AF_sas', 'AF_eas', 'AF_mid', 'AF_fin']

# Positions of the sub-fields kept from each '|'-delimited VEP entry.
# NOTE(review): these positions assume the field order declared in the `vep`
# INFO header line of the input file -- confirm against the header before
# running on a different release.
vep_field_indices = [3, 1, 5, 7, 8, 9, 42, 43, 44]


def summarize_vep(vep_annotation, field_indices=vep_field_indices):
    """Condense a raw `vep` INFO string into a compact summary string.

    The raw value is split on ',' into individual entries and each entry is
    split on '|'. From every entry only the positions in `field_indices` are
    kept (a position past the end of the entry yields an empty string) and
    joined with ','. The per-entry strings are then joined with ';'.

    Parameters
    ----------
    vep_annotation : str or None
        Raw value of the `vep` INFO field; a falsy value means "no annotation".
    field_indices : sequence of int
        Positions of the '|'-separated sub-fields to keep, in output order.

    Returns
    -------
    str
        The ';'-joined summary, or '' when `vep_annotation` is falsy.
    """
    if not vep_annotation:
        return ''
    entries = []
    for annotation in vep_annotation.split(','):
        vep_info = annotation.split('|')
        entries.append(','.join(vep_info[i] if i < len(vep_info) else ''
                                for i in field_indices))
    return ';'.join(entries)


# Open VCF.bgz file using cyvcf2
vcf_reader = cyvcf2.VCF(vcf_path)

# Extract data from the VCF.bgz file
data = []
try:
    for variant in vcf_reader:
        # Only the first ALT allele is kept; a record without any ALT allele
        # gets the '.' placeholder instead of raising IndexError.
        first_alt = variant.ALT[0] if variant.ALT else '.'
        variant_data = [variant.CHROM, variant.POS, variant.ID, variant.REF, first_alt]
        # INFO keys absent from a record are written as '.'
        info_data = [variant.INFO.get(field, '.') for field in info_fields_to_extract]
        data.append(variant_data + info_data + [summarize_vep(variant.INFO.get('vep'))])
finally:
    # Release the underlying file handle even if parsing fails part-way.
    vcf_reader.close()

# Save the extracted data into a CSV file
with open(output_csv_path, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    header = columns_to_extract + info_fields_to_extract + ['VEP_annotation']
    writer.writerow(header)
    writer.writerows(data)

print(f"Extracted data has been saved to {output_csv_path}")

Extracted data has been saved to gnomad_data.csv


In [19]:
# Read the exported CSV back into a DataFrame and, as a spot check, show the
# combined VEP string stored in the row at position 30.
var_data = pd.read_csv('gnomad_data.csv', sep=',')
vep_sample = var_data['VEP_annotation'].iloc[30]
display(vep_sample)

'FRG1FP,non_coding_transcript_exon_variant,Transcript,unprocessed_pseudogene,9/9,,,,;FRG1FP,downstream_gene_variant,Transcript,transcribed_pseudogene,,,,,'

In [28]:
# Names given to the comma-separated pieces of each VEP entry, by position.
vep_field_names = ['SYMBOL', 'Consequence', 'Feature_Type', 'BIOTYPE', 'EXON',
                   'INTRON', 'LoF', 'LoF_filter', 'LoF_flags']

# One row per ';'-separated VEP entry. `explode` repeats the original row
# label for every entry and keeps a single NaN row for variants without an
# annotation. This replaces `split(expand=True).stack()`, which padded every
# variant to the maximum entry count and depended on the legacy NaN-dropping
# behaviour of `DataFrame.stack`.
var_data_proc = var_data['VEP_annotation'].str.split(';').explode().to_frame('VEP')

# Left join on the row label: each variant row is repeated once per entry.
result = var_data.drop('VEP_annotation', axis=1).join(var_data_proc)

# Split every entry into its positional pieces (integer column labels 0..k).
# `new_columns` shares `result`'s (duplicated) index exactly, so the column
# assignment below is positional and needs no reindexing.
new_columns = result['VEP'].str.split(',', expand=True)
result[new_columns.columns] = new_columns

result = result.drop('VEP', axis=1)

# Replace the positional labels with descriptive names; labels that are not
# present in the frame are ignored by `rename`.
result = result.rename(columns=dict(enumerate(vep_field_names)))

display(result)

Unnamed: 0,CHROM,POS,ID,REF,ALT,AC,AC_afr,AC_amr,AC_nfe,AC_asj,...,AF_fin,SYMBOL,Consequence,Feature_Type,BIOTYPE,EXON,INTRON,LoF,LoF_filter,LoF_flags
0,chr22,10736362,rs879131185,C,T,63,0,1,50,0,...,0.20000000298023224,U2,upstream_gene_variant,Transcript,snRNA,,,,,
1,chr22,10736399,rs1206833908,T,C,442,13,5,298,5,...,0.5,U2,upstream_gene_variant,Transcript,snRNA,,,,,
2,chr22,10736400,rs1289580688,G,A,15,0,0,9,0,...,0.022727299481630325,U2,upstream_gene_variant,Transcript,snRNA,,,,,
3,chr22,10736414,,G,C,1,0,0,1,0,...,0.0,U2,upstream_gene_variant,Transcript,snRNA,,,,,
4,chr22,10736415,,C,T,1,0,0,1,0,...,0.0,U2,upstream_gene_variant,Transcript,snRNA,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2367,chr22,15562681,,C,A,1,0,0,0,0,...,0.0,,non_coding_transcript_exon_variant,Transcript,processed_pseudogene,1/1,,,,
2367,chr22,15562681,,C,A,1,0,0,0,0,...,0.0,,upstream_gene_variant,Transcript,lncRNA,,,,,
2368,chr22,15562681,rs1986857581,C,T,4,0,0,2,0,...,1.9197499568690546e-05,,intron_variant&non_coding_transcript_variant,Transcript,unprocessed_pseudogene,,2/3,,,
2368,chr22,15562681,rs1986857581,C,T,4,0,0,2,0,...,1.9197499568690546e-05,,non_coding_transcript_exon_variant,Transcript,processed_pseudogene,1/1,,,,
