Init variables

In [21]:
data_root = "../data"

clin_var = f"{data_root}/clinvar.vcf.gz"
brca = f"{data_root}/brca.vcf"
targeted_vcf = f"{data_root}/targeted.vcf"


In [26]:
import requests
import os

# URL for the ClinVar VCF file for the BRCA1 and BRCA2 genes
url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz"

if not os.path.isfile(clin_var):
    # Send a GET request to the URL
    response = requests.get(url)
    # Save the response content as a .vcf.gz file
    with open(clin_var, 'wb') as f:
        f.write(response.content)

Extract gene related data

In [27]:
import pysam
import os

# Index the ClinVar VCF file
if not os.path.isfile(f"{clin_var}.tbi"):
    pysam.tabix_index(clin_var, preset="vcf")

# Open the ClinVar VCF file
vcf = pysam.VariantFile(clin_var)
print(vcf.header.info.keys())
# Open an output VCF file
out = pysam.VariantFile(brca, 'w', header=vcf.header)

# Loop over the records in the VCF file
for record in vcf:
    if not 'GENEINFO' in record.info.keys():
        continue
    # Check if the record is for the BRCA1 or BRCA2 gene
    if 'BRCA1' in record.info['GENEINFO'] or 'BRCA2' in record.info['GENEINFO']:
        # Write the record to the output file
        out.write(record)

# Close the VCF files
vcf.close()
out.close()


['AF_ESP', 'AF_EXAC', 'AF_TGP', 'ALLELEID', 'CLNDN', 'CLNDNINCL', 'CLNDISDB', 'CLNDISDBINCL', 'CLNHGVS', 'CLNREVSTAT', 'CLNSIG', 'CLNSIGCONF', 'CLNSIGINCL', 'CLNVC', 'CLNVCSO', 'CLNVI', 'DBVARID', 'GENEINFO', 'MC', 'ORIGIN', 'RS']


Get cancer related genes

In [43]:
import pysam

# Open the patient's VCF file and the BRCA VCF file from ClinVar
vcf_patient = pysam.VariantFile(targeted_vcf)
vcf_brca = pysam.VariantFile(brca)

# Create a set to store cancer-associated mutations
cancer_mutations = set()
cancer_mutations_map = {}

# Iterate over variants in the BRCA VCF file and add cancer-associated mutations to the 'cancer_mutations' set
for record in vcf_brca:
    clnsig = record.info.get('CLNSIG')
    mutation_type = record.info.get('CLNVC')
    classification = clnsig[0].split(':')[0] if clnsig else None
    cancer_mutations.add((record.chrom, record.pos, record.ref, record.alts))
    cancer_mutations_map[f"{record.chrom}_{record.pos}"] = f"{mutation_type}, {classification}"

# Iterate over variants in the patient's VCF file and check for cancer-associated mutations
for record in vcf_patient:
    row = record.chrom.replace('chr', ''), record.pos, record.ref, record.alts
    if row in cancer_mutations:
        print(cancer_mutations_map[f"{record.chrom.replace('chr', '')}_{record.pos}"], "detected:", record.chrom, record.pos, record.ref, record.alts)


Deletion, Benign detected: chr13 32900363 CT ('C',)
Deletion, Benign/Likely_benign detected: chr13 32903565 AT ('A',)
Deletion, Likely_benign detected: chr13 32905046 AT ('A',)
Deletion, Pathogenic detected: chr13 32905069 AT ('A',)
Deletion, Pathogenic detected: chr13 32905097 GA ('G',)
Deletion, Pathogenic detected: chr13 32906602 GA ('G',)
Deletion, Pathogenic detected: chr13 32906663 GA ('G',)
Deletion, Pathogenic detected: chr13 32906694 CA ('C',)
Indel, Pathogenic detected: chr13 32906729 A ('G',)
single_nucleotide_variant, Conflicting_interpretations_of_pathogenicity detected: chr13 32906733 A ('G',)
Deletion, Pathogenic detected: chr13 32907171 GT ('G',)
Deletion, Pathogenic detected: chr13 32907420 GA ('G',)
Deletion, Pathogenic detected: chr13 32911073 CA ('C',)
single_nucleotide_variant, Benign detected: chr13 32911888 A ('G',)
Deletion, Pathogenic detected: chr13 32912345 GA ('G',)
Deletion, Pathogenic detected: chr13 32912770 AT ('A',)
Indel, Conflicting_interpretations_of