# Variant analysis, OncoKB-Annotator

In [None]:
# try:
#     import varseek as vk
# except ImportError:
#     print("varseek not found, installing...")
#     !pip install -U -q varseek
# try:
#     import RLSRWP_2025
# except ImportError:
#     print("RLSRWP_2025 not found, installing...")
#     !pip install -q git+https://github.com/pachterlab/RLSRWP_2025.git

# !git clone https://github.com/oncokb/oncokb-annotator.git && cd oncokb-annotator && pip install -r requirements/common.txt -r requirements/pip3.txt

In [1]:
import os
import anndata as ad
import pandas as pd
import gget
import subprocess

from varseek.utils import download_box_url  # for functions used in both varseek and here

RLSRWP_2025_dir = os.path.dirname(os.path.abspath(""))  # if this notebook resides in RLSRWP_2025/notebooks/0_data_download.ipynb, then this retrieves RLSRWP_2025

In [4]:
oncokb_annotator_dir = os.path.join(os.path.dirname(RLSRWP_2025_dir), "oncokb-annotator")

vk_count_out_dir = os.path.join(RLSRWP_2025_dir, "data", "vk_count_out_fig1")
# maf_path = os.path.join(vk_count_out_dir, "variants.maf")  #!!! uncomment
maf_path = "/Users/joeyrich/Desktop/local/varseek/trash/variants.maf"  #!!! erase
output_maf_path = maf_path.replace(".maf", ".oncokb.maf")

out_dir = os.path.join(vk_count_out_dir, "analysis", "oncokb_annotator")

oncokb_api_key = os.getenv("ONCOKB_API_KEY")

### Download the MAF file

In [5]:
if not os.path.exists(maf_path):
    maf_url = ""  #!!!
    download_box_url(maf_url, output_file_name=maf_path)

### Run the OncoKB Annotator

In [6]:
if oncokb_api_key is None:
    raise ValueError("Please set the ONCOKB_API_KEY environment variable, or define the oncokb_api_key variable.")

In [8]:
!python {oncokb_annotator_dir}/MafAnnotator.py -i {maf_path} -o {output_maf_path} -b {oncokb_api_key} -r grch37

INFO:MafAnnotator:annotating /Users/joeyrich/Desktop/local/varseek/trash/variants.maf ...
INFO:AnnotatorCore:Your OncoKB API token is valid and will expire on 2025-05-30 21:51:25 UTC
INFO:AnnotatorCore:Cancer type for the sample should be defined for a more accurate result. 	Line 1
INFO:AnnotatorCore:Cancer type for the sample should be defined for a more accurate result. 	Line 2
INFO:AnnotatorCore:Cancer type for the sample should be defined for a more accurate result. 	Line 3
INFO:AnnotatorCore:Cancer type for the sample should be defined for a more accurate result. 	Line 4
INFO:AnnotatorCore:Cancer type for the sample should be defined for a more accurate result. 	Line 5
INFO:AnnotatorCore:Cancer type for the sample should be defined for a more accurate result. 	Line 6
INFO:AnnotatorCore:Cancer type for the sample should be defined for a more accurate result. 	Line 7
INFO:AnnotatorCore:Cancer type for the sample should be defined for a more accurate result. 	Line 8
INFO:AnnotatorCor

### Run Funcotator

In [None]:
gatk = "/home/jmrich/opt/gatk-4.6.0.0/gatk"

info_fields = ["NS", "AO"]
# vcf_path = os.path.join(vk_count_out_dir, "variants.vcf")  #!!! uncomment
vcf_path = "/Users/joeyrich/Desktop/local/varseek/variants.vcf"  #!!! erase
vcf_path_output = vcf_path.replace(".vcf", ".funcotated.vcf")
table_path_output = vcf_path.replace(".vcf", ".funcotated.table")
funcotator_columns_file = table_path_output.replace(".table", "_columns.txt")

reference_genome = os.path.join(RLSRWP_2025_dir, "data", "reference", "ensembl_grch37_release93", "Homo_sapiens.GRCh37.dna.primary_assembly.fa")
data_source = "somatic"  # somatic or germline
data_sources_path = os.path.join(RLSRWP_2025_dir, "data", "reference", f"gatk_data_sources_{data_source}", f"funcotator_dataSources.v1.8.hg19.20230908{data_source[0]}")

In [None]:
if not os.path.exists(vcf_path):
    vcf_url = ""  #!!!
    download_box_url(vcf_url, output_file_name=vcf_path)

In [None]:
if not os.path.exists(data_sources_path):
    if os.path.dirname(data_sources_path):
        os.makedirs(os.path.dirname(data_sources_path), exist_ok=True)
    !$gatk FuncotatorDataSourceDownloader --{data_source} --hg19 --validate-integrity --extract-after-download -O {data_sources_path}.tar.gz

In [None]:
# Annotate using Funcotator
#* output-file-format can be VCF or MAF
!$gatk Funcotator \
    --variant {vcf_url} \
    --reference {reference_genome} \
    --ref-version hg19 \
    --data-sources-path {data_sources_path} \
    --output {vcf_path_output} \
    --output-file-format VCF \
    --disable-sequence-dictionary-validation

### Convert to table

In [None]:
variants_to_table_command = f"{gatk} VariantsToTable -V {vcf_path_output} -O {table_path_output} "
for field in info_fields:
    variants_to_table_command += f"-F {field} "
variants_to_table_command += "-F FUNCOTATION"

!{variants_to_table_command}

### Clean up column formatting

In [None]:
!cat {vcf_path_output} | grep " Funcotation fields are: " | sed 's/|/\t/g' > {funcotator_columns_file}
!cat {table_path_output} | cut -f {number_of_info_fields+1} | sed 's/|/\t/g' >> {funcotator_columns_file}