# Gene analysis, Drugs - gget opentargets

In [1]:
# try:
#     import varseek as vk
# except ImportError:
#     print("varseek not found, installing...")
#     !pip install -U -q varseek
# try:
#     import RLSRWP_2025
# except ImportError:
#     print("RLSRWP_2025 not found, installing...")
#     !pip install -q git+https://github.com/pachterlab/RLSRWP_2025.git
# try:
#     import gget
# except ImportError:
#     print("gget not found, installing...")
#     !pip install -U -q gget

In [1]:
import os
import anndata as ad
import pandas as pd
import gget
import subprocess

from varseek.utils import convert_mutation_cds_locations_to_cdna, get_ensembl_gene_id_bulk

# Repository root, taken as the parent of the current working directory: when this
# notebook is run from RLSRWP_2025/notebooks/, this resolves to the RLSRWP_2025 directory.
RLSRWP_2025_dir = os.path.abspath(os.pardir)

In [2]:
# --- Analysis parameters ---
n = 3  # how many of the top-ranked variants are carried into the gene lookup
sorting_column = "vcrs_count"  # "vcrs_count" (sort by variant), "gene_count" (sort by gene), etc

# --- varseek count output (input AnnData) and where this notebook writes its results ---
data_root = os.path.join(RLSRWP_2025_dir, "data")
vk_count_out_dir = os.path.join(data_root, "vk_count_out")
adata_path = os.path.join(vk_count_out_dir, "adata_cleaned.h5ad")
out_dir = os.path.join(vk_count_out_dir, "analysis", "gget_opentargets")

# --- Reference files: COSMIC mutation table and Ensembl GRCh37 release 93 FASTA files ---
reference_dir = os.path.join(data_root, "reference")
cosmic_release_dir = os.path.join(reference_dir, "cosmic", "CancerMutationCensus_AllData_Tsv_v101_GRCh37")
cosmic_csv = os.path.join(cosmic_release_dir, "CancerMutationCensus_AllData_v101_GRCh37_mutation_workflow.csv")
ensembl_dir = os.path.join(reference_dir, "ensembl_grch37_release93")
sequences_cds = os.path.join(ensembl_dir, "Homo_sapiens.GRCh37.cds.all.fa")
sequences_cdna = os.path.join(ensembl_dir, "Homo_sapiens.GRCh37.cdna.all.fa")

In [4]:
# Ensure the output folder exists before anything is written to it, then load the AnnData file.
os.makedirs(out_dir, exist_ok=True)
adata = ad.read_h5ad(adata_path)

# For simplicity, keep only variants with an unambiguous identity: a ';' in `vcrs_header`
# marks an ambiguous one. Missing headers are treated as not containing ';' and are kept.
is_ambiguous = adata.var["vcrs_header"].str.contains(";", na=False)
unambiguous_var_names = adata.var.loc[~is_ambiguous].index
adata = adata[:, unambiguous_var_names].copy()

adata.var.head()

Unnamed: 0,vcrs_header,vcrs_count,gene_name
0,ENST00000391429:c.194C>T,0.0,BHLHA9
1,ENST00000391429:c.128C>A,0.0,BHLHA9
2,ENST00000391429:c.257C>A,0.0,BHLHA9
3,ENST00000391429:c.100G>A,0.0,BHLHA9
4,ENST00000391429:c.99G>A,0.0,BHLHA9


In [None]:
# Snapshot of the current var columns; compared further down to decide whether to re-save.
adata_var_columns = adata.var.columns

# Download whichever Ensembl GRCh37 (release 93) FASTA is missing with `gget ref`,
# then decompress the downloaded .gz next to it. cDNA is handled first, then CDS.
for download_message, sequence_type, fasta_path in (
    ("Downloading cDNA", "cdna", sequences_cdna),
    ("Downloading CDS", "cds", sequences_cds),
):
    if os.path.exists(fasta_path):
        continue
    print(download_message)
    # Fall back to the current directory when the FASTA path has no directory component.
    fasta_dir = os.path.dirname(fasta_path) or "."
    gget_ref_command = ["gget", "ref", "-w", sequence_type, "-r", "93", "--out_dir", fasta_dir, "-d", "human_grch37"]
    subprocess.run(gget_ref_command, check=True)
    subprocess.run(["gunzip", f"{fasta_path}.gz"], check=True)

# Fetch the COSMIC table with gget only when the expected CSV is not already on disk.
if not os.path.exists(cosmic_csv):
    print("Downloading COSMIC")
    cosmic_download_options = {
        "grch_version": 37,
        "cosmic_version": 101,
        # gget is given the directory two levels above the CSV as its output location
        "out": os.path.dirname(os.path.dirname(cosmic_csv)),
        "mutation_class": "cancer",
        "download_cosmic": True,
        "gget_mutate": True,
    }
    gget.cosmic(None, **cosmic_download_options)

# Read just a couple of rows: only the column names are needed for the check below.
cosmic_df = pd.read_csv(cosmic_csv, nrows=2)

needs_cdna_conversion = "mutation_cdna" not in cosmic_df.columns
if needs_cdna_conversion:
    print("Converting CDS to cDNA in COSMIC")
    # Input and output paths are the same file, so the COSMIC CSV is rewritten in place.
    _, _ = convert_mutation_cds_locations_to_cdna(
        input_csv_path=cosmic_csv,
        output_csv_path=cosmic_csv,
        cds_fasta_path=sequences_cds,
        cdna_fasta_path=sequences_cdna,
        verbose=True,
        strip_leading_Ns_cds=True,
    )

# Reload with only the columns used from here on, and build the "<seq_ID>:<mutation_cdna>"
# key that is matched against `vcrs_header` in adata.var.
cosmic_columns = ["seq_ID", "mutation_cdna", "gene_name"]
cosmic_df = pd.read_csv(cosmic_csv, usecols=cosmic_columns)
seq_ids = cosmic_df["seq_ID"]
cdna_mutations = cosmic_df["mutation_cdna"]
cosmic_df["vcrs_header"] = seq_ids + ":" + cdna_mutations

# Add the gene name for each variant when adata.var does not carry it yet.
# A lookup + map is used instead of DataFrame.merge: merge returns a frame with a fresh
# 0..n-1 index (which would replace the var names when assigned back to adata.var) and
# would multiply rows if a vcrs_header occurred more than once in the COSMIC table.
if "gene_name" not in adata.var.columns:
    gene_name_by_header = (
        cosmic_df.dropna(subset=["vcrs_header"])
        .drop_duplicates(subset="vcrs_header")  # first occurrence wins
        .set_index("vcrs_header")["gene_name"]
    )
    # Headers without a COSMIC match get NaN, as a left merge would have produced.
    adata.var["gene_name"] = adata.var["vcrs_header"].map(gene_name_by_header)

# gene_count = number of variants of that gene with a non-zero vcrs_count
# (0 for genes with none, or for variants without a gene name).
if sorting_column == "gene_count" and "gene_count" not in adata.var.columns:
    has_counts = adata.var["vcrs_count"] > 0
    gene_counts = adata.var.loc[has_counts, "gene_name"].value_counts()
    adata.var["gene_count"] = adata.var["gene_name"].map(gene_counts).fillna(0).astype(int)

# Persist only when a column was actually added above.
# NOTE(review): this writes the in-memory `adata` back to `adata_path`; if `adata` was
# subset after loading (e.g. ambiguous variants removed), the file on disk is replaced
# by that subset — confirm this is intended.
if set(adata_var_columns) != set(adata.var.columns):
    adata.write_h5ad(adata_path)

In [9]:
# Rank variants by `sorting_column`, highest first.
# The AnnData object itself is reordered (positional column indexing) instead of assigning
# a re-sorted frame to `adata.var`: replacing `.var` alone changes the annotation order but
# not the data matrix, so annotation rows would no longer describe the matching columns.
sorted_positions = (
    adata.var.reset_index(drop=True)  # positions 0..n_vars-1, independent of the var names
    .sort_values(by=sorting_column, ascending=False)
    .index.to_numpy()
)
adata = adata[:, sorted_positions].copy()
adata.var.head()

Unnamed: 0,vcrs_header,vcrs_count,gene_name
691684,ENST00000362079:c.760G>A,10166.0,MT-CO3
2675243,ENST00000393099:c.583G>A,7259.0,RPL13
2021384,ENST00000251453:c.455C>T,5370.0,RPS16
4543187,ENST00000423316:c.2076C>T,4317.0,EEF1D
2685771,ENST00000593646:c.53G>A,2736.0,AC040977.1


In [10]:
# Headers of the first `n` variants in the current adata.var order.
top_variants = adata.var["vcrs_header"].head(n).tolist()

# The transcript ID is the part of the header before the first ':'.
# dict.fromkeys drops repeated transcripts while keeping first-seen order.
top_transcripts = list(dict.fromkeys(variant_header.partition(":")[0] for variant_header in top_variants))
print(top_transcripts)

['ENST00000362079', 'ENST00000393099', 'ENST00000251453']


In [None]:
# Look up the Ensembl gene ID of every top transcript in one bulk request (human, GRCh37).
transcript_to_gene_mapping = get_ensembl_gene_id_bulk(
    top_transcripts,
    species="human",
    reference_version="grch37",
)

# Keep the genes in the same order as the transcripts; a transcript missing from the
# mapping raises KeyError here.
top_genes = [transcript_to_gene_mapping[transcript_id] for transcript_id in top_transcripts]
print(top_genes)

['ENSG00000198938', 'ENSG00000167526', 'ENSG00000105193']


## Open Targets

In [16]:
# Query the Open Targets "drugs" resource for each top gene. Non-empty results are saved
# as "<rank>_<gene>_opentargets.csv" in out_dir; genes without results are only reported.
for rank, gene_id in enumerate(top_genes):
    drugs_df = gget.opentargets(gene_id, resource="drugs")
    if drugs_df.empty:
        print(f"No data for {gene_id}")
        continue
    csv_path = os.path.join(out_dir, f"{rank}_{gene_id}_opentargets.csv")
    drugs_df.to_csv(csv_path)
    print(f"Saved {gene_id} to {out_dir}")

19:52:09 - INFO - Retrieved 0/0 known drugs. (Querying count, will fetch all results next.)
19:52:09 - INFO - Retrieved 0/0 known drugs.


No data for ENSG00000198938


19:52:09 - INFO - Retrieved 1/31 known drugs. (Querying count, will fetch all results next.)
19:52:10 - INFO - Retrieved 31/31 known drugs.


Saved ENSG00000167526 to /Users/joeyrich/Desktop/local/RLSRWP_2025/data/vk_count_out/analysis/gget_opentargets


19:52:10 - INFO - Retrieved 1/31 known drugs. (Querying count, will fetch all results next.)
19:52:11 - INFO - Retrieved 31/31 known drugs.


Saved ENSG00000105193 to /Users/joeyrich/Desktop/local/RLSRWP_2025/data/vk_count_out/analysis/gget_opentargets
