# Mutation analysis, Protein Mutation - gget elm

In [8]:
# Optional dependency bootstrap (currently disabled).
# Uncomment to install varseek, RLSRWP_2025 and gget when they cannot be imported.
# try:
#     import varseek as vk
# except ImportError:
#     print("varseek not found, installing...")
#     !pip install -U -q varseek
# try:
#     import RLSRWP_2025
# except ImportError:
#     print("RLSRWP_2025 not found, installing...")
#     !pip install -q git+https://github.com/pachterlab/RLSRWP_2025.git
# try:
#     import gget
# except ImportError:
#     print("gget not found, installing...")
#     !pip install -U -q gget

In [None]:
import os
import subprocess
import shutil
import anndata as ad
import pandas as pd
import gget
import varseek as vk
from varseek.utils import convert_mutation_cds_locations_to_cdna

# Repository root, computed as the parent of the current working directory
# (os.path.abspath("") resolves to the working directory, not to this file's location).
# Assumes the kernel is started from RLSRWP_2025/notebooks - TODO confirm when running from elsewhere.
RLSRWP_2025_dir = os.path.dirname(os.path.abspath(""))  # if this notebook resides in RLSRWP_2025/notebooks/, then this retrieves RLSRWP_2025

### File path definitions and imports

In [10]:
# Number of variants to keep (those with the largest vcrs_count are selected later on)
n = 3

data_dir = os.path.join(RLSRWP_2025_dir, "data")

# Inputs/outputs that live under the vk count output folder
vk_count_out_dir = os.path.join(data_dir, "vk_count_out")
adata_path = os.path.join(vk_count_out_dir, "adata_cleaned.h5ad")
out_dir = os.path.join(vk_count_out_dir, "analysis", "gget_elm")

# Reference files: COSMIC mutation table and Ensembl GRCh37 release 93 FASTAs
reference_dir = os.path.join(data_dir, "reference")
ensembl_grch37_dir = os.path.join(reference_dir, "ensembl_grch37_release93")

cosmic_csv = os.path.join(
    reference_dir,
    "cosmic",
    "CancerMutationCensus_AllData_Tsv_v101_GRCh37",
    "CancerMutationCensus_AllData_v101_GRCh37_mutation_workflow.csv",
)
sequences_cds = os.path.join(ensembl_grch37_dir, "Homo_sapiens.GRCh37.cds.all.fa")
sequences_cdna = os.path.join(ensembl_grch37_dir, "Homo_sapiens.GRCh37.cdna.all.fa")

### Reference file downloads

In [37]:
# Fetch whichever reference files are missing: Ensembl cDNA and CDS FASTAs first, then COSMIC
for fasta_label, fasta_kind, fasta_path in (
    ("cDNA", "cdna", sequences_cdna),
    ("CDS", "cds", sequences_cds),
):
    if os.path.exists(fasta_path):
        continue
    print(f"Downloading {fasta_label}")
    fasta_dir = os.path.dirname(fasta_path) or "."
    subprocess.run(
        ["gget", "ref", "-w", fasta_kind, "-r", "93", "--out_dir", fasta_dir, "-d", "human_grch37"],
        check=True,
    )
    # the download is expected to arrive as <fasta_path>.gz; decompress it in place
    subprocess.run(["gunzip", f"{fasta_path}.gz"], check=True)

if not os.path.exists(cosmic_csv):
    print("Downloading COSMIC")
    gget.cosmic(
        None,
        grch_version=37,
        cosmic_version=101,
        out=os.path.dirname(os.path.dirname(cosmic_csv)),  # two levels above the CSV
        mutation_class="cancer",
        download_cosmic=True,
    )

# Read just two rows to learn which columns the CSV already contains
cosmic_preview = pd.read_csv(cosmic_csv, nrows=2)
available_columns = cosmic_preview.columns

if "mutation_cdna" not in available_columns:
    print("Converting CDS to cDNA in COSMIC")
    # the converter rewrites the CSV in place (input and output paths are the same)
    _, _ = convert_mutation_cds_locations_to_cdna(
        input_csv_path=cosmic_csv,
        output_csv_path=cosmic_csv,
        cds_fasta_path=sequences_cds,
        cdna_fasta_path=sequences_cdna,
        verbose=True,
    )

# Header columns are loaded when the preview shows them and derived otherwise
header_columns = ("vcrs_header_cdna", "vcrs_header_cds")
cosmic_df_cols = ["mutation_aa", "mutation", "mutation_cdna", "seq_ID"]
cosmic_df_cols += [column for column in header_columns if column in available_columns]

cosmic_df = pd.read_csv(cosmic_csv, usecols=cosmic_df_cols)

for header_column, mutation_column in zip(header_columns, ("mutation_cdna", "mutation")):
    if header_column not in cosmic_df.columns:
        # header format: "<seq_ID>:<mutation>"
        cosmic_df[header_column] = cosmic_df["seq_ID"] + ":" + cosmic_df[mutation_column]
# cosmic_df.to_csv(cosmic_csv, index=False)

### Load adata.var and merge in COSMIC annotations

In [38]:
adata = ad.read_h5ad(adata_path)

# For simplicity, filter out all variants with ambiguous identity
# (headers containing ';' - presumably several variants collapsed into one entry)
adata.var = adata.var.rename(columns={"vcrs_header": "vcrs_header_cdna"})
unambiguous_mask = ~adata.var["vcrs_header_cdna"].str.contains(";", na=False)
adata = adata[:, adata.var[unambiguous_mask].index].copy()

# keep only the top n variants (largest vcrs_count)
top_vcrs = adata.var["vcrs_count"].nlargest(n).index
adata = adata[:, top_vcrs].copy()

# COSMIC annotation columns that adata.var does not carry yet
columns_to_merge = [column for column in ["mutation_aa", "mutation", "seq_ID"] if column not in adata.var.columns]

# vcrs_header_cds is the join key of the later vk build merge, so a merge is also
# required when it is the only column missing.
if columns_to_merge or "vcrs_header_cds" not in adata.var.columns:
    columns_to_merge.extend(["vcrs_header_cdna", "vcrs_header_cds"])
    # One COSMIC row per cDNA header: duplicated keys would multiply rows in the left
    # merge and the result would no longer line up with the columns of adata.
    cosmic_annotations = cosmic_df[columns_to_merge].drop_duplicates(subset="vcrs_header_cdna")
    # NOTE: DataFrame.merge returns a fresh 0..n-1 index, so adata.var is re-indexed here
    # (visible in the displayed output); the ELM output file names use that positional index.
    adata.var = adata.var.merge(cosmic_annotations, on="vcrs_header_cdna", how="left", suffixes=("_original", ""))

adata.var.head()

Unnamed: 0,vcrs_header_cdna,vcrs_count,mutation_aa,mutation,seq_ID,vcrs_header_cds
0,ENST00000362079:c.760G>A,10166.0,p.V254I,c.760G>A,ENST00000362079,ENST00000362079:c.760G>A
1,ENST00000393099:c.583G>A,7259.0,p.A112T,c.334G>A,ENST00000393099,ENST00000393099:c.334G>A
2,ENST00000251453:c.455C>T,5370.0,p.G134=,c.402C>T,ENST00000251453,ENST00000251453:c.402C>T


### If adata.var doesn't already contain the full variant-containing sequences, build them with vk build

In [None]:
# Columns this cell adds to adata.var and that the ELM loop reads
aa_sequence_columns = ["wt_sequence_aa_until_first_stop_codon", "variant_sequence_aa_until_first_stop_codon"]

# Skip when the sequences are already present. Checking the columns this cell itself adds
# keeps the cell idempotent: re-running it no longer merges a second time (which would
# produce suffixed *_x / *_y duplicates).
if "vcrs_sequence_full" not in adata.var and not set(aa_sequence_columns).issubset(adata.var.columns):
    out_dir_vk_build = os.path.join(vk_count_out_dir, "vk_build_for_gget_elm")
    variants_updated_csv = os.path.join(out_dir_vk_build, "variants_updated.csv")

    # A cached CSV is only reusable when it covers every variant currently selected;
    # a file left over from a run with a different n / variant set would otherwise
    # be merged silently and leave NaN sequences behind.
    required_headers = set(adata.var["vcrs_header_cds"].dropna())
    cache_is_current = False
    if os.path.exists(variants_updated_csv):
        cached_headers = set(pd.read_csv(variants_updated_csv, usecols=["vcrs_header"])["vcrs_header"])
        cache_is_current = required_headers.issubset(cached_headers)

    if not cache_is_current:
        vk.build(
            variants=adata.var[["seq_ID", "mutation"]],
            sequences=sequences_cds,
            seq_id_column="seq_ID",
            var_column="mutation",
            out=out_dir_vk_build,
            save_variants_updated_csv=True,
            store_full_sequences=True,
            translate=True,
            merge_identical=False,
            use_IDs=False,
            remove_seqs_with_wt_kmers=False,
            optimize_flanking_regions=False,
            min_seq_len=None,
            overwrite=True
        )

    variants_updated_df = pd.read_csv(variants_updated_csv, usecols=["vcrs_header", "wt_sequence_aa_full", "vcrs_sequence_aa_full"])
    variants_updated_df = variants_updated_df.rename(columns={"vcrs_header": "vcrs_header_cds", "vcrs_sequence_aa_full": "variant_sequence_aa_full"})

    # keep sequence up until first stop codon (stop codon not recognized by ELM)
    variants_updated_df["wt_sequence_aa_until_first_stop_codon"] = variants_updated_df["wt_sequence_aa_full"].str.split("*").str[0]
    variants_updated_df["variant_sequence_aa_until_first_stop_codon"] = variants_updated_df["variant_sequence_aa_full"].str.split("*").str[0]

    # one row per header so the left merge cannot change the number of rows in adata.var
    variants_updated_df = variants_updated_df.drop_duplicates(subset="vcrs_header_cds")

    adata.var = adata.var.merge(variants_updated_df, on="vcrs_header_cds", how="left")

In [41]:
# Preview adata.var to check that the amino acid sequence columns were merged in
adata.var.head()

Unnamed: 0,vcrs_header_cdna,vcrs_count,mutation_aa,mutation,seq_ID,vcrs_header_cds,wt_sequence_aa_full,variant_sequence_aa_full,wt_sequence_aa_until_first_stop_codon,variant_sequence_aa_until_first_stop_codon
0,ENST00000362079:c.760G>A,10166.0,p.V254I,c.760G>A,ENST00000362079,ENST00000362079:c.760G>A,MTHQSHAYHIVKPSP*PLTGALSALLMTSGLAM*FHFHSITLLILG...,MTHQSHAYHIVKPSP*PLTGALSALLMTSGLAM*FHFHSITLLILG...,MTHQSHAYHIVKPSP,MTHQSHAYHIVKPSP
1,ENST00000393099:c.583G>A,7259.0,p.A112T,c.334G>A,ENST00000393099,ENST00000393099:c.334G>A,MAPSRNGMVLKPHFHKDWQRRVATWFNQPARKIRRRKARQAKARRI...,MAPSRNGMVLKPHFHKDWQRRVATWFNQPARKIRRRKARQAKARRI...,MAPSRNGMVLKPHFHKDWQRRVATWFNQPARKIRRRKARQAKARRI...,MAPSRNGMVLKPHFHKDWQRRVATWFNQPARKIRRRKARQAKARRI...
2,ENST00000251453:c.455C>T,5370.0,p.G134=,c.402C>T,ENST00000251453,ENST00000251453:c.402C>T,MPSKGPLQSVQVFGRKKTATAVAHCKRGNGLIKVNGRPLEMIEPRT...,MPSKGPLQSVQVFGRKKTATAVAHCKRGNGLIKVNGRPLEMIEPRT...,MPSKGPLQSVQVFGRKKTATAVAHCKRGNGLIKVNGRPLEMIEPRT...,MPSKGPLQSVQVFGRKKTATAVAHCKRGNGLIKVNGRPLEMIEPRT...


In [43]:
# Probe gget.elm with a dummy sequence and run gget's ELM setup only if the probe
# raises FileNotFoundError (presumably the local ELM files are not installed yet - TODO confirm).
# NOTE: when the setup is already in place, the probe executes a complete ELM query
# (the logged output shows a DIAMOND alignment for the dummy sequence).
try:
    gget.elm("TEST")
except FileNotFoundError:
    gget.setup('elm')

11:13:53 - INFO - ELM_Classes_Download_Version: 1.4, ELM_Classes_Download_Date: 2025-03-29 00:57:33.973942
11:13:53 - INFO - ELM_Instance_Download_Version: 1.4, ELM_Instance_Download_Date: 2025-03-29 02:55:38.424686
11:13:53 - INFO - ORTHO Compiling ortholog information...
11:13:53 - INFO - ORTHO Performing pairwise sequence alignment against ELM database using DIAMOND for 1 sequence(s)...
11:13:53 - INFO - Creating DIAMOND database and initiating alignment...


diamond version 2.1.8


11:13:54 - INFO - DIAMOND alignment complete.
11:13:54 - INFO - REGEX Finding regex motif matches...


In [45]:
# Run gget elm on the wild-type and the variant protein of every selected variant.
# Results are written under out_dir to '<index>_<header>_wt_elm' and '<index>_<header>_mutant_elm'.
# NOTE(review): header contains ':' and '>' (e.g. 'ENST...:c.760G>A'), which are not valid
# in Windows paths - confirm before running outside macOS/Linux.
for index, row in adata.var.iterrows():
    header = row['vcrs_header_cdna']
    sequences_by_label = {
        "wt": row['wt_sequence_aa_until_first_stop_codon'],
        "mutant": row['variant_sequence_aa_until_first_stop_codon'],
    }

    # The upstream left merges leave NaN when no sequence was found for a variant, and
    # truncating at the first stop codon can leave an empty string; neither can be
    # searched, so skip the variant with a message instead of failing inside gget.elm.
    unusable = [label for label, sequence in sequences_by_label.items() if not isinstance(sequence, str) or not sequence]
    if unusable:
        print(f"Skipping {header}: no amino acid sequence available for {', '.join(unusable)}")
        continue

    for label, sequence in sequences_by_label.items():
        gget.elm(sequence, out = os.path.join(out_dir, f'{index}_{header}_{label}_elm'))
    print(f"Saved {header} to {out_dir}")

11:14:14 - INFO - ELM_Classes_Download_Version: 1.4, ELM_Classes_Download_Date: 2025-03-29 00:57:33.973942
11:14:14 - INFO - ELM_Instance_Download_Version: 1.4, ELM_Instance_Download_Date: 2025-03-29 02:55:38.424686
11:14:14 - INFO - ORTHO Compiling ortholog information...
11:14:14 - INFO - ORTHO Performing pairwise sequence alignment against ELM database using DIAMOND for 1 sequence(s)...
11:14:14 - INFO - Creating DIAMOND database and initiating alignment...


diamond version 2.1.8


11:14:15 - INFO - DIAMOND alignment complete.
11:14:15 - INFO - REGEX Finding regex motif matches...
11:14:15 - INFO - ELM_Classes_Download_Version: 1.4, ELM_Classes_Download_Date: 2025-03-29 00:57:33.973942
11:14:15 - INFO - ELM_Instance_Download_Version: 1.4, ELM_Instance_Download_Date: 2025-03-29 02:55:38.424686
11:14:15 - INFO - ORTHO Compiling ortholog information...
11:14:15 - INFO - ORTHO Performing pairwise sequence alignment against ELM database using DIAMOND for 1 sequence(s)...
11:14:15 - INFO - Creating DIAMOND database and initiating alignment...


diamond version 2.1.8


11:14:16 - INFO - DIAMOND alignment complete.
11:14:16 - INFO - REGEX Finding regex motif matches...
11:14:16 - INFO - ELM_Classes_Download_Version: 1.4, ELM_Classes_Download_Date: 2025-03-29 00:57:33.973942
11:14:16 - INFO - ELM_Instance_Download_Version: 1.4, ELM_Instance_Download_Date: 2025-03-29 02:55:38.424686
11:14:16 - INFO - ORTHO Compiling ortholog information...
11:14:16 - INFO - ORTHO Performing pairwise sequence alignment against ELM database using DIAMOND for 1 sequence(s)...
11:14:16 - INFO - Creating DIAMOND database and initiating alignment...


Saved ENST00000362079:c.760G>A to /Users/joeyrich/Desktop/local/RLSRWP_2025/data/vk_count_out/analysis/gget_elm
diamond version 2.1.8


11:14:17 - INFO - DIAMOND alignment complete.
11:14:17 - INFO - REGEX Finding regex motif matches...
11:14:17 - INFO - ELM_Classes_Download_Version: 1.4, ELM_Classes_Download_Date: 2025-03-29 00:57:33.973942
11:14:17 - INFO - ELM_Instance_Download_Version: 1.4, ELM_Instance_Download_Date: 2025-03-29 02:55:38.424686
11:14:17 - INFO - ORTHO Compiling ortholog information...
11:14:17 - INFO - ORTHO Performing pairwise sequence alignment against ELM database using DIAMOND for 1 sequence(s)...
11:14:17 - INFO - Creating DIAMOND database and initiating alignment...


diamond version 2.1.8


11:14:18 - INFO - DIAMOND alignment complete.
11:14:18 - INFO - REGEX Finding regex motif matches...
11:14:18 - INFO - ELM_Classes_Download_Version: 1.4, ELM_Classes_Download_Date: 2025-03-29 00:57:33.973942
11:14:18 - INFO - ELM_Instance_Download_Version: 1.4, ELM_Instance_Download_Date: 2025-03-29 02:55:38.424686
11:14:18 - INFO - ORTHO Compiling ortholog information...
11:14:18 - INFO - ORTHO Performing pairwise sequence alignment against ELM database using DIAMOND for 1 sequence(s)...
11:14:18 - INFO - Creating DIAMOND database and initiating alignment...


Saved ENST00000393099:c.583G>A to /Users/joeyrich/Desktop/local/RLSRWP_2025/data/vk_count_out/analysis/gget_elm
diamond version 2.1.8


11:14:19 - INFO - DIAMOND alignment complete.
11:14:19 - INFO - REGEX Finding regex motif matches...
11:14:20 - INFO - ELM_Classes_Download_Version: 1.4, ELM_Classes_Download_Date: 2025-03-29 00:57:33.973942
11:14:20 - INFO - ELM_Instance_Download_Version: 1.4, ELM_Instance_Download_Date: 2025-03-29 02:55:38.424686
11:14:20 - INFO - ORTHO Compiling ortholog information...
11:14:20 - INFO - ORTHO Performing pairwise sequence alignment against ELM database using DIAMOND for 1 sequence(s)...
11:14:20 - INFO - Creating DIAMOND database and initiating alignment...


diamond version 2.1.8


11:14:20 - INFO - DIAMOND alignment complete.
11:14:20 - INFO - REGEX Finding regex motif matches...


Saved ENST00000251453:c.455C>T to /Users/joeyrich/Desktop/local/RLSRWP_2025/data/vk_count_out/analysis/gget_elm
