In [None]:
import pandas as pd
import pyfastx
from tqdm import tqdm
from varseek.utils import convert_mutation_cds_locations_to_cdna, convert_mutation_cds_locations_to_cdna_old, count_leading_Ns

input_df_path = "/Users/joeyrich/Desktop/local/varseek/data/reference/cosmic/CancerMutationCensus_AllData_Tsv_v101_GRCh37/CancerMutationCensus_AllData_v101_GRCh37_mutation_workflow.csv"
cdna_fasta_path = "/Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release93/Homo_sapiens.GRCh37.cdna.all.fa"
cds_fasta_path = "/Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release93/Homo_sapiens.GRCh37.cds.all.fa"

cdna = pyfastx.Fastx(cdna_fasta_path)
cds = pyfastx.Fastx(cds_fasta_path)

input_df = pd.read_csv(input_df_path, usecols=["seq_ID", "mutation"])
output_df, _ = convert_mutation_cds_locations_to_cdna(input_df, cdna_fasta_path, cds_fasta_path, output_csv_path=None, verbose=True, strip_leading_Ns_cds=False)
output_df_old, _ = convert_mutation_cds_locations_to_cdna(input_df, cdna_fasta_path, cds_fasta_path, output_csv_path=None, verbose=True, strip_leading_Ns_cds=True)

output_df = output_df.merge(output_df_old, on=["seq_ID", "mutation"], suffixes=["_new", "_old"])  # new has Ns, old has no Ns (ie stripped off)

20:43:18 - INFO - Copying df internally to avoid in-place modifications
20:43:18 - INFO - Removing unknown mutations
20:43:22 - INFO - Removing unsupported mutation types
20:43:22 - INFO - Uncertain mutations: 0
20:43:22 - INFO - Ambiguous position mutations: 0
20:43:22 - INFO - Intronic mutations: 368
20:43:22 - INFO - Posttranslational region mutations: 0
20:43:22 - INFO - Sorting df
20:43:25 - INFO - Determining mutation positions
20:43:37 - INFO - Removing version numbers in in fasta headers for /Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release93/Homo_sapiens.GRCh37.cdna.all.fa
20:43:38 - INFO - Building pyfastx index for /Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release93/Homo_sapiens.GRCh37.cdna.all.fa
20:43:38 - INFO - Removing version numbers in in fasta headers for /Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release93/Homo_sapiens.GRCh37.cds.all.fa
20:43:38 - INFO - Building pyfastx index for /Users/jo

In [2]:
output_df_cosmic = output_df.copy()
output_df_old_cosmic = output_df_old.copy()

In [24]:
output_df = output_df.merge(output_df_old, on=["seq_ID", "mutation"], suffixes=["_new", "_old"])  # new has Ns, old has no Ns (ie stripped off)

In [25]:
output_df.head()

Unnamed: 0,seq_ID,mutation,mutation_cdna_new,mutation_cdna_old,mutation_cdna
0,ENST00000396153,c.1468C>T,c.1920C>T,c.1920C>T,c.1920C>T
1,ENST00000445907,c.162C>A,c.690C>A,c.690C>A,c.690C>A
2,ENST00000445907,c.617A>T,c.1145A>T,c.1145A>T,c.1145A>T
3,ENST00000445907,c.93A>T,c.621A>T,c.621A>T,c.621A>T
4,ENST00000445907,c.956A>C,c.1484A>C,c.1484A>C,c.1484A>C


In [27]:
output_df_filtered = output_df.loc[output_df["mutation_cdna_new"] != output_df["mutation_cdna_old"]]
# output_df_filtered = output_df_filtered.loc[(~output_df_filtered["mutation_cdna_new"].isna()) & (~output_df_filtered["mutation_cdna_old"].isna())]
# output_df_filtered = output_df_filtered.loc[output_df_filtered["mutation_cdna_new"].str.contains(">")]  # keep only substitutions
# output_df_filtered[['cds_pos', 'ref_base_cds']] = output_df_filtered['mutation'].str.extract(r'c\.(\d+)([ACGT])')
# output_df_filtered[['cdna_pos_new', 'ref_base_new_irrelevant']] = output_df_filtered['mutation_cdna_new'].str.extract(r'c\.(\d+)([ACGT])')
# output_df_filtered[['cdna_pos_old', 'ref_base_old_irrelevant']] = output_df_filtered['mutation_cdna_old'].str.extract(r'c\.(\d+)([ACGT])')
# output_df_filtered.drop(columns=["ref_base_new_irrelevant", "ref_base_old_irrelevant"], inplace=True)
output_df_filtered

Unnamed: 0,seq_ID,mutation,mutation_cdna_new,mutation_cdna_old,mutation_cdna
6113,ENST00000403951,c.1437-1_1437insAATAATAA,,,
91550,ENST00000264029,c.1705-1_1705insC,,,
116766,ENST00000243077,c.12439_12439+1insTGACTAACCCCT,,,
126093,ENST00000323301,c.129_129+1insTTTAC,,,
130508,ENST00000371372,c.284_284+1insTCCATTCGAGTATATGATCCTGGCCACCATC,,,
...,...,...,...,...,...
5419170,ENST00000318967,c.2822-1dup,,,
5419200,ENST00000457052,c.1-31_17dup,,,
5419215,ENST00000265140,c.2420-1_2420dup,,,
5419230,ENST00000371953,c.80-8_92dup,,,


In [16]:
output_df_filtered.head(2)

Unnamed: 0,seq_ID,mutation,mutation_cdna_new,mutation_cdna_old,mutation_cdna,cds_pos,ref_base_cds,cdna_pos_new,cdna_pos_old


In [5]:
stripped_cds_victory_count = 0
raw_cds_victory_count = 0
both_cds_lose = 0
new_cdna_victory_count = 0
old_cdna_victory_count = 0
both_cdna_lose = 0

for row in tqdm(output_df_filtered.itertuples(), total=len(output_df_filtered)):
    try:
        selected_transcript = row.seq_ID
        selected_hgvsc = f"{row.seq_ID}:{row.mutation}"
        selected_position_new_cdna = int(row.cdna_pos_new) - 1
        selected_position_old_cdna = int(row.cdna_pos_old) - 1
        selected_cds_pos_withNs = int(row.cds_pos) - 1
        selected_ref_base_cds = row.ref_base_cds
        
        selected_sequence_cdna, selected_sequence_cds = None, None

        for name, seq in cdna:
            if selected_transcript in name:
                selected_sequence_cdna = str(seq)
                break

        for name, seq in cds:
            if selected_transcript in name:
                selected_sequence_cds = str(seq)
                break
        
        selected_base_cds_withNs = selected_sequence_cds[selected_cds_pos_withNs]
        cds_Ns = count_leading_Ns(selected_sequence_cds)
        selected_cds_pos_withoutNs = selected_cds_pos_withNs - cds_Ns
        selected_base_cds_withoutNs = selected_sequence_cds[selected_cds_pos_withoutNs]

        cdna_base_new = selected_sequence_cdna[selected_position_new_cdna]
        cdna_base_old = selected_sequence_cdna[selected_position_old_cdna]

        if cdna_base_new == cdna_base_old:
            continue  # uninteresting
        
        if cdna_base_new == selected_ref_base_cds:
            new_cdna_victory_count += 1
        elif cdna_base_old == selected_ref_base_cds:
            old_cdna_victory_count += 1
        else:
            both_cdna_lose += 1
        
        # if selected_base_cds_withNs == selected_base_cds_withoutNs:
        #     continue
        
        # if selected_base_cds_withNs == selected_ref_base_cds:
        #     raw_cds_victory_count += 1
        # elif selected_base_cds_withoutNs == selected_ref_base_cds:
        #     stripped_cds_victory_count += 1
        # else:
        #     both_cds_lose += 1
    except Exception as e:
        # print(f"Error processing {row.seq_ID}: {e}")
        continue

# print(f"raw_cds_victory_count: {raw_cds_victory_count}")
# print(f"stripped_cds_victory_count: {stripped_cds_victory_count}")
# print(f"both_cds_lose: {both_cds_lose}")
print(f"new_cdna_victory_count: {new_cdna_victory_count}")
print(f"old_cdna_victory_count: {old_cdna_victory_count}")
print(f"both_cdna_lose: {both_cdna_lose}")

0it [00:00, ?it/s]

new_cdna_victory_count: 0
old_cdna_victory_count: 0
both_cdna_lose: 0





In [2]:
import pandas as pd
import pyfastx
from tqdm import tqdm
from varseek.utils import convert_mutation_cds_locations_to_cdna, convert_mutation_cds_locations_to_cdna_old, count_leading_Ns

input_df_path = "/Users/joeyrich/Desktop/variants_transcriptome.parquet"
cdna_fasta_path = "/Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release113/Homo_sapiens.GRCh37.cdna.all.fa"
cds_fasta_path = "/Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release113/Homo_sapiens.GRCh37.cds.all.fa"

cdna = pyfastx.Fastx(cdna_fasta_path)
cds = pyfastx.Fastx(cds_fasta_path)

input_df = pd.read_parquet(input_df_path, columns=["transcript_ID", "variant"])
input_df.rename(columns={"transcript_ID": "seq_ID", "variant": "mutation"}, inplace=True)

output_df, _ = convert_mutation_cds_locations_to_cdna(input_df, cdna_fasta_path, cds_fasta_path, output_csv_path=None, verbose=True, strip_leading_Ns_cds=False)  # strip_leading_Ns_cds=False
output_df_old, _ = convert_mutation_cds_locations_to_cdna(input_df, cdna_fasta_path, cds_fasta_path, output_csv_path=None, verbose=True, strip_leading_Ns_cds=True)  # strip_leading_Ns_cds=True

output_df = output_df.merge(output_df_old, on=["seq_ID", "mutation"], suffixes=["_new", "_old"])  # new has Ns, old has no Ns (ie stripped off)

21:22:30 - INFO - Copying df internally to avoid in-place modifications
21:22:30 - INFO - Removing unknown mutations
21:22:30 - INFO - Removing unsupported mutation types
21:22:30 - INFO - Uncertain mutations: 0
21:22:30 - INFO - Ambiguous position mutations: 0
21:22:30 - INFO - Intronic mutations: 0
21:22:30 - INFO - Posttranslational region mutations: 0
21:22:30 - INFO - Sorting df
21:22:30 - INFO - Determining mutation positions
21:22:31 - INFO - Removing version numbers in in fasta headers for /Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release113/Homo_sapiens.GRCh37.cdna.all.fa
21:22:32 - INFO - Building pyfastx index for /Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release113/Homo_sapiens.GRCh37.cdna.all.fa
21:22:32 - INFO - Removing version numbers in in fasta headers for /Users/joeyrich/Desktop/local/varseek/data/reference/ensembl_grch37_release113/Homo_sapiens.GRCh37.cds.all.fa
21:22:32 - INFO - Building pyfastx index for /Users/j

In [3]:
output_df_filtered = output_df.loc[output_df["mutation_cdna_new"] != output_df["mutation_cdna_old"]]
output_df_filtered = output_df_filtered.loc[(~output_df_filtered["mutation_cdna_new"].isna()) & (~output_df_filtered["mutation_cdna_old"].isna())]
output_df_filtered = output_df_filtered.loc[output_df_filtered["mutation_cdna_new"].str.contains(">")]  # keep only substitutions
output_df_filtered[['cds_pos', 'ref_base_cds']] = output_df_filtered['mutation'].str.extract(r'c\.(\d+)([ACGT])')
output_df_filtered[['cdna_pos_new', 'ref_base_new_irrelevant']] = output_df_filtered['mutation_cdna_new'].str.extract(r'c\.(\d+)([ACGT])')
output_df_filtered[['cdna_pos_old', 'ref_base_old_irrelevant']] = output_df_filtered['mutation_cdna_old'].str.extract(r'c\.(\d+)([ACGT])')
output_df_filtered.drop(columns=["ref_base_new_irrelevant", "ref_base_old_irrelevant"], inplace=True)
output_df_filtered = output_df_filtered.head(100)
output_df_filtered.head(2)

Unnamed: 0,seq_ID,mutation,mutation_cdna_new,mutation_cdna_old,cds_pos,ref_base_cds,cdna_pos_new,cdna_pos_old
24,ENST00000466300,c.242C>A,c.240C>A,c.242C>A,242,C,240,242
25,ENST00000466300,c.248C>T,c.246C>T,c.248C>T,248,C,246,248


In [4]:
stripped_cds_victory_count = 0
raw_cds_victory_count = 0
both_cds_lose = 0
new_cdna_victory_count = 0
old_cdna_victory_count = 0
both_cdna_lose = 0

for row in tqdm(output_df_filtered.itertuples(), total=len(output_df_filtered)):
    try:
        selected_transcript = row.seq_ID
        selected_hgvsc = f"{row.seq_ID}:{row.mutation}"
        selected_position_new_cdna = int(row.cdna_pos_new) - 1
        selected_position_old_cdna = int(row.cdna_pos_old) - 1
        selected_cds_pos_withNs = int(row.cds_pos) - 1
        selected_ref_base_cds = row.ref_base_cds
        
        selected_sequence_cdna, selected_sequence_cds = None, None

        for name, seq in cdna:
            if selected_transcript in name:
                selected_sequence_cdna = str(seq)
                break

        for name, seq in cds:
            if selected_transcript in name:
                selected_sequence_cds = str(seq)
                break
        
        selected_base_cds_withNs = selected_sequence_cds[selected_cds_pos_withNs]
        cds_Ns = count_leading_Ns(selected_sequence_cds)
        selected_cds_pos_withoutNs = selected_cds_pos_withNs - cds_Ns
        selected_base_cds_withoutNs = selected_sequence_cds[selected_cds_pos_withoutNs]

        cdna_base_new = selected_sequence_cdna[selected_position_new_cdna]
        cdna_base_old = selected_sequence_cdna[selected_position_old_cdna]

        if cdna_base_new == cdna_base_old:
            continue  # uninteresting
        
        if cdna_base_new == selected_ref_base_cds:
            new_cdna_victory_count += 1
        elif cdna_base_old == selected_ref_base_cds:
            old_cdna_victory_count += 1
        else:
            both_cdna_lose += 1
        
        # if selected_base_cds_withNs == selected_base_cds_withoutNs:
        #     continue
        
        # if selected_base_cds_withNs == selected_ref_base_cds:
        #     raw_cds_victory_count += 1
        # elif selected_base_cds_withoutNs == selected_ref_base_cds:
        #     stripped_cds_victory_count += 1
        # else:
        #     both_cds_lose += 1
    except Exception as e:
        # print(f"Error processing {selected_hgvsc}: {e}")
        continue

# print(f"raw_cds_victory_count: {raw_cds_victory_count}")
# print(f"stripped_cds_victory_count: {stripped_cds_victory_count}")
# print(f"both_cds_lose: {both_cds_lose}")
print(f"new_cdna_victory_count: {new_cdna_victory_count}")
print(f"old_cdna_victory_count: {old_cdna_victory_count}")
print(f"both_cdna_lose: {both_cdna_lose}")

100%|██████████| 100/100 [00:14<00:00,  7.00it/s]

new_cdna_victory_count: 70
old_cdna_victory_count: 0
both_cdna_lose: 0





In [8]:
import pandas as pd

# Example DataFrame
hgvs_df = pd.DataFrame({
    "actual_variant": [
        "c.123A>G",
        "c.456del",
        "c.789_790insT",
        "c.135dup",
        "c.246_247delinsAT",
        "c.369G>A",
        "c.420_421insGG"
    ]
})

# Extract after "ins" or ">"
hgvs_df["after_ins_or_gt"] = hgvs_df["actual_variant"].str.extract(r"(?:ins|>)(.+)")
hgvs_df


Unnamed: 0,actual_variant,after_ins_or_gt
0,c.123A>G,G
1,c.456del,
2,c.789_790insT,T
3,c.135dup,
4,c.246_247delinsAT,AT
5,c.369G>A,A
6,c.420_421insGG,GG
