In [1]:
from varseek.utils import vcf_to_dataframe
import pandas as pd
from tqdm import tqdm
import varseek
import numpy as np
tqdm.pandas()

In [2]:
import varseek as vk

# Run varseek's build step on the test VCF and FASTA (both referenced by relative path).
# Per the build log, seq_id_column and var_column are joined as "seq_id_column:var_column"
# to form the variant header column.
vk.build(
    # inputs
    sequences="sequences_testing.fa",
    variants="vcf_testing.vcf",
    # columns used to build the variant header
    seq_id_column="chromosome",
    var_column="mutation_from_vcf",
    # output location; overwrite=True presumably replaces an existing output directory (confirm in varseek docs)
    out="vcf_testing_out_dir",
    overwrite=True,
)

[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse_filter] FILTER 'FAIL' is not defined in the header
01:47:23 - INFO - Using the seq_id_column:var_column 'chromosome:mutation_from_vcf' columns as the variant header column.
01:47:23 - INFO - Removing 0 duplications > k
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mutations["vcrs_sequence_kmer_length"] = mutations["vcrs_sequence"].apply(lambda x: len(x) if pd.notna(x) else 0)
01:47:23 - INFO - Removed 1 variant kmers with length less than 59...
01:47:23 - INFO - Removed 0 variant kmers containing more than 0 'N's...
        12 variants correctly recorded (75.00%)
        4 variants removed (25.00%)
          0 variants missing seq_id or var_column (0.000%

vcf_testing_out_dir/vcrs_t2g.txt already exists


In [None]:
# !vk build -v vcf_testing.vcf -s sequences_testing.fa -o vcf_testing_out_dir --seq_id_column chromosome --var_column mutation_from_vcf

In [2]:
# Exploratory cell: load the test VCF and derive a per-row variant notation with a row-wise helper.
# Input VCF and the column names used for the renamed / derived columns.
sample_vcf = "vcf_testing.vcf"  # "/Users/joeyrich/Downloads/sample.vcf"
seq_id_column = "chromosome"
var_id_column = "variant_id"
var_column = "mutation_genome"

# Parse the VCF into a DataFrame. The explode_alt / filter_empty_alt flags presumably yield one row
# per non-empty ALT allele, and additional_columns=True presumably keeps extra fields such as QUAL
# -- TODO confirm against varseek.utils.vcf_to_dataframe.
mutations = vcf_to_dataframe(sample_vcf, additional_columns=True, explode_alt=True, filter_empty_alt=True)
# sample_vcf_df['CHROM'] = sample_vcf_df['CHROM'].astype(str)
# Rename the VCF CHROM column (and ID, when var_id_column is set) to the configured names.
mutations.rename(columns={"CHROM": seq_id_column}, inplace=True)
if var_id_column:
    mutations.rename(columns={"ID": var_id_column}, inplace=True)
# NOTE(review): generate_mutation_notation_from_vcf_columns is not imported or defined in any
# visible cell -- confirm where it comes from before re-running on a fresh kernel.
mutations[var_column] = mutations.progress_apply(generate_mutation_notation_from_vcf_columns, axis=1)  #!! untested

[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse_filter] FILTER 'FAIL' is not defined in the header
[W::vcf_parse] Contig '' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::bcf_hrec_check] Invalid contig name: ""


ValueError: Error extracting ID

In [5]:
# Show the core VCF fields side by side with the derived notation column.
mutations.loc[:, ["chromosome", "POS", "variant_id", "REF", "ALT", "mutation_genome", "QUAL"]]

Unnamed: 0,chromosome,POS,variant_id,REF,ALT,mutation_genome,QUAL
0,19,11,single_sub_mid;g.11A>C,A,C,g.11A>C,1.0
1,19,22,"exploded_sub_mid;g.22G>A,g.22G>C,g.22G>T",G,A,g.22G>A,1.0
2,19,22,"exploded_sub_mid;g.22G>A,g.22G>C,g.22G>T",G,C,g.22G>C,1.0
3,19,22,"exploded_sub_mid;g.22G>A,g.22G>C,g.22G>T",G,T,g.22G>T,1.0
4,19,1,sub_begin;g.1T>A,T,A,g.1T>A,1.0
5,19,34,"exploded_ins_mid;g.34_35insA,g.34_35insGAC",G,GA,g.34_35insA,0.0
6,19,34,"exploded_ins_mid;g.34_35insA,g.34_35insGAC",G,GAC,g.34_35insAC,0.0
7,19,1,ins_begin;None,T,AGT,g.UNKNOWN,0.0
8,19,40,single_del_mid;g.40del,AG,A,g.41del,0.0
9,19,40,multi_del_mid;g.40_43del,AGCAT,A,g.41_44del,0.0


In [3]:
def add_variant_type_column_to_vcf_derived_df(sample_vcf_df):
    """Label every row of a VCF-derived DataFrame with a ``variant_type``.

    The frame is modified in place and nothing is returned. Reads the ``REF`` and
    ``ALT`` columns; writes ``REF_len``, ``ALT_len``, ``ALT_RC`` (reverse complement
    of ALT) and ``variant_type``. The label is one of "substitution", "deletion",
    "duplication", "insertion", "inversion", "delins", or "unknown" when no rule
    matches.

    Rules are evaluated in order and the first match wins, so the special cases
    (single-base duplication, inversion) are listed before the general cases they
    would otherwise fall under (insertion, delins).
    """
    ref = sample_vcf_df["REF"]
    alt = sample_vcf_df["ALT"]

    # Allele lengths are stored on the frame because later steps read them back.
    ref_len = ref.str.len()
    alt_len = alt.str.len()
    sample_vcf_df["REF_len"] = ref_len
    sample_vcf_df["ALT_len"] = alt_len

    alt_rc = alt.apply(varseek.utils.reverse_complement)
    sample_vcf_df["ALT_RC"] = alt_rc

    single_ref = ref_len == 1
    multi_ref = ref_len > 1
    single_alt = alt_len == 1
    multi_alt = alt_len > 1

    #!!! check for duplications of length > 1 later
    rules = [
        ("substitution", single_ref & single_alt),
        ("deletion", multi_ref & single_alt),
        # one-base duplication: the single inserted base repeats the REF base
        ("duplication", single_ref & (alt_len == 2) & (alt.str[1] == ref.str[0])),
        ("insertion", single_ref & multi_alt),
        # inversion: REF equals the reverse complement of ALT
        ("inversion", multi_ref & multi_alt & (ref == alt_rc)),
        ("delins", multi_ref & multi_alt),
    ]
    labels = [label for label, _ in rules]
    masks = [rule_mask for _, rule_mask in rules]

    sample_vcf_df["variant_type"] = np.select(masks, labels, default="unknown")
    
def add_variant_column_to_vcf_derived_df(sample_vcf_df, var_column="variant"):
    """Write an HGVS-style ``g.`` notation for every row into ``sample_vcf_df[var_column]``.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    sample_vcf_df : pandas.DataFrame
        Must contain ``POS`` (numeric), ``REF``, ``ALT`` and ``variant_type`` (as written by
        ``add_variant_type_column_to_vcf_derived_df``). ``REF_len`` is used when present and
        computed from ``REF`` otherwise.
    var_column : str
        Name of the column that receives the notation.

    Side effects
    ------------
    * ``POS`` is converted to ``str`` in place.
    * The helper columns ``REF_len``, ``ALT_len``, ``ALT_RC`` (and any column carrying one of the
      historical scratch names listed at the end) are dropped if present.

    Notes
    -----
    VCF anchors an indel on the base *before* the event, except when the event itself is at
    position 1, where the base *after* the event is used. A record at ``POS == 1`` is therefore
    only given the "starting at 1" treatment when it is not left-anchored (first ALT base differs
    from first REF base). Left-anchored records at ``POS == 1`` (e.g. ``TC>T``, ``T>TA``) describe
    an event at position 2 and go through the ordinary formulas.
    Rows that match no rule, and right-anchored insertions at position 1, get ``"g.unknown"``.
    """
    ref = sample_vcf_df["REF"]
    alt = sample_vcf_df["ALT"]
    pos = sample_vcf_df["POS"]
    variant_type = sample_vcf_df["variant_type"]
    ref_len = sample_vcf_df["REF_len"] if "REF_len" in sample_vcf_df.columns else ref.str.len()

    # Position strings used by the notation templates (kept as locals, not as scratch columns).
    pos_str = pos.astype(str)
    next_pos_str = (pos + 1).astype(str)  # first deleted base / right flank of an insertion
    last_ref_pos_str = (pos + ref_len - 1).astype(str)  # last base covered by REF
    deleted_len_str = (ref_len - 1).astype(str)  # end of a deletion that starts at base 1
    inserted_bases = alt.str[1:]  # ALT without its anchor base

    is_deletion = variant_type == "deletion"
    is_insertion = variant_type == "insertion"

    # Only records at position 1 whose anchor is NOT the first base need the special handling.
    left_anchored = alt.str[0] == ref.str[0]
    event_at_first_base = (pos == 1) & ~left_anchored

    conditions = [
        variant_type == "substitution",  # Substitution
        is_deletion & (ref_len == 2) & ~event_at_first_base,  # Single base deletion
        is_deletion & (ref_len > 2) & ~event_at_first_base,  # Multi base deletion
        is_deletion & (ref_len == 2) & event_at_first_base,  # Single base deletion starting at 1
        is_deletion & (ref_len > 2) & event_at_first_base,  # Multi base deletion starting at 1
        is_insertion & ~event_at_first_base,  # Insertion
        is_insertion & event_at_first_base,  # Insertion before base 1 (no HGVS flank available)
        variant_type == "delins",  # Delins
        variant_type == "duplication",  # Single base duplication
        variant_type == "inversion",  # Inversion
    ]

    choices = [
        "g." + pos_str + ref + ">" + alt,  # Substitution
        "g." + next_pos_str + "del",  # Single base deletion
        "g." + next_pos_str + "_" + last_ref_pos_str + "del",  # Multi base deletion
        "g.1del",  # Single base deletion starting at 1
        "g.1_" + deleted_len_str + "del",  # Multi base deletion starting at 1
        "g." + pos_str + "_" + next_pos_str + "ins" + inserted_bases,  # Insertion
        "g.unknown",  # Insertion before base 1
        "g." + pos_str + "_" + last_ref_pos_str + "delins" + alt,  # Delins
        "g." + pos_str + "dup",  # Single base duplication
        "g." + pos_str + "_" + last_ref_pos_str + "inv",  # Inversion
    ]

    # First matching condition wins; rows matching nothing fall back to "g.unknown".
    notation = np.select(conditions, choices, default="g.unknown")

    # Callers have always received POS as a string after this call; keep that contract.
    sample_vcf_df["POS"] = pos_str
    sample_vcf_df[var_column] = notation

    # Remove helper columns left by the variant-type step (and legacy scratch names, if any).
    sample_vcf_df.drop(columns=["REF_len", "ALT_len", "ALT_RC", "start_POS_deletion", "start_POS_deletion_starting_at_1", "end_POS_for_multibase_deletion_and_delins_and_inversion", "end_POS_for_multibase_deletion_starting_at_1", "end_POS_for_insertion", "ALT_first_base_trimmed", "ALT_last_base_trimmed"], inplace=True, errors="ignore")

In [6]:
# Input VCF and the column names used for the renamed / derived columns.
sample_vcf = "vcf_testing.vcf"  # "/Users/joeyrich/Downloads/sample.vcf"
seq_id_column = "chromosome"
var_id_column = "variant_id"
var_column = "mutation_genome"

# Collect the renames up front: CHROM always, ID only when an id column name is configured.
column_renames = {"CHROM": seq_id_column}
if var_id_column:
    column_renames["ID"] = var_id_column

# sample_vcf_df['CHROM'] = sample_vcf_df['CHROM'].astype(str)
sample_vcf_df = vcf_to_dataframe(
    sample_vcf,
    additional_columns=False,
    explode_alt=True,
    filter_empty_alt=True,
).rename(columns=column_renames)

# Both helpers annotate the frame in place: first the variant class, then the g. notation.
add_variant_type_column_to_vcf_derived_df(sample_vcf_df)
add_variant_column_to_vcf_derived_df(sample_vcf_df, var_column=var_column)

[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse_filter] FILTER 'FAIL' is not defined in the header


In [7]:
# Keep an independent (deep) copy for the later duplication pass, then display the annotated frame.
mutations = sample_vcf_df.copy(deep=True)
sample_vcf_df

Unnamed: 0,chromosome,POS,variant_id,REF,ALT,variant_type,mutation_genome
0,19,11,single_sub_mid;g.11A>C,A,C,substitution,g.11A>C
1,19,22,"exploded_sub_mid;g.22G>A,g.22G>C,g.22G>T",G,A,substitution,g.22G>A
2,19,22,"exploded_sub_mid;g.22G>A,g.22G>C,g.22G>T",G,C,substitution,g.22G>C
3,19,22,"exploded_sub_mid;g.22G>A,g.22G>C,g.22G>T",G,T,substitution,g.22G>T
4,19,1,sub_begin;g.1T>A,T,A,substitution,g.1T>A
5,19,34,"exploded_ins_mid;g.34_35insA,g.34_35insGAC",G,GA,insertion,g.34_35insA
6,19,34,"exploded_ins_mid;g.34_35insA,g.34_35insGAC",G,GAC,insertion,g.34_35insAC
7,19,1,ins_begin;None,T,AGT,insertion,g.unknown
8,19,40,single_del_mid;g.41del,AG,A,deletion,g.41del
9,19,40,multi_del_mid;g.41_44del,AGCAT,A,deletion,g.41_44del


In [None]:
seq_dict = {"19": "TCATCGAACTAGCAGCTCGACGACGCACATCGTGGATCCAGCATCAGCCCCCTCTCGAGTCGCATCGCATCG"}

# Reclassify insertions that merely repeat the reference bases immediately before them as
# duplications. All intermediates live in local Series, so `mutations` never gains scratch
# columns, and the cell also works when no row qualifies as a candidate.
wt_sequence_full = mutations[seq_id_column].map(seq_dict)
alt_len = mutations["ALT"].str.len()
inserted_bases = mutations["ALT"].str[1:]  # ALT without its leading anchor base

# Step 1: candidates are insertions whose last ALT base equals REF. A duplicated segment that
# ends at POS must end with the REF base, so this is a cheap prefilter.
mask = (mutations["variant_type"] == "insertion") & (mutations["ALT"].str[-1] == mutations["REF"])

# Step 2: 1-based span of reference bases that would have to equal the inserted bases
# (same length as the inserted bases, ending at POS). POS may already be a string here.
end_pos = mutations.loc[mask, "POS"].astype(int)
start_pos = (end_pos - alt_len[mask] + 2).astype(int)

# Step 3: compare the reference slice with the inserted bases, candidate by candidate.
# sequence[start - 1:end] converts the 1-based inclusive span to a Python slice. Spans that would
# start before base 1 are skipped instead of letting a negative index wrap around.
duplicated_rows = [
    row_label
    for row_label, sequence, start, end, inserted in zip(
        start_pos.index, wt_sequence_full[mask], start_pos, end_pos, inserted_bases[mask]
    )
    if pd.notna(sequence) and start >= 1 and sequence[start - 1:end] == inserted
]

# Step 4: boolean mask over the whole frame for the confirmed duplications.
compare_mask = mask & mutations.index.isin(duplicated_rows)

# Step 5: update variant_type and the notation only for the confirmed rows.
confirmed = compare_mask[mask]
mutations.loc[compare_mask, "variant_type"] = "duplication"
mutations.loc[compare_mask, var_column] = "g." + start_pos[confirmed].astype(str) + "_" + end_pos[confirmed].astype(str) + "dup"