# Imports

In [1]:
import os
import subprocess
import varseek as vk

# Hyperparameters and Paths

In [None]:
# --- General settings ---
verbose = True

# Window size for varseek build (should be at least 1 less than kallisto k)
w = 54
# k-mer length handed to vk build and to `kb ref -k` for the mutation index
k = 59
# k-mer length for the standard (non-mutation) kb ref index
k_standard = 31
threads = 8
remove_Ns = True

# Strandedness for gget mutate and for building the kb index:
#   True  = strandedness matters, i.e., treat f and rc as 2 different sequences
#   False = strandedness does not matter, i.e., treat f and rc as the same sequence
strandedness = False

# --- vk build ---
cosmic_version = 100  # COSMIC version for gget cosmic
insertion_size_limit = None
# Uncomment to avoid being prompted for email/password in varseek build:
# os.environ['COSMIC_EMAIL'] = 'your_email'
# os.environ['COSMIC_PASSWORD'] = 'your_password'

# --- vk info ---
columns_to_include = "all"
# ensembl_grchNUMBER_releaseNUMBER or t2t - eg ensembl_grch37_release93
dlist_reference_source = "t2t"
near_splice_junction_threshold = 10
save_exploded_df = True

# --- vk filter ---
fasta_filters = [
    # filter out mutations which are a substring of the reference genome
    "dlist_substring-equal=none",
    # filter out mutations which pseudoaligned to human genome despite not truly aligning
    "pseudoaligned_to_human_reference_despite_not_truly_aligning-isnottrue",
    # TODO: erase eventually when d-listing is wanted.
    # filter out mutations which are capable of being d-listed (given that the substrings are filtered out above)
    "dlist-equal=none",
    # filter out mutations which overlap with other MCRSs in the reference
    "number_of_kmers_with_overlap_to_other_mcrs_items_in_mcrs_reference-max=999999",
    # filter out mutations which overlap with other MCRSs in the reference
    "number_of_mcrs_items_with_overlapping_kmers_in_mcrs_reference-max=999999",
    # filters out MCRSs with repeating single nucleotide - eg 6
    "longest_homopolymer_length-max=999999",
    # filters out MCRSs with repeating triplets - eg 0.2
    "triplet_complexity-min=0",
]

# --- kb ref ---
dlist = False







# Paths
out_dir_base = "/home/jrich/data/varseek_data_fresh"
run_name = "vk_build_pipeline_t2t_nov16"

# Directory prefixes shared by several inputs below. They are spelled out in full
# (not derived from out_dir_base) so each input stays independent of the output root.
_cosmic_dir = "/home/jrich/data/varseek_data_fresh/reference/cosmic/CancerMutationCensus_AllData_Tsv_v100_GRCh37"
_ensembl_dir = "/home/jrich/data/varseek_data_fresh/reference/ensembl_grch37_release93"

# vk build
# File path to a mutations csv/tsv file OR one of the supported databases (e.g. "cosmic_cmc")
mutations = f"{_cosmic_dir}/CancerMutationCensus_AllData_v100_GRCh37_mutation_workflow_with_cdna.csv"
# File path to a reference sequence fasta file OR, only if 'mutations' is one of the
# supported databases, one of "cds", "cdna", "genome", "cdna_and_genome" - sequences for gget mutate
sequences = f"{_ensembl_dir}/Homo_sapiens.GRCh37.cdna.all.fa"

# vk info
bowtie_path = "/home/jrich/opt/bowtie2-2.5.4/bowtie2-2.5.4-linux-x86_64"
mutations_csv = f"{_cosmic_dir}/CancerMutationCensus_AllData_v100_GRCh37_with_cdna.csv"
# The three reference files below must match the annotations used in the mutation df
reference_cdna_fasta = f"{_ensembl_dir}/Homo_sapiens.GRCh37.cdna.all.fa"
reference_genome_fasta = f"{_ensembl_dir}/Homo_sapiens.GRCh37.dna.primary_assembly.fa"
gtf_path = f"{_ensembl_dir}/Homo_sapiens.GRCh37.87.gtf"

# Automatic variable initializations based on provided hyperparameters and paths

In [None]:
# Validate the window / k-mer relationship first, so a bad configuration fails
# before any directory is created.
assert k >= w + 1, "k must be greater than or equal to w + 1"

# Output directories: one per run, plus a reference directory shared under out_dir_base
out_dir_notebook = os.path.join(out_dir_base, run_name)
reference_out_dir = os.path.join(out_dir_base, "reference")

os.makedirs(out_dir_base, exist_ok=True)
os.makedirs(out_dir_notebook, exist_ok=True)
os.makedirs(reference_out_dir, exist_ok=True)

# Values derived from the boolean hyperparameters
max_ambiguous_vk = 0 if remove_Ns else None  # passed to vk.build as max_ambiguous
merge_identical_rc = not strandedness  # passed to vk.build as merge_identical_rc

# vk build outputs
vk_build_mcrs_fa_path = os.path.join(out_dir_notebook, "mcrs.fa")
update_df_out = os.path.join(out_dir_notebook, "mutation_metadata_df.csv")

# vk info inputs/outputs
id_to_header_csv = os.path.join(out_dir_notebook, "id_to_header_mapping.csv")
mutation_metadata_df_out_path_vk_info = os.path.join(out_dir_notebook, "mutation_metadata_df_updated_vk_info.csv")
# The exploded dataframe path is only set when save_exploded_df is enabled
mutation_metadata_df_exploded_path = (
    os.path.join(out_dir_notebook, "mutation_metadata_df_updated_vk_info_exploded.csv")
    if save_exploded_df
    else None
)
dlist_fasta = os.path.join(out_dir_notebook, "dlist.fa")

# kb ref index outputs
mutation_index = os.path.join(out_dir_notebook, "mutation_reference.idx")
wt_mcrs_index = os.path.join(out_dir_notebook, "wt_mcrs_reference.idx")

# vk filter outputs
mcrs_fasta_vk_filter = os.path.join(out_dir_notebook, "mcrs_filtered.fa")
output_metadata_df_vk_filter = os.path.join(out_dir_notebook, "mutation_metadata_df_filtered.csv")
output_mutation_metadata_df_exploded_vk_filter = (
    os.path.join(out_dir_notebook, "mutation_metadata_df_updated_vk_info_exploded_filtered.csv")
    if save_exploded_df
    else None
)
dlist_fasta_vk_filter = os.path.join(out_dir_notebook, "dlist_filtered.fa")
wt_mcrs_fa_vk_filter = os.path.join(out_dir_notebook, "mcrs_wt_filtered.fa")
t2g_vk_filter = os.path.join(out_dir_notebook, "t2g_filtered.txt")
t2g_wt_vk_filter = os.path.join(out_dir_notebook, "t2g_wt_filtered.txt")
id_to_header_csv_vk_filter = os.path.join(out_dir_notebook, "id_to_header_mapping_filtered.csv")

# Value for `kb ref --d-list`: the filtered d-list fasta when d-listing is enabled,
# otherwise the literal string "None" (it is placed directly on the kb command line).
dlist_kb_argument = dlist_fasta_vk_filter if dlist else "None"

# vk build

In [None]:
# %%time
# Run varseek build with the inputs and settings configured above.
vk.build(
    # Inputs
    mutations=mutations,
    sequences=sequences,
    # COSMIC credentials are read from the environment; os.getenv gives None when unset
    cosmic_email=os.getenv("COSMIC_EMAIL"),
    cosmic_password=os.getenv("COSMIC_PASSWORD"),
    # Output locations and optional output files
    out=out_dir_notebook,
    reference_out=reference_out_dir,
    update_df=True,
    update_df_out=update_df_out,
    create_t2g=True,
    create_wt_mcrs_counterpart_fa=True,
    # Window / k-mer settings
    w=w,
    k=k,
    min_seq_len=k,
    insertion_size_limit=insertion_size_limit,
    max_ambiguous=max_ambiguous_vk,
    optimize_flanking_regions=True,
    remove_seqs_with_wt_kmers=True,
    # Merging of identical sequences (reverse-complement merging follows `strandedness`)
    merge_identical=True,
    merge_identical_rc=merge_identical_rc,
    # Logging
    verbose=verbose,
)

# vk info

In [None]:
# %%time
# Run varseek info on the outputs of vk build.
vk.info(
    # Inputs produced by vk build
    mutations=vk_build_mcrs_fa_path,
    updated_df=update_df_out,
    id_to_header_csv=id_to_header_csv,  # if none then assume no swapping occurred
    mutations_csv=mutations_csv,
    # Column names in the input dataframe
    mcrs_id_column="mcrs_id",
    mcrs_sequence_column="mutant_sequence",
    # If the input df has concatenated cdna and header MCRS's, this records whether
    # each one came from cdna or genome
    mcrs_source_column="mcrs_source",
    # If the input df has concatenated cdna and header MCRS's, these four columns give
    # a way of mapping from cdna to genome
    # TODO: implement these 4 column name arguments
    seqid_cdna_column="seq_ID",
    seqid_genome_column="chromosome",
    mutation_cdna_column="mutation",
    mutation_genome_column="mutation_genome",
    # Reference files
    gtf=gtf_path,  # for distance to nearest splice junction
    reference_cdna_fasta=reference_cdna_fasta,
    reference_genome_fasta=reference_genome_fasta,
    dlist_reference_source=dlist_reference_source,
    ref_prefix="index",
    bowtie_path=bowtie_path,
    # Output locations
    out_dir_notebook=out_dir_notebook,
    reference_out_dir=reference_out_dir,
    mutation_metadata_df_out_path=mutation_metadata_df_out_path_vk_info,
    save_exploded_df=save_exploded_df,
    # Analysis settings
    columns_to_include=columns_to_include,
    w=w,
    remove_Ns=remove_Ns,
    strandedness=strandedness,
    near_splice_junction_threshold=near_splice_junction_threshold,
    threads=threads,
    verbose=verbose,
)

# vk filter

In [None]:
# %%time
# Run varseek filter on the vk info outputs, applying the rules in `fasta_filters`.
vk.filter(
    # Inputs
    mutation_metadata_df_path=mutation_metadata_df_out_path_vk_info,
    mutation_metadata_df_exploded_path=mutation_metadata_df_exploded_path,
    dlist_fasta=dlist_fasta,
    id_to_header_csv=id_to_header_csv,
    # Filtered outputs
    output_mcrs_fasta=mcrs_fasta_vk_filter,
    output_metadata_df=output_metadata_df_vk_filter,
    output_mutation_metadata_df_exploded=output_mutation_metadata_df_exploded_vk_filter,
    output_dlist_fasta=dlist_fasta_vk_filter,
    output_wt_mcrs_fa=wt_mcrs_fa_vk_filter,
    output_id_to_header_csv=id_to_header_csv_vk_filter,
    create_t2g=True,
    output_t2g=t2g_vk_filter,
    output_t2g_wt=t2g_wt_vk_filter,
    # Filter rules and behaviour
    filters=fasta_filters,
    return_df=False,
    # Follow the notebook-level `verbose` setting, as vk.build and vk.info do,
    # instead of a hard-coded True
    verbose=verbose,
)

# kb ref

In [None]:
# %%time
# Build the kallisto index for the filtered MCRS fasta, unless it already exists.
if not os.path.exists(mutation_index):
    kb_ref_command = ["kb", "ref", "--workflow", "custom", "-t", str(threads), "-i", mutation_index, "--d-list", dlist_kb_argument, "-k", str(k), mcrs_fasta_vk_filter]
    subprocess.run(kb_ref_command, check=True)

# Build the wild-type counterpart index when the filtered WT fasta is available.
# The guard must test wt_mcrs_index (the file this command writes). Testing
# mutation_index here would always skip the step, because the block above has
# just created that file.
if os.path.exists(wt_mcrs_fa_vk_filter) and not os.path.exists(wt_mcrs_index):
    kb_ref_command = ["kb", "ref", "--workflow", "custom", "-t", str(threads), "-i", wt_mcrs_index, "--d-list", dlist_kb_argument, "-k", str(k), wt_mcrs_fa_vk_filter]
    subprocess.run(kb_ref_command, check=True)

# Optional: kb ref on T2T reference genome

In [1]:
# Self-contained cell: it re-declares its own imports and settings so it can be
# run without executing the rest of the notebook.
import os
import subprocess

from varseek.utils import download_t2t_reference_files

out_dir_base = "/home/jrich/data/varseek_data_fresh"
reference_out_dir = os.path.join(out_dir_base, "reference")
threads = 16
k_standard = 31

# Locations of the T2T reference files and of the kb index built from them
t2t_folder = f"{reference_out_dir}/T2T/GCF_009914755.1"
t2t_kb_folder = f"{t2t_folder}/kb_index"
standard_index = f"{t2t_kb_folder}/index.idx"
standard_t2g = f"{t2t_kb_folder}/t2g.txt"
standard_f1 = f"{t2t_kb_folder}/f1.fa"

# Only (re)build when the index or the transcript-to-gene mapping is missing
if not os.path.exists(standard_index) or not os.path.exists(standard_t2g):
    os.makedirs(t2t_kb_folder, exist_ok=True)
    t2t_genome = f"{t2t_folder}/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna"
    t2t_gtf = f"{t2t_folder}/genomic.gtf"

    # Download the reference files when either expected file is absent; the
    # returned paths replace the expected ones defined above
    if not os.path.exists(t2t_genome) or not os.path.exists(t2t_gtf):
        os.makedirs(t2t_folder, exist_ok=True)
        t2t_genome, t2t_cdna, t2t_gtf = download_t2t_reference_files(t2t_folder)

    # Pass the command as an argument list (no shell), so paths containing spaces
    # or shell metacharacters reach kb unchanged
    kb_ref_standard_command = [
        "kb", "ref",
        "-k", str(k_standard),
        "-i", standard_index,
        "-g", standard_t2g,
        "-f1", standard_f1,
        "-t", str(threads),
        t2t_genome,
        t2t_gtf,
    ]
    result = subprocess.run(kb_ref_standard_command, check=True)

[2024-11-18 11:24:46,790]    INFO [ref] Preparing /home/jrich/data/varseek_data_fresh/reference/T2T/GCF_009914755.1/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna, /home/jrich/data/varseek_data_fresh/reference/T2T/GCF_009914755.1/genomic.gtf
[2024-11-18 11:26:23,192]    INFO [ref] Splitting genome /home/jrich/data/varseek_data_fresh/reference/T2T/GCF_009914755.1/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna into cDNA at /home/jrich/Desktop/varseek/notebooks/tmp/tmp16l0fid_
[2024-11-18 11:27:46,491]    INFO [ref] Concatenating 1 cDNAs to /home/jrich/data/varseek_data_fresh/reference/T2T/GCF_009914755.1/kb_index/f1.fa
[2024-11-18 11:27:48,107]    INFO [ref] Creating transcript-to-gene mapping at /home/jrich/data/varseek_data_fresh/reference/T2T/GCF_009914755.1/kb_index/t2g.txt
[2024-11-18 11:27:50,286]    INFO [ref] Indexing /home/jrich/data/varseek_data_fresh/reference/T2T/GCF_009914755.1/kb_index/f1.fa to /home/jrich/data/varseek_data_fresh/reference/T2T/GCF_009914755.1/kb_index/index.idx
