In [None]:
import os
import subprocess
import gget
import pandas as pd

import varseek as vk

In [None]:
# --- User configuration -------------------------------------------------------
# data_directory is the parent of the current working directory. If this notebook
# resides in varseek/notebooks/0_data_download.ipynb and is run from there, this
# resolves to the varseek directory.
data_directory = os.path.dirname(os.path.abspath(""))
out_dir = os.path.join(data_directory, "data", "vk_build_pipeline_notebook1")
reference_out_dir = os.path.join(data_directory, "data", "reference")

w = 54  # window size for varseek build (should be 1 less than kallisto k)
threads = 16

variants = "cosmic_cmc"  # file path to variants csv/tsv file OR one of the supported databases
# sequences: file path to reference sequence fasta file OR, only if 'variants' is in the
# supported databases, one of "cds", "cdna", "genome", "cdna_and_genome" - sequences for vk build
sequences = "cdna_and_genome"
cosmic_version = 100  # COSMIC version for gget cosmic
remove_Ns = True
# strandedness for vk build and the building of the kb index
# (True = strandedness matters, i.e., treat f and rc as 2 different sequences;
#  False = strandedness does not matter, i.e., treat f and rc as the same sequence)
strandedness = False
# Each entry must be its own list element: without a separating comma Python silently
# concatenates adjacent string literals into a single (invalid) filter string.
fasta_filters = [
    "substring_alignment_to_reference-equal=none",  # filter out variants which are a substring of the reference genome
    "pseudoaligned_to_reference_despite_not_truly_aligning-istrue",  # filter out variants which pseudoaligned to human genome despite not truly aligning
    "alignment_to_reference-equal=none",  # filter out variants which are capable of being d-listed (given that I filter out the substrings above)
    "number_of_kmers_with_overlap_to_other_VCRSs-max=999999",  # filter out variants which overlap with other VCRSs in the reference
    "number_of_other_VCRSs_with_overlapping_kmers-max=999999",  # filter out variants which overlap with other VCRSs in the reference
]

# os.environ['COSMIC_EMAIL'] = 'your_email'  # to avoid being prompted for email in varseek build
# os.environ['COSMIC_PASSWORD'] = 'your_password'  # to avoid being prompted for password in varseek build

dlist_reference_source = "ensembl_grch37_release93"  # ensembl_grchNUMBER_releaseNUMBER or t2t

In [None]:
# --- Top-level output locations -------------------------------------------------
# NOTE(review): reference_out_dir is reassigned here, so the value chosen in the
# configuration cell is replaced by a folder inside out_dir.
out_dir_notebook = os.path.join(out_dir, "notebook_1_newheaders")
reference_out_dir = os.path.join(out_dir, "reference")

os.makedirs(out_dir, exist_ok=True)
os.makedirs(out_dir_notebook, exist_ok=True)
os.makedirs(reference_out_dir, exist_ok=True)

# --- Settings derived from remove_Ns ---------------------------------------------
# Tuple order: (max_ambiguous_kv, N_penalty, max_Ns_per_read_length)
max_ambiguous_kv, N_penalty, max_Ns_per_read_length = (0, 1, 0) if remove_Ns else (None, 0, 0.1)

# --- Settings derived from strandedness ------------------------------------------
# Identical reverse complements are merged exactly when strandedness is off.
merge_identical_rc = not strandedness
bowtie_strandedness = "--norc" if strandedness else ""  # could do --nofw as well
kb_strandedness = "--strand forward" if strandedness else ""

# --- Per-reference build folders and VCRS fasta paths -----------------------------
sequences_total = sequences
sequences_cdna = "cdna"
sequences_genome = "genome"

out_dir_kv_build_cdna = os.path.join(out_dir_notebook, "kv_cdna")
out_dir_kv_build_genome = os.path.join(out_dir_notebook, "kv_genome")
out_dir_kv_build = os.path.join(out_dir_notebook, f"kv_{sequences}")

kv_build_vcrs_fa_path_cdna = os.path.join(out_dir_kv_build_cdna, "vcrs_cdna.fa")
kv_build_vcrs_fa_path_genome = os.path.join(out_dir_kv_build_genome, "vcrs_genome.fa")
kv_build_vcrs_fa_path = os.path.join(out_dir_kv_build, "vcrs.fa")

os.makedirs(out_dir_kv_build_cdna, exist_ok=True)
os.makedirs(out_dir_kv_build_genome, exist_ok=True)
os.makedirs(out_dir_kv_build, exist_ok=True)

# Output of varseek.
# NOTE(review): this path carries no file extension — confirm it matches the file varseek writes.
cosmic_csv = os.path.join(
    reference_out_dir,
    "cosmic",
    f"CancerMutationCensus_AllData_v{cosmic_version}_GRCh37_gget_mutate_with_cdna",
)

k = w + 1  # k-mer length is one more than the window size

mutation_metadata_df_path = os.path.join(out_dir_kv_build, "mutation_metadata_df.csv")

In [None]:
# Run the varseek build against the transcriptome (cDNA) with
# save_variants_updated_csv=True. The intent is also to run with
# merge_headers=False, which is not yet passed in this call.
# NOTE(review): the bare `...` below is a placeholder for the remaining arguments.
# Python rejects a positional argument after keyword arguments, so this cell will
# not run until the placeholder is replaced with real keyword arguments.
vk.build(
    sequences=sequences_cdna,
    variants=variants,
    out=out_dir_kv_build_cdna,
    reference_out_dir=reference_out_dir,
    save_variants_updated_csv=True,
    ...
)

# Load the updated variants table from the cDNA build folder (expected to be
# written by the build above — TODO confirm file name) and tag every row as 'cdna'.
df_cdna = pd.read_csv(os.path.join(out_dir_kv_build_cdna, "mutation_metadata_updated.csv"))
df_cdna['variant_source'] = 'cdna'

In [None]:
# Run the varseek build against the genome with save_variants_updated_csv=True.
# The intent is also to run with merge_headers=False, which is not yet passed
# in this call.
# NOTE(review): the bare `...` below is a placeholder for the remaining arguments.
# Python rejects a positional argument after keyword arguments, so this cell will
# not run until the placeholder is replaced with real keyword arguments.
vk.build(
    sequences=sequences_genome,
    variants=cosmic_csv,  # generated above — presumably by the cDNA build; TODO confirm (cosmic_csv as defined has no file extension)
    out=out_dir_kv_build_genome,
    reference_out_dir=reference_out_dir,
    cosmic_email = os.getenv('COSMIC_EMAIL'),  # None when the environment variable is unset
    cosmic_password = os.getenv('COSMIC_PASSWORD'),  # None when the environment variable is unset
    seq_id_column = "chromosome",
    var_column = "mutation_genome",
    save_variants_updated_csv=True,
    ...
)

# Load the updated variants table from the genome build folder (expected to be
# written by the build above — TODO confirm file name) and tag every row as 'genome'.
df_genome = pd.read_csv(os.path.join(out_dir_kv_build_genome, "mutation_metadata_updated.csv"))
df_genome['variant_source'] = 'genome'

In [None]:
# Stack the cDNA rows on top of the genome rows, renumbering the index from 0.
df_combined = pd.concat(
    [df_cdna, df_genome],
    axis=0,
    ignore_index=True,
)

In [None]:
# If desired, merge headers at this stage: come up with new vcrs_ids, write a new fasta and a new id:header dict, and, for rows with cdna and genome in the same header, make sure df_combined['variant_source'] = 'mixed'.

In [None]:
# Run vk info with the columns vcrs_id, vcrs_sequence, variant_source, and vcrs_header.
# - If and only if headers were not merged, also include seqID, mutation, chromosome, and mutation_genome (for making header_cdna and header_genome within the function).

In [None]:
# - No need to filter out genome entries that are equal to cdna in vk info.
# - No need to label any entries as "unsplicedENST…".
#     - Change the helper functions accordingly.