In [None]:
import os
import varseek as vk

In [None]:
# --- Required inputs ---
# Path to a mutations csv/tsv file, OR the name of a supported database (e.g. "cosmic_cmc").
mutations = "cosmic_cmc"
# Reference sequences for vk build: path to a fasta file, OR - only when `mutations` names a
# supported database - one of "cds", "cdna", "genome", "cdna_and_genome".
sequences = "cdna"
# FASTQ input: a single string or a list of strings (for paired-end data, list each pair
# sequentially with R1 preceding R2).
rnaseq_fastq_files = ""

# --- General ---
verbose = True
threads = 4
k = 59
# Window size for varseek build (should be at least 1 less than kallisto k).
w = 54

# --- vk ref ---
# Download the vk ref index and t2g.
download = False
filters = (
    # filter out mutations which are a substring of the reference genome
    "dlist_substring:equal=none",
    # filter out mutations which pseudoaligned to human genome despite not truly aligning
    "pseudoaligned_to_human_reference_despite_not_truly_aligning:is_not_true",
    # filter out mutations which are capable of being d-listed (given that the substrings
    # are already filtered out above)
    # TODO: erase this filter eventually, once d-listing is wanted
    "dlist:equal=none",
    # filters out MCRSs with repeating single nucleotide - 99.99 keeps the bottom 99.99%
    # (fraction 0.9999), i.e. filters out the top 0.01%
    "longest_homopolymer_length:bottom_percent=99.99",
    # filters out MCRSs with repeating triplets - 99.9 keeps the top 99.9%
    # (fraction 0.999), i.e. filters out the bottom 0.1%
    "triplet_complexity:top_percent=99.9",
)
# COSMIC version for gget cosmic.
cosmic_version = 100
# Uncomment to avoid being prompted for COSMIC credentials in varseek build:
# os.environ['COSMIC_EMAIL'] = 'your_email'
# os.environ['COSMIC_PASSWORD'] = 'your_password'
dlist_reference_source = "T2T"
# Path to a dlist fasta file, or the string "None" (including the quotes).
dlist = "None"

# --- vk count ---
# One of "forward", "reverse", or "unstranded".
strand = "unstranded"
# "bulk" or "sc".  TODO: implement
technology = "bulk"
# "single" or "paired".
parity = "single"
min_counts = 2
# True for auto-filter, numeric for a set filter, None/False otherwise.
filter_cells_by_min_counts = True
filter_cells_by_min_genes = 200
filter_genes_by_min_cells = 3

# --- Paths ---
# Parent of the current working directory: when the notebook is run from varseek/notebooks,
# this resolves to the varseek directory.
data_directory = os.path.dirname(os.path.abspath(""))
out_dir = os.path.join(data_directory, "data", "vk_build_pipeline_notebook1")
reference_out_dir = os.path.join(data_directory, "data", "reference")

In [None]:
# Run vk.ref with the settings from the configuration cell. The result is kept so that
# its "index" and "t2g" entries can be read below.
vk_ref_output_dict = vk.ref(
    # general
    out=out_dir,
    reference_out_dir=reference_out_dir,
    download=download,
    threads=threads,
    verbose=verbose,
    # build
    mutations=mutations,
    sequences=sequences,
    w=w,
    k=k,
    cosmic_version=cosmic_version,
    # info
    dlist_reference_source=dlist_reference_source,
    # filter
    filters=filters,
    # kb ref
    dlist=dlist,
)

# Keep the two entries that the later cells print and pass to vk.count.
index = vk_ref_output_dict["index"]
t2g = vk_ref_output_dict["t2g"]

In [None]:
# Report where the index and t2g produced by vk.ref can be found (one line each).
print(
    f"Find index at {index}",
    f"Find t2g at {t2g}",
    sep="\n",
)

In [None]:
# Run vk.count on the FASTQ input using the index and t2g obtained from vk.ref.
# No fastqpp- or summarize-specific arguments are passed here.
vk_count_output_dict = vk.count(
    # general
    out=out_dir,
    reference_out_dir=reference_out_dir,
    threads=threads,
    verbose=verbose,
    # kb count
    rnaseq_fastq_files=rnaseq_fastq_files,
    index=index,
    t2g=t2g,
    technology=technology,
    strand=strand,
    parity=parity,
    # clean
    min_counts=min_counts,
    filter_cells_by_min_counts=filter_cells_by_min_counts,
    filter_cells_by_min_genes=filter_cells_by_min_genes,
    filter_genes_by_min_cells=filter_genes_by_min_cells,
)

In [None]:
# Report the two vk.count output locations read from its result (one line each).
print(
    f"Find summarized results in {vk_count_output_dict['vk_summarize_output_dir']}",
    f"Find the processed adata object for further analysis in {vk_count_output_dict['adata_path']}",
    sep="\n",
)