# Imports

In [1]:
import os
import subprocess

import varseek as vk
from varseek.utils import trim_edges_off_reads_fastq_list, run_fastqc_and_multiqc, replace_low_quality_bases_with_N_list, split_reads_by_N_list



# Hyperparameters

In [2]:
# Notebook configuration: every tunable value used by the cells below lives here.

# k-mer settings and runtime
w = 54  # window size for varseek build (should be at least 1 less than kallisto k)
k = 55  # k passed to kb count via -k; also used as minimum_length in vk.fastqpp
threads = 32
strand = "unstranded"  # "forward", "reverse", or "unstranded"

# fastq cleanup preceding kb count
trim_edges_off_reads = False
minimum_base_quality_trim_reads = 13
qualified_quality_phred = None
unqualified_percent_limit = None
n_base_limit = None
length_required = None  # NOTE(review): not passed to vk.fastqpp below (it receives minimum_length=k) - confirm whether this is still needed

replace_low_quality_bases_with_N = False
minimum_base_quality_replace_with_N = 13
split_reads_by_Ns = False  # defined once here; a second assignment further down used to silently override this value
run_fastqc = False

# kb count
assay = "bulk"  # "bulk" or "sc"  # TODO: implement
parity = "single"  # single or paired

# vk clean
minimum_count_filter = None  # TODO: still need to tune this
use_binary_matrix = False
drop_zero_columns = False
filter_cells_by_min_counts = True  # True for auto-filter, numeric for set filter, None/False otherwise
filter_cells_by_min_genes = 200
filter_genes_by_min_cells = 3
filter_cells_by_max_mt_content = False
doublet_detection = True
remove_doublets = False
do_cpm_normalization = True
dlist_file = None
mutation_metadata_df_columns = None  # TODO: replace

# Paths
out_dir_base = "/home/jrich/data/varseek_data_fresh"
rnaseq_fastq_files = ""  # pass as a single string or as a list of strings (if paired-end, then pass each pair sequentially with R1 preceding R2)
seqtk = "seqtk"
mutation_metadata_df_path = None  # TODO: replace
standard_index = ""
standard_t2g = ""


In [3]:
# Validate the k-mer settings first so a bad configuration fails before any directory is created.
assert k >= w + 1, "k must be greater than or equal to w + 1"

# Derived locations: everything for this notebook lives under out_dir_base.
out_dir_notebook = os.path.join(out_dir_base, "vk_build_pipeline_t2t")
reference_out_dir = os.path.join(out_dir_base, "reference")
mutation_index = os.path.join(out_dir_notebook, "mutation_reference.idx")
kb_count_out = os.path.join(out_dir_notebook, "kb_count_out")
kb_count_out_standard_index = os.path.join(out_dir_notebook, "kb_count_out_standard")
t2g_vk_filter = os.path.join(out_dir_notebook, "t2g_filtered.txt")
vk_summarize_output_dir = os.path.join(out_dir_notebook, "vk_summarize")
fastqc_out_dir = os.path.join(out_dir_notebook, "fastqc_out")

# Create every output directory up front (no-op for those that already exist).
for output_dir in (
    out_dir_base,
    out_dir_notebook,
    reference_out_dir,
    kb_count_out,
    kb_count_out_standard_index,
    vk_summarize_output_dir,
    fastqc_out_dir,
):
    os.makedirs(output_dir, exist_ok=True)

# Downstream cells expect a list of fastq paths; wrap a single path (str or path-like) in a list.
if isinstance(rnaseq_fastq_files, (str, os.PathLike)):
    rnaseq_fastq_files = [os.fspath(rnaseq_fastq_files)]

# AnnData paths under each kb count output directory; passed to vk.clean in a later cell.
adata_path = os.path.join(kb_count_out, "counts_unfiltered", "adata.h5ad")
adata_path_normal_genome = os.path.join(kb_count_out_standard_index, "counts_unfiltered", "adata.h5ad")

In [None]:
# Preprocess the input fastq files with vk.fastqpp; the returned value is what
# the kb count cell below appends to its command line.
rnaseq_fastq_files_final = vk.fastqpp(
    rnaseq_fastq_files_list=rnaseq_fastq_files,
    parity=parity,
    # stage toggles
    trim_edges_off_reads=trim_edges_off_reads,
    run_fastqc=run_fastqc,
    replace_low_quality_bases_with_N=replace_low_quality_bases_with_N,
    split_reads_by_Ns=split_reads_by_Ns,
    # thresholds
    minimum_base_quality_trim_reads=minimum_base_quality_trim_reads,
    qualified_quality_phred=qualified_quality_phred,
    unqualified_percent_limit=unqualified_percent_limit,
    n_base_limit=n_base_limit,
    minimum_length=k,
    minimum_base_quality_replace_with_N=minimum_base_quality_replace_with_N,
    # external tools and outputs
    fastp="fastp",
    seqtk=seqtk,
    fastqc_out_dir=fastqc_out_dir,
    delete_intermediate_files=False,
)

# if trim_edges_off_reads:
#     rnaseq_fastq_files_quality_controlled = trim_edges_off_reads_fastq_list(rnaseq_fastq_files=rnaseq_fastq_files, parity=parity, minimum_base_quality_trim_reads=minimum_base_quality_trim_reads, qualified_quality_phred=qualified_quality_phred, unqualified_percent_limit=unqualified_percent_limit, n_base_limit=n_base_limit, length_required=length_required)
# else:
#     rnaseq_fastq_files_quality_controlled = rnaseq_fastq_files

# if run_fastqc:
#     run_fastqc_and_multiqc(rnaseq_fastq_files_quality_controlled, fastqc_out_dir)

# if replace_low_quality_bases_with_N:
#     rnaseq_fastq_files_replace_low_quality_bases_with_N = replace_low_quality_bases_with_N_list(rnaseq_fastq_files_quality_controlled=rnaseq_fastq_files_quality_controlled, minimum_base_quality_replace_with_N=minimum_base_quality_replace_with_N, seqtk=seqtk)
# else:
#     rnaseq_fastq_files_replace_low_quality_bases_with_N = rnaseq_fastq_files_quality_controlled

# if split_reads_by_Ns:
#     rnaseq_fastq_files_final = split_reads_by_N_list(rnaseq_fastq_files_replace_low_quality_bases_with_N, minimum_sequence_length=k)
# else:
#     rnaseq_fastq_files_final = rnaseq_fastq_files_replace_low_quality_bases_with_N

# kb count

In [None]:
# TODO: incorporate assay bulk vs sc in here
# Run kb count against the mutation index only when its output directory is
# missing or empty, so re-running the notebook does not repeat the step.
if not os.path.exists(kb_count_out) or not os.listdir(kb_count_out):
    kb_count_command = [
        "kb", "count",
        "-t", str(threads),
        "-k", str(k),
        "-i", mutation_index,
        "-g", t2g_vk_filter,
        "-x", "bulk",  # technology is still hard-coded (see TODO above)
        "--num",
        "--h5ad",
        "--parity", parity,  # follows the notebook setting instead of a hard-coded "single"
        "--strand", strand,
        "-o", kb_count_out,
        *rnaseq_fastq_files_final,
    ]
    subprocess.run(kb_count_command, check=True)  # check=True: raise if kb exits non-zero

# Optional: kb ref and kb count on the standard (normal) genome

In [None]:
# Create the standard index / t2g with `kb ref -d human` only if either file is missing.
if not os.path.exists(standard_index) or not os.path.exists(standard_t2g):
    kb_ref_command = [
        "kb", "ref",
        "-t", str(threads),
        "-i", standard_index,
        "-g", standard_t2g,
        "-d", "human",
    ]
    subprocess.run(kb_ref_command, check=True)

# TODO: incorporate assay bulk vs sc in here
# Run kb count against the standard index only when its output directory is missing or empty.
if not os.path.exists(kb_count_out_standard_index) or not os.listdir(kb_count_out_standard_index):
    kb_count_standard_index_command = [
        "kb", "count",
        "-t", str(threads),
        "-k", str(k),
        "-i", standard_index,
        "-g", standard_t2g,
        "-x", "bulk",
        "--h5ad",
        "--parity", parity,
        "--strand", strand,
        "-o", kb_count_out_standard_index,
        # Uses the output of vk.fastqpp: the previously referenced
        # rnaseq_fastq_files_quality_controlled is never assigned in this
        # notebook and raised NameError on a fresh kernel.
        *rnaseq_fastq_files_final,
    ]
    subprocess.run(kb_count_standard_index_command, check=True)

# vk clean

In [None]:
# Run vk.clean on the mutation-index AnnData with the settings from the
# hyperparameter cell; the return value is handed to vk.summarize afterwards.
adata_path_clean = vk.clean(
    adata_path,
    output_figures_dir=f"{out_dir_notebook}/vk_clean_figures",
    mutation_metadata_df=mutation_metadata_df_path,
    mutation_metadata_df_columns=mutation_metadata_df_columns,
    minimum_count_filter=minimum_count_filter,
    use_binary_matrix=use_binary_matrix,
    drop_zero_columns=drop_zero_columns,
    filter_cells_by_min_counts=filter_cells_by_min_counts,
    filter_cells_by_min_genes=filter_cells_by_min_genes,
    filter_genes_by_min_cells=filter_genes_by_min_cells,
    filter_cells_by_max_mt_content=filter_cells_by_max_mt_content,
    doublet_detection=doublet_detection,
    remove_doublets=remove_doublets,
    do_cpm_normalization=do_cpm_normalization,
    adata_path_normal_genome=adata_path_normal_genome,
    mcrs_id_column="mcrs_id",
    verbose=False,
)

# vk summarize

In [None]:
# Summarize the cleaned result into vk_summarize_output_dir.
vk.summarize(
    adata_path_clean,
    assay=assay,
    output_dir=vk_summarize_output_dir,
    overwrite=False,
    top_values=10,
)