In [None]:
import os
import varseek as vk
import scanpy as sc
import numpy as np
import pandas as pd
import json
import scipy.sparse as sp
from sklearn.metrics import adjusted_rand_score
from anndata import concat

from varseek.constants import entex_to_ccle_map
from varseek.utils import plot_items_descending_order, plot_scree, plot_loading_contributions, find_resolution_for_target_clusters, plot_contingency_table, plot_knn_tissue_frequencies, plot_ascending_bar_plot_of_cluster_distances, plot_jaccard_bar_plot, create_mutated_gene_count_matrix_from_mutation_count_matrix, safe_literal_eval

In [None]:
import os
import subprocess

# ---- kb count / read preprocessing parameters ----
threads = 32  # passed to `kb count -t`
k = 59  # passed to `kb count -k` and used as the minimum read length in vk.fastqpp
split_reads_by_Ns_and_low_quality_bases = False

# ---- count-matrix options ----
matrix_type = "mutation"  # mutation or gene
sum_strategy = "total_reads"  # only if matrix_type is gene
merge_strategy = "all"  # only if matrix_type is gene
minimum_count_filter = 2  # anything below this will be set to 0
use_binary_matrix = False
do_cpm_normalization = False

# ---- variant (varseek) reference ----
mutation_index = "/home/jrich/data/varseek_data_fresh/vk_build_pipeline_t2t_nov16/mutation_reference.idx"
t2g_vk = "/home/jrich/data/varseek_data_fresh/vk_build_pipeline_t2t_nov16/t2g_filtered.txt"
mutation_metadata_df_path = "/home/jrich/data/varseek_data_fresh/vk_build_pipeline_t2t_nov16/variants_updated_filtered.csv"
mutation_metadata_df_columns = ["vcrs_id", "gene_name", "header_with_gene_name"]  #* change usecols as desired

# ---- standard reference (T2T) ----
standard_index = "/home/jrich/data/varseek_data/reference/T2T/GCF_009914755.1/kb_index/index.idx"
standard_t2g = "/home/jrich/data/varseek_data/reference/T2T/GCF_009914755.1/kb_index/t2g.txt"

# ---- ENTEx inputs ----
# base_dir is walked as <base_dir>/<tissue>/<sample>/pair{1,2}; outputs are written next to the samples.
base_dir = "/home/jrich/data/varseek_data/sequencing/bulk/entex"
# NOTE(review): the sample sheet lives under "entex_lung" while base_dir is "entex" - confirm this is intended.
entex_csv = "/home/jrich/data/varseek_data/sequencing/bulk/entex_lung/entex_df.csv"

In [None]:
# Output locations for the combined matrix and its tissue-grouped counterpart.
# NOTE(review): the file names say "ccle" although they are written under the ENTEx base_dir - confirm.
adata_out_path = f"{base_dir}/adata_combined_ccle_rnaseq_{matrix_type}.h5ad"
adata_grouped_tissue_out_path = f"{base_dir}/adata_combined_ccle_rnaseq_{matrix_type}_grouped_tissue.h5ad"

# Per-variant annotations; only the columns listed in the config cell are loaded.
mutation_metadata_df = pd.read_csv(mutation_metadata_df_path, usecols=mutation_metadata_df_columns)

# Sample sheet, with each ENTEx tissue translated through entex_to_ccle_map.
# Tissues absent from the map end up as NaN.
entex_df = pd.read_csv(entex_csv).assign(
    ccle_primary_disease=lambda sheet: sheet['tissue'].map(entex_to_ccle_map).fillna(np.nan)
)

In [None]:
def _kb_output_missing(out_dir):
    """Return True when `out_dir` does not exist or is empty, i.e. `kb count` still has to run."""
    return not os.path.exists(out_dir) or len(os.listdir(out_dir)) == 0


def _build_kb_count_command(index, t2g, parity, out_dir, fastqs, extra_flags=()):
    """Assemble the `kb count` argument list for one bulk sample.

    Args:
        index: path given to `-i`.
        t2g: path given to `-g`.
        parity: value given to `--parity`.
        out_dir: output directory given to `-o`.
        fastqs: FASTQ paths appended after the options.
        extra_flags: extra flags inserted right after `-x bulk` (e.g. `("--num",)`).

    Returns:
        list[str] suitable for `subprocess.run`.
    """
    return [
        "kb",
        "count",
        "-t",
        str(threads),
        "-k",
        str(k),
        "-i",
        index,
        "-g",
        t2g,
        "-x",
        "bulk",
        *extra_flags,
        "--h5ad",
        "--parity",
        parity,
        "--strand",
        "unstranded",
        "-o",
        out_dir,
        *fastqs,
    ]


adata_list = []

# Walk <base_dir>/<tissue>/<sample>; sorted() makes the sample order (and therefore the
# row order of the combined matrix) reproducible across runs and file systems.
for tissue_dir in sorted(os.listdir(base_dir)):
    tissue_path = os.path.join(base_dir, tissue_dir)

    # Skip plain files (e.g. previously written .h5ad outputs)
    if not os.path.isdir(tissue_path):
        continue

    for sample_dir in sorted(os.listdir(tissue_path)):
        sample_path = os.path.join(tissue_path, sample_dir)

        if not os.path.isdir(sample_path):
            continue

        pair1_path = os.path.join(sample_path, "pair1")
        pair2_path = os.path.join(sample_path, "pair2")

        # Defined for every sample *before* any branching, so the loading code below can
        # never pick up paths left over from the previous iteration.
        kb_count_out_standard = f"{sample_path}/kb_count_standard"
        kb_count_out_mutant = f"{sample_path}/kb_count_mutant"

        # Each run is guarded by its OWN output directory.
        run_mutant = _kb_output_missing(kb_count_out_mutant)
        run_standard = _kb_output_missing(kb_count_out_standard)

        has_fastqs = os.path.isdir(pair1_path) and os.path.isdir(pair2_path)

        # Preprocess reads only when at least one kb count run actually needs them.
        if has_fastqs and (run_mutant or run_standard):
            # Get the full paths of the FASTQ files (assuming only one file in each folder)
            fastq_files = [os.path.join(pair1_path, f) for f in os.listdir(pair1_path) if f.endswith(".fastq.gz")]
            fastq_files_pair2 = [os.path.join(pair2_path, f) for f in os.listdir(pair2_path) if f.endswith(".fastq.gz")]
            fastq_files.extend(fastq_files_pair2)

            # TODO: replace with vk count
            rnaseq_fastq_files_list_dict = vk.fastqpp(
                fastq_files,
                trim_edges_off_reads=True,
                run_fastqc=False,
                split_reads_by_Ns_and_low_quality_bases=split_reads_by_Ns_and_low_quality_bases,
                parity="paired",
                fastqc_out_dir=".",
                minimum_base_quality_trim_reads=13,
                qualified_quality_phred=0,
                unqualified_percent_limit=100,
                n_base_limit=None,
                minimum_length=k,
                minimum_base_quality_replace_with_N=13,
                fastp="fastp",
                seqtk="seqtk",
                delete_intermediate_files=True,
            )

            if run_mutant:
                # NOTE(review): the variant index is queried with --parity single on the
                # "final" files although the reads are paired - presumably intentional; confirm.
                kb_count_command = _build_kb_count_command(
                    mutation_index,
                    t2g_vk,
                    "single",
                    kb_count_out_mutant,
                    rnaseq_fastq_files_list_dict["final"],
                    extra_flags=("--num",),
                )
                subprocess.run(kb_count_command, check=True)

            if run_standard:
                # `--strand` must be unstranded/forward/reverse; "single" is not a strand value.
                kb_count_standard_index_command = _build_kb_count_command(
                    standard_index,
                    standard_t2g,
                    "paired",
                    kb_count_out_standard,
                    rnaseq_fastq_files_list_dict["trimmed"],
                )
                subprocess.run(kb_count_standard_index_command, check=True)

        adata_path = f"{kb_count_out_mutant}/counts_unfiltered/adata.h5ad"
        adata_output_path = f"{kb_count_out_mutant}/counts_unfiltered/adata_cleaned.h5ad"

        adata_path_normal_genome = f"{kb_count_out_standard}/counts_unfiltered/adata.h5ad"

        # A directory without FASTQs and without a cached matrix is not a usable sample:
        # skip it instead of crashing or loading another sample's data under this name.
        if not os.path.isfile(adata_path):
            print(f"Skipping {sample_path}: no variant count matrix found at {adata_path}")
            continue

        adata_mutation = sc.read_h5ad(adata_path)
        adata_mutation.obs_names = [sample_dir]
        adata_mutation.obs['experiment_id'] = sample_dir

        adata_mutation = vk.varseek_clean.clean(adata_mutation, adata_output_path = adata_output_path, minimum_count_filter = minimum_count_filter, use_binary_matrix = use_binary_matrix, technology = "bulk", do_cpm_normalization = do_cpm_normalization, split_reads_by_Ns_and_low_quality_bases = split_reads_by_Ns_and_low_quality_bases)

        # vcrs_count is recomputed on the combined matrix after concatenation,
        # so any per-sample copy of the column is dropped here.
        if 'vcrs_count' in adata_mutation.var.columns:
            adata_mutation.var.drop('vcrs_count', axis=1, inplace=True)

        adata_list.append(adata_mutation)

def _sorted_unique_gene_names(genes):
    """Return the sorted unique gene names for one variant.

    Rows with no match in the metadata (NaN/None after the left merge) yield an empty
    list instead of raising; a bare string is treated as a single gene name rather than
    being split into characters.
    """
    if isinstance(genes, str):
        return [genes]
    if isinstance(genes, (list, tuple, set)):
        return sorted(set(genes))
    return []


if not adata_list:
    raise ValueError(f"No sample count matrices were loaded from {base_dir}; nothing to concatenate.")

adata = concat(adata_list, join='outer')

# Attach the sample sheet. A column-based merge returns a fresh 0..n-1 index, so the
# sample names are put back before assigning (same precaution as for var below);
# otherwise obs_names would silently become "0", "1", ...
obs_merged = adata.obs.merge(
    entex_df,
    on='experiment_id',
    how='left'
)
obs_merged.index = adata.obs_names  # raises if the merge duplicated rows (non-unique experiment_id in entex_df)
adata.obs = obs_merged

# prevents the error "Don’t call _normalize_index with non-categorical/string names" when trying adata = adata[:, nonzero_gene_mask]
adata.obs.index = adata.obs.index.astype(str)

if use_binary_matrix:
    adata.X = (adata.X > 0).astype(int)

# Build the var table on a standalone DataFrame and assign it once at the end.
original_var_names = adata.var_names.copy()
var_annotated = adata.var.copy()
var_annotated['vcrs_id'] = var_annotated.index
# merge var with mutation_metadata_df by vcrs_id, then restore the variant names as index
var_annotated = var_annotated.merge(mutation_metadata_df, on='vcrs_id', how='left')
var_annotated.index = original_var_names

# Column sums over samples; np.asarray(...).ravel() handles both the np.matrix returned
# by sparse matrices and the 1-D array returned by dense ones.
var_annotated["vcrs_count"] = np.asarray(adata.X.sum(axis=0)).ravel()

var_annotated["gene_name"] = var_annotated["gene_name"].astype('object')  # converts from category to object
var_annotated["gene_name"] = var_annotated["gene_name"].apply(safe_literal_eval)  # converts list-like string to list
var_annotated["gene_name_set"] = var_annotated["gene_name"].apply(_sorted_unique_gene_names)
var_annotated['gene_name_set_string'] = var_annotated['gene_name_set'].apply(lambda x: ';'.join(x))  # converts list to semi-colon joined string

adata.var = var_annotated

In [None]:
# Gene-level mode: keep a copy of the per-variant matrix in `adata_mutation`, then replace
# `adata` with the output of create_mutated_gene_count_matrix_from_mutation_count_matrix.
if matrix_type == "gene":
    adata_mutation = adata.copy()
    adata = create_mutated_gene_count_matrix_from_mutation_count_matrix(
        adata,
        sum_strategy=sum_strategy,
        merge_strategy=merge_strategy,
        use_binary_matrix=use_binary_matrix,
    )

In [None]:
# gene_name and gene_name_set are stringified to avoid an error when saving adata.
for list_column in ('gene_name', 'gene_name_set'):
    adata.var[list_column] = adata.var[list_column].apply(str)

# TODO: group by tissue - not implemented yet.
# Presumably this step should define `adata_tissue_grouped`, which the final cell writes to disk.

In [None]:
# Analysis: load the tissue-grouped CCLE variant matrix (an .h5ad written elsewhere).
# NOTE(review): presumably this is the reference the ENTEx matrix is compared against - confirm.
adata_ccle_grouped_path = "/home/jrich/data/varseek_data/sequencing/bulk/ccle/adata_combined_ccle_rnaseq_mutation_grouped_tissue.h5ad"
adata_ccle = sc.read_h5ad(adata_ccle_grouped_path)

# TODO: the analysis itself is not implemented yet.

In [None]:
# Persist the combined per-sample matrix.
adata.write(adata_out_path)

# The tissue-grouped matrix only exists once the "group by tissue" step has been implemented
# and run. Check for it explicitly so a fresh Restart & Run All does not end in a NameError
# after the main output has already been written.
if "adata_tissue_grouped" in globals():
    adata_tissue_grouped.write(adata_grouped_tissue_out_path)
else:
    print(f"adata_tissue_grouped is not defined; skipped writing {adata_grouped_tissue_out_path}")