## Prep to run this notebook
Run the following scripts before executing the cells below:
- `python scripts/run_vk_ref_geuvadis.py`
- `python scripts/run_vk_count_ccle.py`

In [1]:
# Optional dependency bootstrap (intentionally disabled).
# Uncomment the lines below to install `varseek` and `RLSRWP_2025` with pip
# when either import fails in the current environment.
# try:
#     import varseek as vk
# except ImportError:
#     print("varseek not found, installing...")
#     !pip install -U -q varseek
# try:
#     import RLSRWP_2025
# except ImportError:
#     print("RLSRWP_2025 not found, installing...")
#     !pip install -q git+https://github.com/pachterlab/RLSRWP_2025.git

In [2]:
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import shutil
import seaborn as sns
import re
from collections import Counter
import anndata as ad
from scipy import sparse
import scanpy as sc
from sklearn.metrics import silhouette_score

import varseek as vk
from RLSRWP_2025.constants import box_links_dict

# Repository root is taken to be the parent of the current working directory:
# when the kernel runs from RLSRWP_2025/notebooks/, this resolves to RLSRWP_2025.
RLSRWP_2025_dir = os.path.abspath(os.pardir)
# All inputs and outputs used by this notebook live under <repo>/data.
data_dir = os.path.join(RLSRWP_2025_dir, "data")

In [1]:
# Load the CCLE glioblastoma VCRS AnnData from the project data directory.
# The location is built from `data_dir` rather than a user-specific absolute
# path, so the cell works on any checkout of the repository.
# `ad`, `os` and `data_dir` come from the import/setup cell at the top of the notebook.
adata = ad.read_h5ad(os.path.join(data_dir, "ccle_data_base", "adata_vcrs_glioblastoma.h5ad"))

In [2]:
# Display the AnnData repr (n_obs × n_vars and the obs columns) as a quick check of the load.
adata

AnnData object with n_obs × n_vars = 50 × 5345528
    obs: 'experiment_alias_underscores_only', 'experiment_accession', 'library_strategy', 'primary_disease', 'subtype_disease', 'sex', 'age', 'lineage_subtype', 'Cellosaurus_NCIt_disease', 'lineage'

In [None]:
# ---------------------------------------------------------------------------
# Tunable analysis settings (presumably consumed by later cells).
# ---------------------------------------------------------------------------
# Alternative, coarser grid: [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.0]
downsampled_fractions = [0.0001, 0.001, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.0]
min_counts = 2
min_samples_per_variant = 3
n_top_variants = 2000
# None, or a set of gene names to exclude from the analysis (in ENSGs).
# NOTE(review): two entries carry an ENST prefix rather than ENSG — confirm this is intended.
gene_set_to_exclude = {
    'ENSG00000111640',
    'ENSG00000242580',
    'ENSG00000243466',
    'ENST00000603632',
    'ENST00000605760',
}

# ---------------------------------------------------------------------------
# Directories.
# ---------------------------------------------------------------------------
sequencing_data_out_base = os.path.join(data_dir, "geuvadis_data_base")
geuvadis_reference_files_dir = os.path.join(data_dir, "reference", "geuvadis")
reference_dir = os.path.join(RLSRWP_2025_dir, "data", "reference")
out_dir = os.path.join(data_dir, "geuvadis_analysis_total")

# ---------------------------------------------------------------------------
# Reference files.
# ---------------------------------------------------------------------------
variants_transcriptome_df_path = os.path.join(geuvadis_reference_files_dir, "variants_transcriptome.parquet")
reference_genome_t2g = os.path.join(data_dir, "reference", "ensembl_grch37_release113", "t2g.txt")
t2g_file = os.path.join(reference_dir, "ensembl_grch37_release113", "t2g.txt")

# ---------------------------------------------------------------------------
# AnnData inputs (.h5ad).
# ---------------------------------------------------------------------------
geuvadis_genotype_true_adata = os.path.join(geuvadis_reference_files_dir, "genotypes_adata_true.h5ad")
adata_combined_path_reference_genome = os.path.join(sequencing_data_out_base, "adata_reference_genome_combined.h5ad")
adata_combined_path_vcrs_final = os.path.join(sequencing_data_out_base, "adata_vcrs_combined_w37_k41.h5ad")

In [4]:
# Fetch each input that is not already on disk via vk.utils.download_box_url.
# Each pair is (key into box_links_dict, local destination path); the link is
# only looked up when the file is actually missing.
for box_key, local_h5ad in (
    ("geuvadis_adata_reference_genome", adata_combined_path_reference_genome),
    ("geuvadis_adata_genotype", geuvadis_genotype_true_adata),
    ("geuvadis_adata_vcrs_w37_k41", adata_combined_path_vcrs_final),
):
    if os.path.isfile(local_h5ad):
        continue
    vk.utils.download_box_url(box_links_dict[box_key], output_file_name=local_h5ad)

# Read the three AnnData objects from their .h5ad files.
adata_reference = ad.read_h5ad(adata_combined_path_reference_genome)
adata_genotype = ad.read_h5ad(geuvadis_genotype_true_adata)
adata_vcrs = ad.read_h5ad(adata_combined_path_vcrs_final)

# Make sure the output directory exists.
os.makedirs(out_dir, exist_ok=True)