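## Prep to run this notebook:
- python scripts/run_vk_ref_geuvadis.py
- python scripts/run_vk_count_ccle.py
- Run Fig1_single_cell_analysis.ipynb (generates the single-cell astrocyte CSV that this notebook reads)
- Download `Mutated_Genes.txt` from https://www.cbioportal.org/study/summary?id=gbm_tcga_pan_can_atlas_2018 (click 'mutated genes') and place it in `data/ccle_analysis/`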

In [1]:
# try:
#     import varseek as vk
# except ImportError:
#     print("varseek not found, installing...")
#     !pip install -U -q varseek
# try:
#     import RLSRWP_2025
# except ImportError:
#     print("RLSRWP_2025 not found, installing...")
#     !pip install -q git+https://github.com/pachterlab/RLSRWP_2025.git

In [None]:
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import shutil
import seaborn as sns
import gget
import re
from collections import Counter
import anndata as ad
from scipy import sparse
import scanpy as sc
from sklearn.metrics import silhouette_score
from upsetplot import UpSet, from_contents

import varseek as vk
from RLSRWP_2025.constants import box_links_dict

# Repository root = parent of the current working directory.
# Assumes the notebook is executed from a direct sub-folder of the repo (e.g. RLSRWP_2025/notebooks).
notebook_dir = os.path.abspath("")
RLSRWP_2025_dir = os.path.dirname(notebook_dir)

# All inputs and outputs of this notebook live under <repo>/data.
data_dir = os.path.join(RLSRWP_2025_dir, "data")

In [None]:
# ---------------------------------------------------------------------------
# Input locations
# ---------------------------------------------------------------------------
# CCLE glioblastoma variant matrix (AnnData .h5ad).
sequencing_data_out_base = os.path.join(data_dir, "ccle_data_base")
adata_combined_path_variants = os.path.join(sequencing_data_out_base, "adata_vcrs_glioblastoma.h5ad")
# adata_combined_path_reference_genome = os.path.join(sequencing_data_out_base, "adata_vcrs_combined_w37_k41.h5ad")

# Reference files (Ensembl GRCh37 transcript-to-gene map, COSMIC CMC v101 table).
reference_dir = os.path.join(RLSRWP_2025_dir, "data", "reference")
t2g_file = os.path.join(reference_dir, "ensembl_grch37_release113", "t2g.txt")
cosmic_csv = os.path.join(
    reference_dir,
    "cosmic",
    "CancerMutationCensus_AllData_Tsv_v101_GRCh37",
    "CancerMutationCensus_AllData_v101_GRCh37_mutation_workflow.csv",
)

# cBioPortal export; it is read from data/ccle_analysis, which is NOT the output folder defined at the end of this cell.
cbioportal_mutated_genes_path = os.path.join(data_dir, "ccle_analysis", "Mutated_Genes.txt")

# Single-cell tables produced by Fig1_single_cell_analysis.ipynb.
# NOTE(review): the variants path and the genes path point at the same CSV
# (genes_grouped_df_astrocytes.csv) - this looks like a copy/paste slip; confirm the
# intended file name for the variant-level table.
single_cell_analysis_dir = os.path.join(data_dir, "vk_count_out_fig1", "analysis", "single_cell_analysis")
single_cell_gbm_variants_path = os.path.join(single_cell_analysis_dir, "genes_grouped_df_astrocytes.csv")
single_cell_gbm_genes_path = os.path.join(single_cell_analysis_dir, "genes_grouped_df_astrocytes.csv")

# ---------------------------------------------------------------------------
# Filtering parameters
# ---------------------------------------------------------------------------
min_counts = 2
min_samples_per_variant = 3
n_top_variants = 2000

# Set any of the thresholds below to None to keep everything (no filtering).
fcv_threshold_ccle = 0.1
# fcv_threshold_sc = 0.1
FCV_neoplastic_astrocyte_to_FCV_healthy_astrocyte_ratio_1p_threshold_sc = 2
max_FCV_neoplastic_astrocyte_to_FCV_healthy_astrocyte_ratio_1p_threshold_sc = 2
number_cbio_samples = 10

# ---------------------------------------------------------------------------
# Output location (figures written by this notebook)
# ---------------------------------------------------------------------------
out_dir = os.path.join(data_dir, "ccle_analysis_total")

In [None]:
# --- Inputs that can be fetched automatically --------------------------------
if not os.path.isfile(adata_combined_path_variants):
    vk.utils.download_box_url(box_links_dict["ccle_glioblastoma_adata_vcrs"], output_file_name=adata_combined_path_variants)
if not os.path.isfile(cosmic_csv):
    gget.cosmic(None, grch_version=37, cosmic_version=101, out=os.path.dirname(cosmic_csv), cosmic_project="cancer", download_cosmic=True, gget_mutate=True, keep_genome_info=True, remove_duplicates=True)

# --- Inputs that must be provided by the user ---------------------------------
# Each entry pairs a required file with the instruction shown when it is missing.
# A missing file is reported with FileNotFoundError (FileExistsError means the opposite).
required_inputs = [
    (cbioportal_mutated_genes_path, "Download by visiting https://www.cbioportal.org/study/summary?id=gbm_tcga_pan_can_atlas_2018 → click 'mutated genes'"),
    (single_cell_gbm_variants_path, "Please run Fig1_single_cell_analysis.ipynb to generate this file."),
    (single_cell_gbm_genes_path, "Please run Fig1_single_cell_analysis.ipynb to generate this file."),
]
for required_path, how_to_obtain in required_inputs:
    if not os.path.isfile(required_path):
        raise FileNotFoundError(f"{required_path} does not exist. {how_to_obtain}")

# --- Load everything ----------------------------------------------------------
adata = ad.read_h5ad(adata_combined_path_variants)
cosmic_df = pd.read_csv(cosmic_csv)

# cBioPortal export is tab-separated; order rows by the "#" column, largest first.
cbioportal_mutated_genes_df = pd.read_csv(cbioportal_mutated_genes_path, sep="\t")
cbioportal_mutated_genes_df = cbioportal_mutated_genes_df.sort_values(by="#", ascending=False)

single_cell_gbm_variants_df = pd.read_csv(single_cell_gbm_variants_path)
single_cell_gbm_genes_df = pd.read_csv(single_cell_gbm_genes_path)

# Figures produced later in the notebook are written here.
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Keep the cBioPortal rows whose "#" value is at least number_cbio_samples;
# number_cbio_samples = None disables the filter and keeps (a copy of) the full table.
if number_cbio_samples is None:
    cbioportal_mutated_genes_df_top = cbioportal_mutated_genes_df.copy()
else:
    meets_sample_cutoff = cbioportal_mutated_genes_df["#"] >= number_cbio_samples
    cbioportal_mutated_genes_df_top = cbioportal_mutated_genes_df[meets_sample_cutoff]

# Distinct gene symbols that survive the cBioPortal filter.
cbio_gbm_genes = set(cbioportal_mutated_genes_df_top["Gene"].unique())

In [32]:
# adata = ad.read_h5ad("/home/jmrich/Desktop/RLSRWP_2025/data/ccle_data_base/adata_vcrs_50.h5ad")  #!!! erase

In [None]:
# Select the variants (columns) to keep:
#   * observed (non-zero) in at least one sample, and
#   * whose name does not contain ";".
# Both masks are applied in one slice and the result is materialised with .copy(),
# because slicing an AnnData returns a view and the statements below assign to
# .var and .X - doing that on a view forces an implicit copy with a warning.
has_counts = np.asarray((adata.X != 0).sum(axis=0)).ravel() > 0
has_no_semicolon = ~adata.var_names.str.contains(";")
adata = adata[:, has_counts & has_no_semicolon].copy()

# Move the variant names from the index into a regular "variant_name" column.
adata.var.index.name = "variant_name"
adata.var = adata.var.reset_index(drop=False)

# Split "<seq_ID>:<mutation_cdna>" at the first colon only, so the result always has exactly two columns.
adata.var[["seq_ID", "mutation_cdna"]] = adata.var["variant_name"].str.split(":", n=1, expand=True)

# Attach the gene symbol for each seq_ID from COSMIC. The left join on a de-duplicated
# key keeps one row per variant; seq_IDs absent from COSMIC get a missing gene_symbol.
adata.var = adata.var.merge(cosmic_df[["seq_ID", "gene_name"]].drop_duplicates("seq_ID"), on="seq_ID", how="left").rename(columns={"gene_name": "gene_symbol"})

# Zero out every entry below min_counts (min_counts = None skips this step).
if min_counts is not None:
    adata.X = adata.X.multiply(adata.X >= min_counts)

# Number of samples in which each variant is still non-zero after thresholding.
# This can be 0 for variants whose counts were all below min_counts.
adata.var["number_obs"] = np.array((adata.X != 0).sum(axis=0)).flatten()

adata.var["FCV"] = adata.var["number_obs"] / adata.shape[0]  # Fraction of Samples with Variant (FCV)

# Display label "GENE(seq_ID):mutation"; missing wherever gene_symbol is missing.
adata.var['variant_name_with_gene_name'] = (
    adata.var["gene_symbol"] + "(" +
    adata.var["seq_ID"] + "):" +
    adata.var["mutation_cdna"]
)

adata.obs.index = adata.obs.index.astype(str)  # avoid Don’t call _normalize_index with non-categorical/string names
adata.var.index = adata.var.index.astype(str)  # avoid Don’t call _normalize_index with non-categorical/string names

In [35]:
# Preview the first rows of the annotated variant table.
# NOTE(review): the saved output below shows a `variant_id` column, while the current code
# names that column `variant_name` - the output appears to come from an earlier run; re-run to refresh.
adata.var.head()

Unnamed: 0,variant_id,seq_ID,mutation_cdna,gene_symbol,number_obs,FCV,variant_name_with_gene_name
0,ENST00000396153:c.1622C>T,ENST00000396153,c.1622C>T,FAF1,1,0.02,FAF1(ENST00000396153):c.1622C>T
1,ENST00000396153:c.1592C>T,ENST00000396153,c.1592C>T,FAF1,0,0.0,FAF1(ENST00000396153):c.1592C>T
2,ENST00000396153:c.2261A>G,ENST00000396153,c.2261A>G,FAF1,2,0.04,FAF1(ENST00000396153):c.2261A>G
3,ENST00000396153:c.1586C>T,ENST00000396153,c.1586C>T,FAF1,0,0.0,FAF1(ENST00000396153):c.1586C>T
4,ENST00000396153:c.1959G>A,ENST00000396153,c.1959G>A,FAF1,2,0.04,FAF1(ENST00000396153):c.1959G>A


In [None]:
def _values_passing_threshold(df, score_column, value_column, threshold):
    """Return the set of non-missing ``df[value_column]`` entries whose score passes ``threshold``.

    Args:
        df: table holding both columns.
        score_column: column compared against ``threshold`` (kept when ``>= threshold``).
        value_column: column whose values are collected.
        threshold: minimum score; ``None`` disables filtering and returns every value.

    Returns:
        A set of the selected values. Missing values (NaN) are dropped so that an
        unannotated entry is never counted as a real variant/gene.
    """
    if threshold is None:
        selected = df[value_column]
    else:
        selected = df.loc[df[score_column] >= threshold, value_column]
    return set(selected.dropna())


# CCLE: variants / genes present in at least fcv_threshold_ccle of the samples.
ccle_gbm_variants = _values_passing_threshold(adata.var, "FCV", "variant_name", fcv_threshold_ccle)
ccle_gbm_genes = _values_passing_threshold(adata.var, "FCV", "gene_symbol", fcv_threshold_ccle)

# Single cell: variants enriched in neoplastic vs healthy astrocytes (per-variant ratio).
sc_gbm_variants = _values_passing_threshold(
    single_cell_gbm_variants_df,
    "FCV_neoplastic_astrocyte_to_FCV_healthy_astrocyte_ratio_1p",
    "variant_name",
    FCV_neoplastic_astrocyte_to_FCV_healthy_astrocyte_ratio_1p_threshold_sc,
)

# Single cell: genes selected by their max ratio. This reads the gene-level table
# (single_cell_gbm_genes_df); the original code used the variant-level table here.
sc_gbm_genes = _values_passing_threshold(
    single_cell_gbm_genes_df,
    "max_FCV_neoplastic_astrocyte_to_FCV_healthy_astrocyte_ratio_1p",
    "gene_symbol",
    max_FCV_neoplastic_astrocyte_to_FCV_healthy_astrocyte_ratio_1p_threshold_sc,
)

In [None]:
# UpSet plot of variant overlap between the two sources:
#   ccle_variants - variants passing the CCLE sample-fraction threshold
#   sc_variants   - variants passing the single-cell neoplastic/healthy astrocyte ratio threshold
upset_dict = dict(ccle_variants=ccle_gbm_variants, sc_variants=sc_gbm_variants)
variants_detected_data_for_upset = from_contents(upset_dict)

# Draw the plot, write it to the output folder as PDF, then release the figure.
upset_out_path = os.path.join(out_dir, "upset_variants_detected.pdf")
upset_figure = UpSet(variants_detected_data_for_upset, subset_size="count", show_counts=True)
ax_dict = upset_figure.plot()
plt.savefig(upset_out_path)
plt.close()

In [None]:
# genes present in ccle (in a certain fraction of samples), sc (with certain max neoplastic astrocyte to healthy astrocyte ratio), and cbio (in a certain number of samples)
# All three sets must hold gene symbols so that they are comparable; the original code
# passed the *variant* sets under the "ccle_genes"/"sc_genes" labels, so nothing could
# overlap with the cBioPortal gene symbols.
upset_dict = {
    "ccle_genes": ccle_gbm_genes,
    "sc_genes": sc_gbm_genes,
    "cbio_genes": cbio_gbm_genes,
}

upset_out_path = os.path.join(out_dir, "upset_genes_detected.pdf")
# The variable name is kept from the variant plot above; here it holds gene-level membership data.
variants_detected_data_for_upset = from_contents(upset_dict)
ax_dict = UpSet(variants_detected_data_for_upset, subset_size='count', show_counts=True).plot()
plt.savefig(upset_out_path)
plt.close()