In [None]:
# try:
#     import varseek as vk
# except ImportError:
#     print("varseek not found, installing...")
#     !pip install -U -q varseek
# try:
#     import RLSRWP_2025
# except ImportError:
#     print("RLSRWP_2025 not found, installing...")
#     !pip install -q git+https://github.com/pachterlab/RLSRWP_2025.git

In [None]:
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import seaborn as sns
import anndata as ad

import varseek as vk
from RLSRWP_2025.constants import box_links_dict

# Repository root, taken as the parent of the current working directory.
# NOTE(review): this only resolves to the RLSRWP_2025 checkout when the kernel's working directory
# sits one level below the repo root (e.g. RLSRWP_2025/notebooks) - confirm how the notebook is launched.
RLSRWP_2025_dir = os.path.abspath(os.pardir)
# Every input and output path used by this notebook is built underneath this folder.
data_dir = os.path.join(RLSRWP_2025_dir, "data")

In [None]:
# #!!! dummy data to be erased

# # Define shared dimensions
# n_obs = 5
# n_vars = 50

# # Create shared obs and var
# obs = pd.DataFrame(index=[f"cell{i}" for i in range(n_obs)])
# variant_var = pd.DataFrame(index=[f"vcrs{j}" for j in range(n_vars)])
# variant_var["variant_header"] = variant_var.index
# gene_var = pd.DataFrame(index=[f"gene{j}" for j in range(n_vars)])

# # Generate random matrices
# X_vcrs = np.random.randint(0, 101, size=(n_obs, n_vars))
# X_reference = np.random.randint(0, 101, size=(n_obs, n_vars))
# X_genotype = np.random.choice([0, 1, 2], size=(n_obs, n_vars))

# # Create the AnnData objects
# adata_vcrs = ad.AnnData(X=X_vcrs, obs=obs.copy(), var=variant_var.copy())
# adata_reference = ad.AnnData(X=X_reference, obs=obs.copy(), var=gene_var.copy())
# adata_genotype = ad.AnnData(X=X_genotype, obs=obs.copy(), var=variant_var.copy())

In [None]:
# --- Analysis settings ---------------------------------------------------------
sample_index = 1  # sample to focus on - lowest is 1 (index in adata)
min_counts = 2  # variant counts below this value are zeroed out before the final analysis

# (w, k) combinations whose variant count matrices are compared in the Venn diagrams.
# NOTE(review): presumably varseek's flank window (w) and k-mer length (k) - confirm.
w_and_k_list_of_dicts = [
    {"w": w_value, "k": k_value}
    for w_value, k_value in ((27, 31), (37, 41), (47, 51))
]

# --- Input locations (all under data_dir) --------------------------------------
reference_genome_t2g = os.path.join(data_dir, "reference", "ensembl_grch37_release113", "t2g.txt")
geuvadis_reference_files_dir = os.path.join(data_dir, "reference", "geuvadis")
geuvadis_genotype_true_adata = os.path.join(geuvadis_reference_files_dir, "genotypes_adata_true.h5ad")

sequencing_data_out_base = os.path.join(data_dir, "geuvadis_data_base")
adata_combined_path_reference_genome = os.path.join(sequencing_data_out_base, "adata_reference_genome_combined.h5ad")

# --- Output location for the figures written by this notebook ------------------
out_dir = os.path.join(data_dir, "geuvadis_analysis")

In [None]:
# Fetch each input AnnData file from Box unless a local copy already exists.
for box_key, local_h5ad_path in (
    ("geuvadis_adata_reference_genome", adata_combined_path_reference_genome),
    ("geuvadis_adata_genotype", geuvadis_genotype_true_adata),
):
    if not os.path.isfile(local_h5ad_path):
        # Make sure the destination folder exists so the download can be written on a fresh checkout.
        os.makedirs(os.path.dirname(local_h5ad_path), exist_ok=True)
        vk.utils.download_box_url(box_links_dict[box_key], output_file_name=local_h5ad_path)

# adata_reference: counts from the reference-genome run (used later as per-gene counts).
# adata_genotype: the "true" genotype matrix the variant calls are compared against.
adata_reference = ad.read_h5ad(adata_combined_path_reference_genome)
adata_genotype = ad.read_h5ad(geuvadis_genotype_true_adata)

# Folder that receives every figure saved by this notebook.
os.makedirs(out_dir, exist_ok=True)

## Plot Venn diagrams for my k-series to determine the best variant matrix to use

In [None]:
# TODO: I currently ran vk ref with merge_identical=False, but in the future, I would like to do this with True, resolve merged headers with reference genome alignment, and then filter out whichever merged headers remain
def _load_adata_for_overlap(source, arg_name):
    """Return an AnnData read from a file path, or pass an in-memory AnnData through unchanged."""
    if isinstance(source, (str, os.PathLike)):
        return ad.read_h5ad(source)
    if isinstance(source, ad.AnnData):
        # The caller's object is only read from, never modified, so no defensive copy is needed.
        return source
    raise ValueError(f"{arg_name} must be a string or an AnnData object.")


def _dense_row(matrix, row):
    """Return one row of a dense or scipy-sparse matrix as a flat 1D NumPy array."""
    values = matrix[row]
    if hasattr(values, "toarray"):  # a sparse row stays 2D (1 x n_vars) until it is densified
        values = values.toarray()
    return np.asarray(values).ravel()


def plot_variant_overlap(adata_vcrs_path, adata_genotype_path, sample_index=1, min_counts=None, out_path=None):
    """Plot a Venn diagram of predicted vs. true variants for a single sample.

    A variant is "predicted" when its (optionally thresholded) count in the VCRS matrix is
    non-zero for the chosen sample, and "true" when its entry in the genotype matrix is
    non-zero for that sample. Variants are identified by ``var["variant_header"]``; for the
    VCRS matrix ``var["vcrs_header"]`` takes precedence when that column exists.

    Parameters
    ----------
    adata_vcrs_path : str, os.PathLike or anndata.AnnData
        Path to an ``.h5ad`` file, or an already loaded AnnData, holding the variant counts.
    adata_genotype_path : str, os.PathLike or anndata.AnnData
        Path to an ``.h5ad`` file, or an already loaded AnnData, holding the true genotypes.
    sample_index : int, default 1
        1-based row of the sample to compare (1 selects the first row of both matrices).
        The same row is read from both matrices, so they must list samples in the same order.
    min_counts : number or None, default None
        When given, variant counts below this value are treated as 0.
    out_path : str or None, default None
        When given, the figure is also saved to this path at 300 dpi.

    Returns
    -------
    None
        The figure is shown (and optionally saved) as a side effect.

    Raises
    ------
    ValueError
        If an input is neither a path nor an AnnData, or if ``sample_index`` is not an
        integer between 1 and the number of rows shared by both matrices.
    """
    adata_vcrs = _load_adata_for_overlap(adata_vcrs_path, "adata_vcrs_path")
    adata_genotype = _load_adata_for_overlap(adata_genotype_path, "adata_genotype_path")

    n_shared_rows = min(adata_vcrs.n_obs, adata_genotype.n_obs)
    is_integer = isinstance(sample_index, (int, np.integer)) and not isinstance(sample_index, bool)
    if not is_integer or not 1 <= sample_index <= n_shared_rows:
        raise ValueError(f"sample_index must be an integer between 1 and {n_shared_rows}, got {sample_index!r}.")
    # sample_index is 1-based; slicing with [:sample_index] and reading row 0 would always pick sample 1.
    row = sample_index - 1

    header_column = "vcrs_header" if "vcrs_header" in adata_vcrs.var.columns else "variant_header"
    vcrs_headers = adata_vcrs.var[header_column].to_numpy()
    vcrs_counts = _dense_row(adata_vcrs.X, row)
    if min_counts is not None:
        # Thresholding the extracted row works for dense and sparse X alike and leaves the AnnData untouched.
        vcrs_counts = np.where(vcrs_counts >= min_counts, vcrs_counts, 0)
    variants_predicted = set(vcrs_headers[vcrs_counts != 0])

    genotype_values = _dense_row(adata_genotype.X, row)
    variants_true = set(adata_genotype.var["variant_header"].to_numpy()[genotype_values != 0])

    fig, ax = plt.subplots(figsize=(4, 4))
    venn2([variants_predicted, variants_true], set_labels=('Predicted', 'True'), ax=ax)
    ax.set_title("Variant Overlap")
    if out_path:
        fig.savefig(out_path, dpi=300)
    plt.show()
    plt.close(fig)  # release the figure so repeated calls in a loop do not accumulate open figures

In [None]:
# Draw one predicted-vs-true Venn diagram per candidate (w, k) variant count matrix.
for w_and_k_dict in w_and_k_list_of_dicts:
    w, k = w_and_k_dict["w"], w_and_k_dict["k"]
    adata_combined_path_vcrs = os.path.join(sequencing_data_out_base, f"adata_vcrs_combined_w{w}_k{k}.h5ad")
    if not os.path.isfile(adata_combined_path_vcrs):
        # Make sure the destination folder exists so the download can be written on a fresh checkout.
        os.makedirs(os.path.dirname(adata_combined_path_vcrs), exist_ok=True)
        vk.utils.download_box_url(box_links_dict[f"geuvadis_adata_vcrs_w{w}_k{k}"], output_file_name=adata_combined_path_vcrs)

    venn_out_path = os.path.join(out_dir, f"variant_overlap_sample_{sample_index}_w{w}_k{k}.png")
    # Forward the configured sample_index (it used to be hard-coded to 1) so the plotted sample
    # always matches the sample named in the output file.
    # NOTE(review): min_counts is not forwarded, so these diagrams use unthresholded counts while the
    # final analysis applies min_counts - confirm this difference is intended.
    plot_variant_overlap(adata_combined_path_vcrs, adata_genotype, sample_index=sample_index, out_path=venn_out_path)

## Based on these overlaps, I will use w=47, k=51 for the rest of the analysis

In [None]:
w, k = 47, 51
adata_combined_path_vcrs_final = os.path.join(sequencing_data_out_base, f"adata_vcrs_combined_final_w{w}_k{k}.h5ad")
# NOTE(review): this requests the same Box entry as the per-(w, k) file used for the Venn diagrams but
# stores it under a "_final" name, so the same file may be downloaded twice - confirm a separate copy is intended.
if not os.path.isfile(adata_combined_path_vcrs_final):
    # Make sure the destination folder exists so the download can be written on a fresh checkout.
    os.makedirs(os.path.dirname(adata_combined_path_vcrs_final), exist_ok=True)
    vk.utils.download_box_url(box_links_dict[f"geuvadis_adata_vcrs_w{w}_k{k}"], output_file_name=adata_combined_path_vcrs_final)

adata_vcrs = ad.read_h5ad(adata_combined_path_vcrs_final)
if min_counts is not None:
    # Zero out every count below min_counts. Sparse matrices expose .multiply; dense arrays do not,
    # so fall back to np.where for them.
    if hasattr(adata_vcrs.X, "multiply"):
        adata_vcrs.X = adata_vcrs.X.multiply(adata_vcrs.X >= min_counts)
    else:
        adata_vcrs.X = np.where(adata_vcrs.X >= min_counts, adata_vcrs.X, 0)

# Use the same identifier column name ("variant_header") as the genotype matrix.
adata_vcrs.var = adata_vcrs.var.rename(columns={"vcrs_header": "variant_header"})
# Headers have the form "<transcript_ID>:<variant>"; n=1 splits on the first ":" only, so a header
# containing further colons still yields exactly the two columns assigned here.
adata_vcrs.var[["transcript_ID", "variant"]] = adata_vcrs.var["variant_header"].str.split(":", n=1, expand=True)
# NOTE(review): make_t2g_dict presumably returns a transcript ID -> gene ID mapping read from the t2g file - confirm.
t2g_dict = vk.utils.make_t2g_dict(reference_genome_t2g)
t2g_dict = {key.split(".")[0]: val.split(".")[0] for key, val in t2g_dict.items()}  # strip off the version number
# Transcripts missing from the mapping get NaN as their gene.
adata_vcrs.var["gene"] = adata_vcrs.var["transcript_ID"].map(t2g_dict)

In [None]:
def _first_row_as_1d(matrix):
    """Return row 0 of a dense or scipy-sparse matrix as a flat 1D NumPy array."""
    first_row = matrix[0]
    if hasattr(first_row, "toarray"):  # a sparse row stays 2D (1 x n_vars) until it is densified
        first_row = first_row.toarray()
    return np.asarray(first_row).ravel()


# Select only the sample_index'th sample. sample_index is 1-based, so it maps to row sample_index - 1
# (slicing with [:sample_index] would keep the first sample_index rows, and row 0 would always be sample 1).
if sample_index < 1 or sample_index > min(adata_vcrs.n_obs, adata_reference.n_obs, adata_genotype.n_obs):
    raise IndexError(
        f"sample_index={sample_index} is out of range for the loaded matrices; "
        "if this cell was already run, re-run the cells that load the AnnData objects first."
    )
sample_rows = slice(sample_index - 1, sample_index)
# NOTE(review): the same row is taken from all three matrices, which assumes they list samples in the same order - confirm.
# .copy() turns each AnnData view into a standalone object so the .var columns added below are written to it directly.
adata_vcrs = adata_vcrs[sample_rows, :].copy()
adata_reference = adata_reference[sample_rows, :].copy()
adata_genotype = adata_genotype[sample_rows, :].copy()

# Filter adata_genotype: keep only the variants whose genotype for this sample is not 0
adata_genotype = adata_genotype[:, _first_row_as_1d(adata_genotype.X) != 0].copy()
variants_true = set(adata_genotype.var["variant_header"])

# filter adata_vcrs to only keep those present in variants_true
adata_vcrs = adata_vcrs[:, adata_vcrs.var["variant_header"].isin(variants_true).to_numpy()].copy()

# add 1D count "matrix" as a column to adata.var (densified so each variable gets one scalar value)
adata_vcrs.var["variant_counts"] = _first_row_as_1d(adata_vcrs.X)
adata_reference.var["gene_counts"] = _first_row_as_1d(adata_reference.X)
adata_genotype.var["genotype"] = _first_row_as_1d(adata_genotype.X)

# Attach the gene counts to each variant by looking up its gene in adata_reference.var's index.
# Series.map keeps adata_vcrs.var's index and row count, and re-running the cell simply overwrites
# the column. Duplicate keys are reduced to their first entry so the lookup is well defined.
gene_counts_by_gene = adata_reference.var["gene_counts"]
gene_counts_by_gene = gene_counts_by_gene[~gene_counts_by_gene.index.duplicated(keep="first")]
adata_vcrs.var["gene_counts"] = adata_vcrs.var["gene"].astype(object).map(gene_counts_by_gene)

# Attach the genotype to each variant by variant header, again without touching the var index
# (a column-on-column merge would replace the variant names with a fresh 0..n-1 index).
genotype_by_variant = adata_genotype.var.set_index("variant_header")["genotype"]
genotype_by_variant = genotype_by_variant[~genotype_by_variant.index.duplicated(keep="first")]
adata_vcrs.var["genotype"] = adata_vcrs.var["variant_header"].astype(object).map(genotype_by_variant)

## Scatterplot: variant counts vs. gene counts, colored by genotype

In [None]:
# Per-variant plotting table; the numeric genotype codes are relabelled so the legend is readable.
scatterplot_df = adata_vcrs.var[["variant_counts", "gene_counts", "genotype"]].copy()
scatterplot_df["genotype"] = (
    scatterplot_df["genotype"]
    .astype("category")
    .cat.rename_categories({0: "homozygous_ref", 1: "heterozygous", 2: "homozygous_alt"})
)

# One point per variant: its own count on x, the count of its gene on y, colored by genotype.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(
    data=scatterplot_df,
    x="variant_counts",
    y="gene_counts",
    hue="genotype",
    palette="Set2",
    s=80,  # adjust marker size
    ax=ax,
)

ax.set_title("Variant vs Gene Counts by Genotype")
ax.set_xlabel("Variant Counts")
ax.set_ylabel("Gene Counts")
ax.legend(title="Genotype")
fig.tight_layout()

# The file name records the sample and the (w, k) pair the figure was made from.
out_file = os.path.join(out_dir, f"variant_vs_gene_counts_scatterplot_sample_{sample_index}_w{w}_k{k}.png")
fig.savefig(out_file, dpi=300)
plt.show()