In [None]:
### Import Libraries.

import os

import anndata as ad
import harmonypy as hm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import phate
import scanpy as sc
import scanpy.external as sce
import seaborn as sns

In [None]:
### Load Filtered Reference AnnData.

# NOTE(review): os.chdir changes the working directory of the whole kernel; the
# relative file names used by the cells below are resolved against "/folder/".
os.chdir("/folder/")
# Reference object: later cells read its obs table, obs_names and var_names.
adata = ad.read_h5ad("adata.h5ad")

In [None]:
### Load Individual .loom Files and Check Barcodes.

# One .loom file per lane, listed in the same order as `lane_names` below.
loom_paths = [
    "NIN__9ad119__C1-10X-27072020.loom",  "NIN__608c65__C2-10X-27072020.loom",
    "NIN__04cf5c__C3-10X-09092020.loom",  "NIN__97179c__C4-10X-09092020.loom",
    "NIN__463fa0__C5-10X-10092020.loom",  "NIN__039d7c__C6-10X-10092020.loom",
    "NIN__51fec3__C7-10X-10092020.loom",  "NIN__943d10__C8-10X-17092020.loom",
    "NIN__6745a5__C9-10X-17092020.loom",  "NIN__ffcec5__C10-10X-06102020.loom",
    "NIN__90d332__C11-10X-06102020.loom", "NIN__d855ee__C12-10X-08102020.loom",
    "NIN__a94c03__C13-10X-08102020.loom", "NIN__38661b__S1-10X-09092020.loom",
    "NIN__977c8e__S2-10X-17092020.loom",  "NIN__6349dd__S3-10X-17092020.loom",
    "NIN__027efd__S4-10X-06102020.loom",  "NIN__7313eb__S5-10X-06102020.loom",
    "NIN__63ba9e__S6-10X-08102020.loom"
]

# Lane labels C1..C13 followed by S1..S6.
lane_names = [f"C{i}" for i in range(1, 14)] + [f"S{i}" for i in range(1, 7)]

# Labels are attached to files purely by position and zip() stops at the shorter
# sequence, so a missing or re-ordered entry would silently mislabel or drop lanes.
if len(loom_paths) != len(lane_names):
    raise ValueError(
        f"{len(loom_paths)} loom files listed but {len(lane_names)} lane names generated."
    )
mislabelled = [
    (path, lane) for path, lane in zip(loom_paths, lane_names) if f"__{lane}-" not in path
]
if mislabelled:
    raise ValueError(f"Lane label does not match file name for: {mislabelled}")

adata_list = [ad.read_loom(f) for f in loom_paths]

for lane_adata, name in zip(adata_list, lane_names):
    lane_adata.obs["Lane"] = name
    # Keep the text after the last ":" of each cell name and strip trailing "x" characters.
    lane_adata.obs["Barcode"] = lane_adata.obs_names.str.split(":").str[-1].str.rstrip("x")
    dups = lane_adata.obs["Barcode"].duplicated().sum()
    status = "No duplicates" if dups == 0 else f"{dups} duplicate barcodes found."
    print(f"{name}: {status}")

In [None]:
### Integrate Metadata.

sample_info = pd.read_excel("Sample_Info_Simple.xlsx")
map_info = pd.read_excel("Map_Samples_Georgopoulou.xlsx")

# Bring both tables onto a common "Sample_ID" key: the mapping table uses its
# initial IDs, the sample sheet gets the "Georgopoulou_" prefix.
map_info = map_info.assign(Sample_ID=map_info["Initial_Sample_ID"])
sample_info = sample_info.assign(
    Sample_ID="Georgopoulou_" + sample_info["Sample_ID"].astype(str)
)

# Join, switch the key to the renamed sample ID, then drop the helper columns
# (columns that are absent are ignored).
merged_meta = (
    sample_info
    .merge(map_info, on="Sample_ID", how="inner")
    .assign(Sample_ID=lambda frame: frame["Rename"])
    .drop(columns=["Region", "Rename", "Initial_Sample_ID"], errors="ignore")
)

# Expose the cell names as a regular column so they survive the merge below.
adata.obs["Cell_ID"] = adata.obs_names
metadata = adata.obs.reset_index()

# validate="1:m" requires Sample_ID to be unique on the sample-level side.
merged_full = (
    merged_meta
    .merge(metadata, on="Sample_ID", how="left", validate="1:m")
    .loc[lambda frame: frame["Lane"] != "C1"]  ### Test Lane
)

In [None]:
### Subset AnnData to Only Include Overlapping Cells.

# Cells of the reference object that are present in the merged metadata table.
in_metadata = adata.obs_names.isin(merged_full["Cell_ID"])
adata_sub = adata[in_metadata].copy()

# Swap in the merged metadata, ordered like the cells of the subset.
obs_by_cell = merged_full.set_index("Cell_ID")
adata_sub.obs = obs_by_cell.loc[adata_sub.obs_names]

dataset_col = adata_sub.obs["Dataset"]
adata_sub.obs["Dataset"] = dataset_col.cat.remove_unused_categories()

# Keep only the part of each barcode in front of the first "-".
barcode_col = adata_sub.obs["Barcode"]
adata_sub.obs["Barcode"] = barcode_col.str.split("-", n=1).str[0]

n_samples = adata_sub.obs["Sample_ID"].nunique()
print(f"Subset AnnData: {adata_sub.n_obs:,} cells across {n_samples} samples retained.")

In [None]:
### Merge loom Datasets Corresponding to Filtered Cells.

def remove_duplicated_gene_names(adata_obj: ad.AnnData) -> ad.AnnData:
    """Drop repeated gene names, keeping the first occurrence of each.

    Parameters
    ----------
    adata_obj
        AnnData whose ``var_names`` may contain repeated entries.

    Returns
    -------
    ad.AnnData
        ``adata_obj`` itself when all names are already unique; otherwise an
        independent copy restricted to the first occurrence of each name.
    """
    is_repeat = adata_obj.var_names.duplicated()
    if not is_repeat.any():
        return adata_obj
    # Slicing alone yields a view; return a real copy so that callers can assign
    # into .obs without triggering an implicit view-to-copy conversion.
    deduped = adata_obj[:, ~is_repeat].copy()
    print(f"Removed duplicated genes → {deduped.n_vars} unique remain.")
    return deduped

processed = []
filtered_barcodes = set(adata_sub.obs["Barcode"])

# Reference cells to keep, keyed by (Lane, Barcode). A barcode on its own is not a
# safe key: each lane is loaded separately, so the same barcode string can occur in
# several lanes, and barcode-only matching would also pull in cells from lanes that
# were removed from the reference.
ref_obs = adata_sub.obs.copy()
ref_obs["Cell_ID"] = adata_sub.obs_names.to_numpy()
ref_keys = pd.MultiIndex.from_arrays(
    [ref_obs["Lane"].astype(str), ref_obs["Barcode"].astype(str)],
    names=["Lane", "Barcode"],
)
if ref_keys.has_duplicates:
    raise ValueError(
        "Reference cells are not unique per (Lane, Barcode); cannot map loom cells unambiguously."
    )

for ad_obj in adata_list:
    ad_obj = remove_duplicated_gene_names(ad_obj)
    lane_keys = pd.MultiIndex.from_arrays(
        [ad_obj.obs["Lane"].astype(str), ad_obj.obs["Barcode"].astype(str)]
    )
    kept = ad_obj[lane_keys.isin(ref_keys)].copy()
    kept.obs["Barcode"] = kept.obs["Barcode"].astype(str)
    # Lanes without any retained cell contribute nothing to the merge.
    if kept.n_obs > 0:
        processed.append(kept)

if not processed:
    raise ValueError("No loom cell matched a (Lane, Barcode) pair of the reference.")

# NOTE(review): AnnData.concatenate is deprecated upstream in favour of anndata.concat.
adata_merged = processed[0].concatenate(*processed[1:], join="outer")

# Store an independent copy: a layer that merely references X would be altered
# together with X by any later in-place transformation.
adata_merged.layers["counts"] = adata_merged.X.copy()

# Carry the reference metadata over to the merged object; the cells below read
# obs columns such as "Sample_ID" that the loom files do not provide.
merged_keys = pd.MultiIndex.from_arrays(
    [adata_merged.obs["Lane"].astype(str), adata_merged.obs["Barcode"].astype(str)]
)
ref_meta = ref_obs.drop(columns=["Lane", "Barcode"])
ref_meta.index = ref_keys
ref_meta = ref_meta.reindex(merged_keys)
for col in ref_meta.columns:
    if col not in adata_merged.obs.columns:
        # .values keeps categorical dtypes and avoids any index alignment.
        adata_merged.obs[col] = ref_meta[col].values

print(f"Merged filtered AnnData: {adata_merged.n_obs:,} cells × {adata_merged.n_vars:,} genes.")

In [None]:
### Downstream Analysis.

# NOTE(review): this cell reads adata_merged.obs["Sample_ID"] and ["Dataset"];
# confirm both columns are present on the merged object before running it.

# Normalise each cell to a total of 1e4, then keep a snapshot of X after every
# transformation step in its own layer.
sc.pp.normalize_total(adata_merged, target_sum = 1e4)
adata_merged.layers["data"] = adata_merged.X.copy()
sc.pp.log1p(adata_merged)
adata_merged.layers["log1p_normalized"] = adata_merged.X.copy()
# Flag the top 5000 variable genes (Seurat flavour); the matrix itself is not subset here.
sc.pp.highly_variable_genes(adata_merged, n_top_genes = 5000, flavor = 'seurat')
# Scale genes and clip at 10.
sc.pp.scale(adata_merged, max_value = 10)
adata_merged.layers["scale.data"] = adata_merged.X.copy()
# 50 principal components.
# NOTE(review): presumably restricted to the flagged highly variable genes — confirm
# against the installed scanpy version.
sc.tl.pca(adata_merged, n_comps = 50)
# Combined "<Sample_ID>_<Dataset>" label.
# NOTE(review): this column is not used by the Harmony call below, which integrates
# on 'Sample_ID' — confirm which key is intended.
adata_merged.obs['Sample_ID_Dataset'] = (
    adata_merged.obs['Sample_ID'].astype(str) + "_" + adata_merged.obs['Dataset'].astype(str)
)
# Harmony batch correction; the neighbour graph below reads its result from
# obsm['X_pca_harmony'].
sce.pp.harmony_integrate(adata_merged, key='Sample_ID', max_iter_harmony = 100,
                         nclust = 40, max_iter_kmeans = 20)

# Neighbour graph on the Harmony representation, then UMAP and Leiden clustering
# (stored under obs["leiden_0.1"]).
sc.pp.neighbors(adata_merged, use_rep = 'X_pca_harmony', n_neighbors = 20)
sc.tl.umap(adata_merged)
sc.tl.leiden(adata_merged, resolution = 0.1, key_added = "leiden_0.1")
print("UMAP embedding complete.")

In [None]:
### PHATE Embedding

# Two-dimensional PHATE embedding of the Harmony-corrected principal components.
# random_state is fixed so that re-running the notebook reproduces the same embedding.
phate_op = phate.PHATE(n_components = 2, knn = 10, t = 20, random_state = 0, verbose = False)
adata_merged.obsm["X_phate"] = phate_op.fit_transform(adata_merged.obsm["X_pca_harmony"])
print("PHATE embedding computed.")

In [None]:
### Plot

# UMAP coloured by sample.
sc.set_figure_params(dpi = 800, figsize=(5,5))
sc.pl.umap(adata_merged, color = "Sample_ID", title = "UMAP")

# PHATE scatter: one integer colour code per sample, mapped through "tab20".
phate_xy = adata_merged.obsm["X_phate"]
sample_codes = adata_merged.obs["Sample_ID"].astype("category").cat.codes

plt.figure()
plt.scatter(phate_xy[:, 0], phate_xy[:, 1], c = sample_codes, s = 2, cmap = "tab20")
plt.title("PHATE embedding")
plt.axis("off")

In [None]:
### Save Anndata.

# Write the merged object (all genes) to "/folder/adata_RNA_vel.h5ad".
os.chdir("/folder/")
adata_merged.write_h5ad("adata_RNA_vel.h5ad")

In [None]:
### Subset to Keep Common Features.

# Genes present in both the merged loom object and the reference object.
common_genes = adata_merged.var_names.intersection(adata.var_names)

merged_has_gene = adata_merged.var_names.isin(common_genes)
reference_has_gene = adata.var_names.isin(common_genes)

# Both objects are replaced by their gene-restricted copies.
adata_merged = adata_merged[:, merged_has_gene].copy()
adata = adata[:, reference_has_gene].copy()

n_shared = len(common_genes)
print(f"Retained {n_shared:,} shared features between merged_adata and adata.")

In [None]:
### Save Anndata.

# Write the gene-restricted merged object to "/folder/adata_RNA_vel_sub_features.h5ad".
os.chdir("/folder/")
adata_merged.write_h5ad("adata_RNA_vel_sub_features.h5ad")

In [None]:
### Plot Spliced/Unspliced Ratio.

# Clusters to show, in plotting order; cells with any other label are dropped below.
category_order = [f"Cluster_{i}" for i in range(1, 11)]

spliced = adata_merged.layers["spliced"]
unspliced = adata_merged.layers["unspliced"]

# Per-cell totals; np.asarray(...).ravel() flattens both matrix-like and plain
# array results of .sum(axis = 1).
spliced_sum = np.asarray(spliced.sum(axis = 1)).ravel()
unspliced_sum = np.asarray(unspliced.sum(axis = 1)).ravel()

# A cell without spliced counts has no defined ratio. Leave it as NaN instead of
# dividing by a tiny pseudo-count, which would turn it into an extreme outlier.
ratio = np.full(spliced_sum.shape, np.nan, dtype = float)
np.divide(unspliced_sum, spliced_sum, out = ratio, where = spliced_sum > 0)

# Build the full plotting table in one go; labels outside `category_order` become
# missing values in the categorical column.
df = pd.DataFrame(
    {
        "unspliced_spliced_ratio": ratio,
        "Cluster_Column": pd.Categorical(
            adata_merged.obs["Cluster_Column"].astype(str),
            categories = category_order,
            ordered = True,
        ),
        "Region": adata_merged.obs["Region"].values,
    },
    index = adata_merged.obs_names,
)
df = df.dropna(subset = ["Cluster_Column", "unspliced_spliced_ratio"])

custom_palette = {
    "Brain": "#282A3A",
    "Spinal_Cord": "#9496A1"
}

sns.set_style("white")
fig, ax = plt.subplots(figsize = (14, 6), facecolor = "white")
sns.boxplot(
    data = df,
    x = "Cluster_Column",
    y = "unspliced_spliced_ratio",
    hue = "Region",
    order = category_order,
    # Restrict the hue levels to the regions that have a colour assigned.
    hue_order = list(custom_palette),
    palette = custom_palette,
    ax = ax,
)

ax.tick_params(axis = "x", labelrotation = 45)
ax.set_title("Unspliced / Spliced Ratio per Cluster_Column split by Region")
ax.legend(title = "Region", bbox_to_anchor = (1.05, 1), loc = "upper left")
fig.tight_layout()
fig.savefig("unspliced_spliced_ratio_boxplot.png", dpi = 800)
plt.show()