In [None]:
### Import Libraries.

import os
import subprocess

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from sklearn.mixture import GaussianMixture

In [None]:
### Load Data.

# Working directory and input file for this analysis. Every later relative
# path in the notebook resolves against DATA_DIR once os.chdir has run.
DATA_DIR = "/folder/"
INPUT_H5AD = "adata.h5ad"

os.chdir(DATA_DIR)

# Full AnnData object; the regional subsets below are copies taken from it.
adata = ad.read_h5ad(INPUT_H5AD)

In [None]:
### Subsetting Spinal Cord.

# Independent copy of the spinal-cord cells so the steps below never touch `adata`.
adata_sc = adata[adata.obs["Region"] == "Spinal_Cord"].copy()

# Restart from the 'counts_RNA' layer (presumably raw counts - confirm) so the
# normalisation below is not applied on top of an already-transformed X.
adata_sc.X = adata_sc.layers['counts_RNA'].copy()

# Sanity check on the starting matrix. A bare expression in the middle of a cell
# is evaluated and thrown away, so the value is printed explicitly.
print("Spinal cord - max value in X before normalisation:", adata_sc.X.max())

# Library-size normalisation to 10,000 per cell; snapshot kept in layer "data".
sc.pp.normalize_total(adata_sc, target_sum = 1e4)
adata_sc.layers["data"] = adata_sc.X.copy()

# log1p transform; snapshot kept in layer "log1p_normalized".
sc.pp.log1p(adata_sc)
adata_sc.layers["log1p_normalized"] = adata_sc.X.copy()

# HVG selection runs on the log-transformed matrix and only annotates adata_sc.var.
sc.pp.highly_variable_genes(adata_sc, n_top_genes = 5000, flavor = 'seurat')

# Per-gene scaling, clipped at 10; after this line X holds the scaled values,
# which are also snapshotted in layer "scale.data".
sc.pp.scale(adata_sc, max_value = 10)
adata_sc.layers["scale.data"] = adata_sc.X.copy()

In [None]:
### Subsetting Motor Cortex.

# Independent copy of the cells whose Region is "Brain", so the steps below
# never touch `adata`.
adata_mc = adata[adata.obs["Region"] == "Brain"].copy()

# Restart from the 'counts_RNA' layer (presumably raw counts - confirm) so the
# normalisation below is not applied on top of an already-transformed X.
adata_mc.X = adata_mc.layers['counts_RNA'].copy()

# Sanity check on the starting matrix. A bare expression in the middle of a cell
# is evaluated and thrown away, so the value is printed explicitly.
print("Motor cortex - max value in X before normalisation:", adata_mc.X.max())

# Library-size normalisation to 10,000 per cell; snapshot kept in layer "data".
sc.pp.normalize_total(adata_mc, target_sum = 1e4)
adata_mc.layers["data"] = adata_mc.X.copy()

# log1p transform; snapshot kept in layer "log1p_normalized".
sc.pp.log1p(adata_mc)
adata_mc.layers["log1p_normalized"] = adata_mc.X.copy()

# HVG selection runs on the log-transformed matrix and only annotates adata_mc.var.
sc.pp.highly_variable_genes(adata_mc, n_top_genes = 5000, flavor = 'seurat')

# Per-gene scaling, clipped at 10; after this line X holds the scaled values,
# which are also snapshotted in layer "scale.data".
sc.pp.scale(adata_mc, max_value = 10)
adata_mc.layers["scale.data"] = adata_mc.X.copy()

In [None]:
### GMM.

# Marker genes whose mean expression defines the per-cell activation score.
markers = [
    "CD9", "FTH1", "CNP", "CRYAB", "FTL", "MAG", "MARCKSL1", "NKX6-2", "RNASE1", "APOD"
]

# Fail early with the full list of absent genes instead of an opaque indexing error.
missing = [m for m in markers if m not in adata_sc.var_names]
if missing:
    raise ValueError(f"Markers not found in adata.var_names: {missing}")

# Per-cell mean over the marker genes. `.mean(axis=1)` returns a 1-D array for a
# dense X but an (n, 1) np.matrix for a sparse X; flattening *before* the
# assignment handles both (checking the resulting Series for `.A1` never fires).
# NOTE(review): X is whatever the preprocessing cell left in place (scaled,
# clipped values in this notebook) - confirm that is the intended input rather
# than the "log1p_normalized" layer.
adata_sc.obs['marker_avg'] = np.asarray(adata_sc[:, markers].X.mean(axis = 1)).ravel()

# The mixture is fitted on control cells only, then applied to every cell.
control_mask = adata_sc.obs['Status'] == 'Control'
control_values = adata_sc.obs.loc[control_mask, 'marker_avg'].to_numpy().reshape(-1, 1)
if control_values.shape[0] < 2:
    raise ValueError("Need at least two 'Control' cells to fit a 2-component GMM.")

gmm = GaussianMixture(n_components = 2, random_state = 42)
gmm.fit(control_values)

# Component indices are arbitrary; the one with the larger mean is the "high" state.
means = gmm.means_.flatten()
high_component = np.argmax(means)

adata_sc.obs['GMM_component'] = gmm.predict(adata_sc.obs['marker_avg'].to_numpy().reshape(-1, 1))

# Label the same object that carries 'GMM_component'. (The labels were previously
# written to adata_mc, which has no 'GMM_component' column, raising KeyError.)
# NOTE(review): later cells read adata.obs['Activation_State']; this cell only
# labels the spinal-cord subset - confirm how the labels reach `adata`.
adata_sc.obs['Activation_State'] = np.where(
    adata_sc.obs['GMM_component'] == high_component,
    'Reactive_Oligos',
    'Homeostatic_Oligos',
)

In [None]:
### Basic UMAP.

# Large, high-resolution canvas for the embedding.
sc.settings.set_figure_params(dpi = 200, figsize = (10, 10))

# Scatter of the UMAP embedding, one point per cell, coloured by activation state.
# NOTE(review): requires 'Activation_State' in adata.obs; the GMM cell writes its
# labels to a regional subset copy - confirm the column exists on `adata`.
umap_kwargs = dict(
    basis = 'umap',
    color = 'Activation_State',
    size = 10,
)
sc.pl.scatter(adata, **umap_kwargs)

In [None]:
### Plot of Activation States.

# Composite facet key "<Region>_<Annotation>", stored on adata.obs (so it is also
# part of the object if it is written to disk later).
adata.obs['Region_Annotation'] = adata.obs['Region'].astype(str) + "_" + adata.obs['Annotation'].astype(str)

# Keep non-enriched samples only and drop the statuses excluded from this figure.
df = adata.obs.copy()
df = df[df["Enrichment"] == "No"].copy()
df = df[~df["Status"].isin(["sALS_FTD", "fALS"])].copy()

# Cell counts per sample x region/annotation x activation state x status.
# observed=True restricts the result to combinations that actually occur, so
# categorical columns do not expand into the full cartesian product of their
# levels and no zero-count rows need to be filtered out afterwards.
df_summary = (
    df.groupby(["Sample_ID", "Region_Annotation", "Activation_State", "Status"], observed = True)
      .size()
      .reset_index(name = "count")
)

# Proportion of each activation state within a sample and region/annotation.
df_summary["total"] = df_summary.groupby(["Sample_ID", "Region_Annotation"])["count"].transform("sum")
df_summary["prop"] = df_summary["count"] / df_summary["total"]

print(df_summary.head())

# Colours are #RRGGBBAA hex strings keyed by Status.
status_colors = {
    "Control": "#ADB17DFF",
    "C9ALS": "#B1746FFF",
    "sALS": "#5B8FA8FF",
    "fALS": "#6F6DA8FF",
    "sALS_FTD": "#A163A3FF"
}

# Fix the category and hue order once so every facet (and both layers) places
# the same group at the same position. Without this, each facet orders groups
# from its own subset and unused categorical Status levels reserve empty slots.
state_order = sorted(df_summary["Activation_State"].astype(str).unique())
observed_statuses = set(df_summary["Status"].astype(str))
unknown_statuses = sorted(observed_statuses - set(status_colors))
if unknown_statuses:
    raise ValueError(f"No colour defined in status_colors for: {unknown_statuses}")
hue_order = [status for status in status_colors if status in observed_statuses]

g = sns.FacetGrid(
    df_summary,
    col = "Region_Annotation",
    sharey = True,
    height = 4,
    aspect = 1.2
)

# Layer 1: translucent boxes; outliers hidden because every point is drawn below.
g.map_dataframe(
    sns.boxplot,
    x = "Activation_State", y = "prop",
    hue = "Status",
    order = state_order, hue_order = hue_order,
    dodge = True, width = 0.6,
    showcaps = True, fliersize = 0,
    boxprops = dict(alpha = 0.3),
    whiskerprops = dict(alpha=0.6),
    linewidth = 1,
    palette = status_colors
)

# Layer 2: one point per sample, dodged to line up with its box.
g.map_dataframe(
    sns.stripplot,
    x = "Activation_State", y = "prop",
    hue = "Status",
    order = state_order, hue_order = hue_order,
    dodge = True, jitter = 0.15,
    size = 3, alpha = 0.8,
    marker = "o", edgecolor = "black", linewidth = 0.6,
    palette = status_colors,
    legend = False
)

g.set_axis_labels("Activation_State", "Proportion")
for ax in g.axes.flat:
    ax.tick_params(axis = "x", rotation = 45)

# One shared legend: take the handles from the first facet, then remove the
# per-facet legends so the same key is not repeated in every panel.
handles, labels = g.axes[0][0].get_legend_handles_labels()
for ax in g.axes.flat:
    facet_legend = ax.get_legend()
    if facet_legend is not None:
        facet_legend.remove()
g.fig.legend(handles, labels, title = "Status", loc = "upper center", ncol = 3)
g.fig.subplots_adjust(top = 0.8)

# The plotted variable is the activation state, so the title names it.
g.fig.suptitle("Proportion of Cells per Activation State, Status, and Region_Annotation", fontsize = 14)
plt.show()

In [None]:
### Overlap Between Annotation Methods.

# The two per-cell label vectors being compared.
method1 = adata.obs["Manual_Annotation"]

method2 = adata.obs["GMM_Annotation"]

# Contingency table: rows = manual labels, columns = GMM labels.
overlap_table = pd.crosstab(method1, method2)
print(overlap_table)

# Compare as plain objects: `==` on two categorical Series raises TypeError when
# their category sets differ. With object dtype a missing label never counts as
# a match either (NaN != NaN).
matching_cells = (method1.astype(object) == method2.astype(object)).sum()
total_cells = adata.n_obs
proportion_matching = matching_cells / total_cells if total_cells else float("nan")
print(f"Proportion of cells with matching labels: {proportion_matching:.2f}")

In [None]:
### Write Adata File.

# Destination directory and file name for the saved AnnData object.
OUTPUT_DIR = "/folder/"
OUTPUT_H5AD = "adata_new.h5ad"

os.chdir(OUTPUT_DIR)

# Persist `adata`, including any obs columns added earlier in the notebook.
adata.write(OUTPUT_H5AD)