## Appendix
PCA and clustering (k-means, Leiden, and hierarchical clustering) were performed on scRNA-seq data using BANKSY markers, with two feature selection strategies:  
- using markers alone  
- combining each marker with its top 25 most correlated genes (BANKSY method)  

All preprocessing steps follow those described in the BANKSY paper.  

In [None]:
from pathlib import Path

import sys
import os
sys.path.append(os.path.abspath("../src"))

from utils import load_scRNA_data
from utils import process_related_genes, kmeans_clustering, leiden_clustering, hierarchical_clustering
from plot import plot_pca_variance_ratio, plot_pca_cumulative_variance, plot_umap_from_pca

import pandas as pd

### Data

#### Matched Single-Cell RNA Sequencing

In [None]:
# Folder with the matched single-cell files; every input below lives inside it.
data_path = Path("../data/mouse_hypothalamus/singlecell/")

# GSE113576 matrix / barcodes / genes files.
mtx_path = data_path.joinpath("GSE113576_matrix.mtx")
barcodes_path = data_path.joinpath("GSE113576_barcodes.tsv")
genes_path = data_path.joinpath("GSE113576_genes.tsv")

# Metadata spreadsheet (Table S1).
meta_path = data_path.joinpath("aau5324_Moffitt_Table-S1.xlsx")

In [None]:
# Mapping passed to the loader as its cell-class filter.
# NOTE(review): presumably metadata class name -> label to keep; confirm in utils.load_scRNA_data.
cell_class_dict = {"Mature oligodendrocyte": "OD mature"}

# Load the scRNA-seq inputs through the project helper using the paths defined above.
adata = load_scRNA_data(
    mtx_path, barcodes_path, genes_path, meta_path,
    cell_class_filter=cell_class_dict,
)

#### Marker Genes

Differentially expressed (DE) genes identified by BANKSY.

In [None]:
# Marker genes for the two groups (MOD2 and MOD1).
# NOTE(review): the earlier notes here read "DE_genes_gm: 7" and "DE_genes_wm: 8", which
# match neither the variable names nor the list lengths (9 and 5) — confirm what 7/8 meant.
DE_genes_MOD2 = ['Mlc1', 'Dgkk', 'Cbln2', 'Syt4', 'Gad1', 'Plin3', 'Gnrh1', 'Sln', 'Gjc3']
DE_genes_MOD1 = ['Mbp', 'Lpar1', 'Trh', 'Ucn3', 'Cck']

# All differentially expressed genes: MOD2 markers first, then MOD1.
# Derived from the two lists above so the three definitions cannot drift apart.
DE_genes = DE_genes_MOD2 + DE_genes_MOD1

In [None]:
# Expression table obtained from `adata.to_df()`.
sc_data = adata.to_df()

# Column subsets for each marker group.
sc_DE_MOD2_df = sc_data[DE_genes_MOD2]
sc_DE_MOD1_df = sc_data[DE_genes_MOD1]

# Both groups side by side, MOD2 columns first.
sc_DE = pd.concat((sc_DE_MOD2_df, sc_DE_MOD1_df), axis="columns")

### BANKSY method: top 25 correlated genes for each marker

In [None]:
# Expand the marker list via the project helper, which receives the transposed table.
# NOTE(review): the "top 25 per marker" cut-off comes from the section heading, not from
# an argument here — confirm it in utils.process_related_genes.
top25_corr_genes = process_related_genes(DE_genes, sc_data.transpose())
print("number of unique correlated genes and DE genes: ", len(top25_corr_genes))

In [None]:
# Size of the expanded gene list; reused as the PCA component count for the expanded table.
top25_num = len(top25_corr_genes)

# Feature tables for the two strategies: markers alone vs. the expanded gene list.
DE_scRNA_data = sc_data[DE_genes]
DE_corr_scRNA_data = sc_data[top25_corr_genes]

### PCA

In [None]:
# Work on a deep copy of the marker-only table for the PCA/UMAP plots below.
data = DE_scRNA_data.copy(deep=True)

In [None]:
# One component per marker gene: the count is derived from DE_genes (14 entries) instead of
# being hard-coded, mirroring the expanded-table cell that uses top25_num.
plot_pca_variance_ratio(data, n_components=len(DE_genes), title="Explained Variance by PC")

In [None]:
# One component per marker gene: the count is derived from DE_genes (14 entries) instead of
# being hard-coded, mirroring the expanded-table cell that uses top25_num.
plot_pca_cumulative_variance(data, n_components=len(DE_genes), title="Cumulative Explained Variance")

In [None]:
# UMAP of the marker-only table, with n_PCs=5 passed to the helper.
plot_umap_from_pca(
    data,
    n_PCs=5,
    title="UMAP after PCA",
    color="mediumvioletred",
)

In [None]:
# Work on a deep copy of the expanded (markers + correlated genes) table for the plots below.
data_corr = DE_corr_scRNA_data.copy(deep=True)

In [None]:
# Per-component explained variance for the expanded table; component count = top25_num.
plot_pca_variance_ratio(
    data_corr,
    n_components=top25_num,
    title="Explained Variance by PC",
)

In [None]:
# Cumulative explained variance for the expanded table; component count = top25_num.
plot_pca_cumulative_variance(
    data_corr,
    n_components=top25_num,
    title="Cumulative Explained Variance",
)

In [None]:
# UMAP of the expanded table, with n_PCs=5 passed to the helper.
plot_umap_from_pca(
    data_corr,
    n_PCs=5,
    title="UMAP after PCA",
    color="mediumvioletred",
)

### k-means clustering

In [None]:
# k-means on the marker-only table (k=2, n_PCs=5).
# NOTE(review): cmap_re presumably toggles a reversed colour map — confirm in utils.kmeans_clustering.
OD_k_labels, OD_centroids = kmeans_clustering(
    DE_scRNA_data,
    k=2,
    n_PCs=5,
    cmap_re=False,
)

In [None]:
# k-means on the expanded table (k=2, n_PCs=5).
# This rebinds OD_k_labels / OD_centroids, replacing the marker-only result.
OD_k_labels, OD_centroids = kmeans_clustering(
    DE_corr_scRNA_data,
    k=2,
    n_PCs=5,
)

### Leiden clustering

In [None]:
# Leiden clustering on the marker-only table.
# NOTE(review): k is presumably the neighbour count of the graph, and resolution=0.027 looks
# hand-tuned for this table — confirm both against utils.leiden_clustering.
OD_leiden_labels = leiden_clustering(
    DE_scRNA_data,
    k=50,
    resolution=0.027,
    n_PCs=5,
    cmap_re=True,
)

In [None]:
# Leiden clustering on the expanded table (resolution 0.2 here vs. 0.027 for markers alone).
# This rebinds OD_leiden_labels, replacing the marker-only result.
OD_leiden_labels = leiden_clustering(
    DE_corr_scRNA_data,
    k=50,
    resolution=0.2,
    n_PCs=5,
)

### hierarchical clustering


In [None]:
# Hierarchical clustering on the marker-only table (k=2, n_PCs=5).
OD_h_labels = hierarchical_clustering(
    DE_scRNA_data,
    k=2,
    n_PCs=5,
)

In [None]:
# Hierarchical clustering on the expanded table (k=2, n_PCs=5).
# This rebinds OD_h_labels, replacing the marker-only result.
OD_h_labels = hierarchical_clustering(
    DE_corr_scRNA_data,
    k=2,
    n_PCs=5,
)