In [18]:
# ======================================================================
# == import main packages ==
# ======================================================================

import matplotlib.image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
from matplotlib.colors import ListedColormap
from stardist import random_label_cmap, _draw_polygons
import pandas as pd
import numpy as np
import scanpy as sc
from scipy import sparse
import bin2cell as b2c
from stardist.models import StarDist2D
from csbdeep.utils import normalize
import cv2
import os
import pickle
import anndata
import geopandas as gpd
from tifffile import imread, imwrite
from stardist.models import StarDist2D
from shapely.geometry import Polygon, Point
import seaborn as sns
import scrublet as scr
#pip install celltypist
import celltypist
from celltypist import models
import itertools
import anndata as ad
from pandas.api.types import CategoricalDtype

from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.impute import SimpleImputer

In [19]:
# ======================================================================
# == import custom functions ==
# ======================================================================

# from custom_functions import create_palette_matplotlib, up_to_pca, from_pca_to_leiden

def up_to_pca(adata, n_top_genes=3000, from_raw=True):

    print('Top genes: '+str(n_top_genes))
    if from_raw:
        x = adata.raw.to_adata()
    else:
        x = adata.copy()
        sc.pp.normalize_total(x)
        sc.pp.log1p(x)
    if n_top_genes == 0:
        sc.pp.highly_variable_genes(x, min_mean=0.0125, max_mean=3, min_disp=-2)
    else:
        sc.pp.highly_variable_genes(x, n_top_genes=n_top_genes)
    sc.pl.highly_variable_genes(x)
    x.raw = x
    x = x[:, x.var.highly_variable]
    sc.pp.scale(x, max_value = 10)
    sc.tl.pca(x)
    sc.pl.pca_variance_ratio(x, n_pcs=50)
    return x

def from_pca_to_leiden(adata, n_neighbors = 20, n_pcs = 15, resolution = 1, plot_spatial=True):
    
    print('Neighbors: '+str(n_neighbors))
    print('PCA n: '+str(n_pcs))
    print('resolution: '+str(resolution))
    x = adata.copy()
    sc.pp.neighbors(x, n_neighbors = n_neighbors, n_pcs = n_pcs)
    sc.tl.umap(x)
    sc.tl.leiden(x, flavor="igraph", resolution= resolution)
    n_clusters = len(np.unique(x.obs.leiden))
    x.uns["leiden_colors"] = create_palette_matplotlib(len(x.obs.leiden.cat.categories), white=False)
    sc.pl.umap(x, color=["leiden"], legend_loc = 'on data')
    if plot_spatial:
    	sc.pl.spatial(x, color="leiden")
    return x

def create_palette_matplotlib(n, white=True):

    cmap = plt.get_cmap('hsv')  # Puoi cambiare 'hsv' con qualsiasi colormap disponibile in matplotlib
    colors = [cmap(i / n) for i in range(n)]
    if(white): colors = [(1, 1, 1, 1)] + colors
    return colors #ListedColormap(colors)



In [20]:
# ======================================================================
# == load the input ==
# ======================================================================

# adata
adata = sc.read_h5ad("../../../../data/Mouse_Embryo/Mouse_Embryo_adata_final.h5ad")

In [21]:
# ======================================================================
# == processing ==
# ======================================================================

# with True it fails
adata.raw = adata.copy()
adata = up_to_pca(adata, n_top_genes=3000, from_raw=True)
# adata = up_to_pca(adata, n_top_genes=3000, from_raw=False)

# n_top_genes, n_neighbors, n_pcs, resolution potrebbero essere dati come input?

Top genes: 3000


In [22]:
sc.pl.spatial(adata)

In [23]:
# with True fails
adata = from_pca_to_leiden(adata, n_neighbors = 20, n_pcs = 15, resolution = 1, plot_spatial=True)
# adata = from_pca_to_leiden(adata, n_neighbors = 20, n_pcs = 15, resolution = 1, plot_spatial=False)

Neighbors: 20
PCA n: 15
resolution: 1


In [None]:
# adata.write_h5ad(results_folder + sample_name+'_adata_final.h5ad')   # overwrite

In [24]:
# find n nearest neighbours, with max threshold

n = 20 # da dare in input il numero di neighbours considerati
max_dist = 50 # dare in input max distance (non so in che ordine sia)
coordinates = adata.obs[["array_row", "array_col"]]
neighbors = NearestNeighbors(n_neighbors=n+1, metric='euclidean')
neighbors.fit(coordinates)
distances, indices = neighbors.kneighbors(coordinates)


# Exclude the first column (which is the distance to the point itself)
distances = distances[:, 1:]
indices = indices[:, 1:]

# For each cell, keep only neighbor indices where distance < max_dist
filtered_neighbors = []
for dist_row, idx_row in zip(distances, indices):
    filtered = [idx for d, idx in zip(dist_row, idx_row) if d < max_dist]
    filtered_neighbors.append(filtered)

neighbors_df = pd.DataFrame(distances, columns=[f'distance_{i+1}' for i in range(n)])
neighbors_df['neighbor_indices'] = filtered_neighbors
neighbors_df['n_neighbors_within_max_dist'] = neighbors_df['neighbor_indices'].apply(len)

adata.obs['n_neighbors_within_max_dist'] = neighbors_df['n_neighbors_within_max_dist']



In [25]:
# CELLTYPES distribution among nerighbours

neighbor_cell_types_counts = []
cell_types = adata.obs.cell_types.unique()
col_index = adata.obs.columns.get_loc('cell_types')

for i in range(neighbors_df.shape[0]):
    idx = np.array(neighbors_df.loc[i, 'neighbor_indices'])
    neighbor_cell_types = adata.obs.iloc[idx, col_index]
    type_counts = neighbor_cell_types.value_counts()

    count_dict = {cell_type: int(type_counts.get(cell_type, 0)) for cell_type in cell_types}
    neighbor_cell_types_counts.append(count_dict)

neighbor_cell_types_df = pd.DataFrame(neighbor_cell_types_counts, index=adata.obs.index).T # cell types x cells
neighbor_cell_types_df = neighbor_cell_types_df.div(neighbor_cell_types_df.sum(axis=0), axis=1)

In [26]:
# KMEANS ON FRACTION OF CELLTYPES IN NEIGHBOURS

k = 20  # number of niches, to be given in input

X = neighbor_cell_types_df.T  # shape: cells x cell types
kmeans = KMeans(n_clusters=k, random_state=42)


In [27]:
X

Unnamed: 0,leukocyte,fibroblast of cardiac tissue,basal cell of epidermis,hepatocyte,fibroblast of lung,type B pancreatic cell,bladder cell,mesenchymal stem cell of adipose tissue,mesenchymal stem cell,smooth muscle cell,...,plasma cell,pancreatic acinar cell,immature B cell,hematopoietic precursor cell,enterocyte of epithelium of large intestine,"CD8-positive, alpha-beta T cell",professional antigen presenting cell,proerythroblast,monocyte,precursor B cell
46,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
235,0.0,0.0,0.5,0.0,0.0,0.0,0.000000,0.0,0.0,0.500000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
361,,,,,,,,,,,...,,,,,,,,,,
701,,,,,,,,,,,...,,,,,,,,,,
815,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272956,,,,,,,,,,,...,,,,,,,,,,
273253,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
274001,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.5,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278145,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
if isinstance(X, pd.DataFrame):
    print(f"NaNs present in DataFrame: {X.isnull().values.any()}")
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    # Fit the imputer on the data and transform it
    # # The imputer learns the mean from the non-NaN values in each column
    # # Then it replaces NaNs with the learned means
    X_imputed = imputer.fit_transform(X)
    cluster_labels = kmeans.fit_predict(X_imputed)
else:
    cluster_labels = kmeans.fit_predict(X)



NaNs present in DataFrame: True


In [30]:
adata.obs['neighbour_niches'] = cluster_labels.astype(str)
adata.write_h5ad("../../../../data/Mouse_Embryo/Mouse_Embryo_adata_final_processed.h5ad")

... storing 'neighbour_niches' as categorical


In [31]:
adata.obs

Unnamed: 0,object_id,bin_count,array_row,array_col,counts_per_nucleus,features_per_nucleus,bin_count_log,counts_per_nucleus_log,features_per_nucleus_log,n_genes_by_counts,...,zero_mt,cell_nucleus,counts_per_cell,features_per_cell,counts_per_cell_log,features_per_cell_log,cell_types,leiden,n_neighbors_within_max_dist,neighbour_niches
46,46,5,2833.200000,941.400000,434.676922,189.0,0.698970,2.638167,2.276462,189,...,False,nucleus,,,,,leukocyte,0,,8
235,235,3,2780.666667,1232.333333,233.735638,159.0,0.477121,2.368725,2.201397,159,...,False,nucleus,,,,,fibroblast of cardiac tissue,1,,17
361,361,17,2820.176471,1265.529412,1054.955591,526.0,1.230449,3.023234,2.720986,526,...,False,nucleus,,,,,basal cell of epidermis,3,,4
701,701,2,2905.500000,1245.500000,66.012372,42.0,0.301030,1.819625,1.623249,42,...,False,nucleus,,,,,hepatocyte,4,,4
815,815,7,2806.000000,1714.000000,201.396358,120.0,0.845098,2.304052,2.079181,120,...,False,nucleus,,,,,leukocyte,2,,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272956,272956,23,205.347826,1021.434783,,,1.361728,,,756,...,False,cell,586.095408,756.0,2.767968,2.878522,bladder cell,8,,4
273253,273253,13,214.538462,873.384615,,,1.113943,,,311,...,False,cell,240.166373,311.0,2.380512,2.492760,bladder cell,1,,9
274001,274001,5,262.400000,954.800000,,,0.698970,,,340,...,False,cell,269.129569,340.0,2.429961,2.531479,endothelial cell of coronary artery,7,,3
278145,278145,7,318.714286,1685.428571,,,0.845098,,,270,...,False,cell,397.126425,270.0,2.598929,2.431364,thymocyte,7,,10
