In [1]:
import scanpy as sc
import scanpy.external as sce
#sc.logging.print_versions()
#sc.logging.print_memory_usage()
#sc.settings.verbosity = 2
import os,sys
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import matplotlib.ticker as mticker

In [2]:
# Make the local utility_functions/ folder importable by appending its
# absolute path to sys.path.
import sys
sys.path.append(os.path.abspath("utility_functions/"))

# NOTE(review): wildcard import — which names it brings into the notebook
# namespace cannot be seen from here; check rz_import_statements.
from rz_import_statements import *
import rz_functions as rz
import rz_fig_params # imported for its side effect: adjusts mpl.rcParams; import after scanpy so these settings overwrite scanpy's rcParams
import rz_utility_spring as srz

python version: 3.8.18


## Load data

In [3]:
# Load the newest obs table (doublets and RBCs already removed).
filename = 'backups/obs_info_clean_good_15372x30_231009_12h06.npz'
encoding = 'latin1'

# NOTE: allow_pickle=True unpickles objects stored in the archive; only use it
# on backups you created yourself.
with np.load(filename, encoding=encoding, allow_pickle=True) as npz_file:
    # every array stored in the archive becomes a DataFrame constructor keyword
    frame_kwargs = {name: npz_file[name] for name in npz_file.files}
    obs = pd.DataFrame(**frame_kwargs)

In [56]:
# Locations of the intermediates saved when preparing the SPRING plot.
# The SPRING data root can be overridden with the SPRING_DATA_DIR environment
# variable; when it is unset, the original machine-specific location is used.
path1 = os.environ.get(
    'SPRING_DATA_DIR',
    "/Users/karolisgoda/Library/CloudStorage/GoogleDrive-karolisgoda@gmail.com/My Drive/MF/data/SPRING_dev-master/data/",
)
# os.path.join inserts the separator when the root has no trailing slash.
# The trailing '/' on project_dir is kept on purpose: other cells build paths
# by plain concatenation (project_dir+plot_name+...).
project_dir = os.path.join(path1, 'CD34_good/clean_good_sc_hvg/')
plot_name = 'palantir_cell_cycle'


#params = rz.load_stuff(project_dir+plot_name+'/params.pickle')
#params.keys()

In [57]:
# The UMAP and graph data are stored in this AnnData (.h5ad) backup.
adatag = sc.read_h5ad('backups/clean_good_sc_hvg_15372x3324_231129_15h00.h5ad')


In [58]:
# Bare expression: displays the AnnData summary (obs/var/uns/obsm/varm/layers/obsp keys).
adatag

AnnData object with n_obs × n_vars = 15372 × 3324
    obs: 'barcode', 'library', 'total_counts', 'pct_counts_mito', 'conditions', 'conditions2', 'preservation', 'method', 'doublet_score', 'potential_doublet', 'top3pct_dbtl_score', 'top5pct_dbtl_score', 'top10pct_dbtl_score', 'no_dblt', 'n_counts', 'cell_typist_pred_low', 'leiden_res_0.8', 'cell_typist_pred_high', 'cell_typist_pred_low_no_voting', 'cell_type', 'S_score', 'G2M_score', 'phase', 'palantir_pseudotime', 'palantir_entropy'
    var: 'n_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'DM_EigenValues', 'cell_type_colors', 'conditions2_colors', 'draw_graph', 'hvg', 'log1p', 'method_colors', 'neighbors', 'palantir_waypoints', 'pca', 'preservation_colors', 'umap'
    obsm: 'DM_EigenVectors', 'DM_EigenVectors_multiscaled', 'X_draw_graph_fa', 'X_pca', 'X_pca_harmony', 'X_umap', 'palantir_fate_probabilities'
    varm: 'PCs'
    layers: 'MAGIC_imputed_data'
    obsp: 'DM_Kernel'

In [None]:
# Replace X with the 'X_lin_cptt' layer.
# NOTE(review): this cell has no execution count, and the AnnData summary
# displayed for adatag lists only 'MAGIC_imputed_data' under layers — confirm
# that 'X_lin_cptt' exists in the loaded file; otherwise this lookup raises KeyError.
adatag.X = adatag.layers['X_lin_cptt']

In [59]:
# NOTE(review): the AnnData summary displayed for adatag lists only 'DM_Kernel'
# under obsp — confirm that 'connectivities' is present in the loaded file.
G  = adatag.obsp['connectivities'] # graph later passed to srz.spec_clust

In [6]:
# Row sums of X (one value per observation).
# NOTE(review): the saved output contains negative sums, so X presumably held
# scaled values when this cell ran — confirm which matrix X is expected to be here.
adatag.X.sum(axis=1)

array([ 112.40536,  280.8118 ,  512.42334, ..., -458.6493 , -341.89734,
       -386.73096], dtype=float32)

# Leiden

In [60]:
# Leiden clustering at several resolutions. Results are collected in `cg`,
# mapping a grouping name ('leiden_res_<resolution>') to the list of labels.
cg = {}

# Cluster on a copy so adatag.obs is left untouched. One copy is enough:
# each sc.tl.leiden call just overwrites mock.obs['leiden'], so copying the
# whole AnnData again for every resolution only cost extra time and memory.
mock = adatag.copy()
for i in [0.2, 0.4, 0.6, 0.8, 1]:
    sc.tl.leiden(mock, resolution=i)
    labels = list(mock.obs['leiden'])
    cg['leiden_res_%.1f' % i] = labels
    # number of clusters found at this resolution
    print(len(set(labels)))

8
8
10
11
13


# Phenograph clustering

In [16]:
# Clustering using PhenoGraph (Leiden backend) at several resolutions.
# A fixed seed is passed so that re-running the cell gives the same partition;
# without it the Leiden optimisation inside PhenoGraph is randomly seeded.
for i in [0.1, 0.2, 0.3, 0.4]:
    sc.external.tl.phenograph(
        adatag,
        clustering_algo='leiden',
        k=15,
        jaccard=True,
        primary_metric='euclidean',
        resolution_parameter=i,
        seed=0,
    )
    # the labels are read back from adatag.obs['pheno_leiden'], which is
    # overwritten on every pass, so they are copied into `cg` right away
    labels = list(adatag.obs['pheno_leiden'])
    cg['pheno_leiden_res_%.1f' % i] = labels
    # number of clusters found at this resolution
    print(len(set(labels)))

Finding 15 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8490519523620605 seconds
Jaccard graph constructed in 2.0189030170440674 seconds
Running Leiden optimization
Leiden completed in 0.7507390975952148 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 4.803226947784424 seconds
11
Finding 15 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.680574893951416 seconds
Jaccard graph constructed in 1.5680768489837646 seconds
Running Leiden optimization
Leiden completed in 0.9712340831756592 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 4.274828910827637 seconds
14
Finding 15 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6810421943664551 seconds
Jaccard graph constructed in 1.5630178451538086 seconds
Running Leiden optimization
Leiden completed in 0.9655230045318604 seconds
Sorting communities by size, please wait ...
P

# Spectral clustering (choose number of clusters)

In [61]:
# Spectral clustering on graph G for 9 to 13 clusters (much slower than the
# clustering above).
for n_clusters in range(9, 14):
    key = 'sp_cl_%d' % n_clusters
    print(key)  # printed before the call starts
    spectral_labels = srz.spec_clust(G, n_clusters).astype(str)
    cg[key] = list(spectral_labels)
    print(key)  # printed again once the call has finished

sp_cl_9
sp_cl_9
sp_cl_10
sp_cl_10
sp_cl_11
sp_cl_11
sp_cl_12
sp_cl_12
sp_cl_13
sp_cl_13


# Append result to the same SPRING plot

In [62]:
# Read the current categorical colour tracks of the SPRING plot.
coloring_json = project_dir + plot_name + '/categorical_coloring_data.json'
cg0 = srz.read_cell_groupings(coloring_json)

# Colour dictionary of dictionaries: track name -> that track's 'label_colors'.
cdd = {}
for track_name, track in cg0.items():
    cdd[track_name] = track['label_colors']


In [63]:
# Convert every label in every grouping to a string.
cg_as_str = {}
for grouping_name, grouping_labels in cg.items():
    cg_as_str[grouping_name] = list(np.array(grouping_labels).astype(str))
cg = cg_as_str

In [64]:
# Bare expression: displays the full path of the categorical colouring JSON file.
project_dir+plot_name+'/categorical_coloring_data.json'

'/Users/karolisgoda/Library/CloudStorage/GoogleDrive-karolisgoda@gmail.com/My Drive/MF/data/SPRING_dev-master/data/CD34_good/clean_good_sc_hvg/palantir_cell_cycle/categorical_coloring_data.json'

In [65]:
# Append the groupings in `cg` as categorical colour tracks of the SPRING plot
# folder, passing the existing colour dictionaries through `colordd`.
# NOTE(review): presumably this rewrites categorical_coloring_data.json in that
# folder — confirm in srz.append_cell_groupings before re-running.
srz.append_cell_groupings(project_dir+plot_name,cg,colordd=cdd)

In [67]:
# Store each grouping as a column of adatag.obs (an existing column with the
# same name is overwritten).
for column_name, column_labels in cg.items():
    adatag.obs[column_name] = column_labels

In [68]:
# obs1 refers to the same DataFrame object as adatag.obs (no copy is made).
obs1 = adatag.obs

In [70]:
# Save obs together with the cluster columns. The file name records the
# table dimensions and the value returned by rz.now().
n_rows, n_cols = obs1.shape
fname = 'backups/obs_info_sc_hvg_cell_cycle_cluster_%dx%d_%s' % (n_rows, n_cols, rz.now())
print(fname)
rz.save_df(obs1, fname)

backups/obs_info_sc_hvg_cell_cycle_cluster_15372x34_231129_15h28
