In [None]:
# Export PYTHONHASHSEED=0 into the kernel's environment (presumably for reproducibility).
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here does
# not change string hashing in the already-running kernel; it is only inherited by
# processes started afterwards. Export it before launching Jupyter to affect this kernel.
%env PYTHONHASHSEED=0

In [None]:
import numpy as np
import pandas as pd
import os

import scanpy as sc
from anndata import AnnData, concat
from matplotlib.transforms import Bbox

# from scvi.model.utils import mde


from matplotlib import pyplot as plt
import seaborn as sns

from scipy import sparse
import gc
# from upsetplot import plot as UpSet
from itertools import product
import networkx as nx

from scipy.cluster import hierarchy

import re
import subprocess

import h5py

import itertools

# from scnym.api import scnym_api

# import pymn

from collections import Counter

In [None]:
# Let wide tables show up to 500 columns when displayed.
pd.options.display.max_columns = 500

In [None]:
# Directory holding the input/output .h5ad and .loom files used throughout this notebook.
# The default is the original location; set SA_BM_SCFILES_DIR to run elsewhere.
# os.path.join(..., "") guarantees the trailing separator, because later cells build
# paths with plain string concatenation (out_dir + "file.h5ad").
out_dir = os.path.join(
    os.environ.get("SA_BM_SCFILES_DIR", "/home/anna/ClusterProjects/SA_bone_marrow/scfiles/"),
    "",
)

# HSC population

### Reading the unannotated h5ad file

In [None]:
# Load the HSC subset that still lacks manual annotation.
hsc_input_path = out_dir + "hvg_HSC_X_scVI_based_200924.h5ad"
hvg_subset_HSC_annot = sc.read_h5ad(hsc_input_path)

In [None]:
# Inspect the .obsm entries stored on the loaded HSC object.
hvg_subset_HSC_annot.obsm

In [None]:
# UMAP of the HSC subset, one panel each for Leiden cluster, batch and the scNym_HPC column.
sc.set_figure_params(figsize=(8, 6))
hsc_overview_columns = ['leiden', 'batch', "scNym_HPC"]
sc.pl.embedding(
    hvg_subset_HSC_annot,
    "umap",
    color=hsc_overview_columns,
    ncols=1,
    frameon=False,
)

In [None]:
# Manually chosen cell-type label for each Leiden cluster of the HSC subset.
manual_cell_mapping = {
    "0": "GMP CMP",
    "1": "Proliferating GMP",
    "2": "CLP",
    "3": "Erythroid megakaryocyte progenitor",
    "4": "Granulocyte progenitor",
    "5": "B cell progenitor",
}

# Series.map with a dict leaves any cluster id that is not a key above as NaN.
hsc_leiden_clusters = hvg_subset_HSC_annot.obs["leiden"]
hvg_subset_HSC_annot.obs["manual_cell_mapping"] = hsc_leiden_clusters.map(manual_cell_mapping)

In [None]:
# Save the annotated HSC object in both loom and h5ad formats.
hsc_loom_path = out_dir + "hvg_HSC_X_scVI_based_annotated_220924.loom"
hsc_h5ad_path = out_dir + "hvg_HSC_X_scVI_based_annotated_220924.h5ad"
hvg_subset_HSC_annot.write_loom(hsc_loom_path)
hvg_subset_HSC_annot.write_h5ad(hsc_h5ad_path)

In [None]:
# Reload the annotated HSC object from disk, so later cells can start from the saved file.
hsc_annotated_path = out_dir + "hvg_HSC_X_scVI_based_annotated_220924.h5ad"
hvg_subset_HSC_annot = sc.read_h5ad(hsc_annotated_path)

In [None]:
# UMAP of the HSC subset coloured by batch and by the manual annotation; the figure is also saved.
sc.set_figure_params(figsize=(8, 6))
hsc_annotation_columns = ['batch', "manual_cell_mapping"]
sc.pl.embedding(
    hvg_subset_HSC_annot,
    "umap",
    color=hsc_annotation_columns,
    ncols=1,
    frameon=False,
    save="HSC_x_scvi_manual_annotation_22092024.pdf",
)

In [None]:
import os

In [None]:
import pandas as pd

# Build {sheet name -> list of signature genes} from the HSC annotation workbook.
# For every sheet, keep the values of the "name" column for rows with pvalue <= 0.05.
file_path = "/home/anna/ClusterProjects/SA_bone_marrow/annotated_clusters/Signature genes _ HSC cluster annotation .xlsx"

HSC_signature_genes = {}

# Both columns are needed: "name" supplies the genes, "pvalue" drives the filter.
required_columns = {"name", "pvalue"}

# The context manager closes the workbook handle once all sheets have been read.
with pd.ExcelFile(file_path) as excel_data:
    for sheet_name in excel_data.sheet_names:
        df = pd.read_excel(excel_data, sheet_name=sheet_name)

        # Skip (and report) sheets that lack a required column instead of raising
        # a KeyError half-way through the workbook.
        missing_columns = required_columns - set(df.columns)
        if missing_columns:
            print(f"Skipping sheet {sheet_name!r}: missing column(s) {sorted(missing_columns)}")
            continue

        filtered_df = df[df['pvalue'] <= 0.05]
        HSC_signature_genes[sheet_name] = filtered_df['name'].tolist()

# Print the resulting dictionary
print(HSC_signature_genes)

In [None]:
# Scale each cell to a total of 1e6 counts, then log1p-transform X.
# NOTE: both calls modify X in place, so re-running this cell transforms the data again.
sc.pp.normalize_total(hvg_subset_HSC_annot, target_sum=1e6)
sc.pp.log1p(hvg_subset_HSC_annot)

# Per-gene z-scores across cells, stored in the 'z_scores' layer.
# X is densified first because np.std does not work on scipy sparse matrices.
hsc_expr = hvg_subset_HSC_annot.X
hsc_expr = hsc_expr.toarray() if sparse.issparse(hsc_expr) else np.asarray(hsc_expr)
hsc_gene_mean = hsc_expr.mean(axis=0)
hsc_gene_std = hsc_expr.std(axis=0)
# Genes with zero variance would give 0/0 = NaN; dividing by 1 leaves their z-score at 0.
hsc_gene_std[hsc_gene_std == 0] = 1.0
hvg_subset_HSC_annot.layers['z_scores'] = (hsc_expr - hsc_gene_mean) / hsc_gene_std

In [None]:
# sc.set_figure_params(figsize = (8,6))
# for key in HSC_signature_genes.keys():
#     print (key)
#     sc.pl.embedding(
#         hvg_subset_HSC_annot,
#         basis="umap",
#         layer='z_scores',
#         color=HSC_signature_genes[key],
#         frameon=False,
#         # ncols=1,
#         save = "HSC_X_scvi_cluster_signature_genes_" + key + ".pdf"
#     )

# Mature population

In [None]:
# Load the mature-cell subset that still lacks manual annotation.
mature_input_path = out_dir + "hvg_mature_X_scVI_based_200924.h5ad"
hvg_subset_mature_annot = sc.read_h5ad(mature_input_path)

In [None]:
# UMAP of the mature subset, one panel each for Leiden cluster, batch and the scNym_condensed column.
sc.set_figure_params(figsize=(8, 6))
mature_overview_columns = ['leiden', 'batch', "scNym_condensed"]
sc.pl.embedding(
    hvg_subset_mature_annot,
    "umap",
    color=mature_overview_columns,
    ncols=1,
    frameon=False,
)

In [None]:
# Manually chosen cell-type label for each Leiden cluster of the mature subset.
# Several clusters (2, 8 and 14) share the label "Neutrophil".
manual_cell_mapping = {
    "0": "Early granulocyte",
    "1": "Dendritic cell",
    "2": "Neutrophil",
    "3": "Granulocyte",
    "4": "Activated DC and macrophage",
    "5": "MC basophil",
    "6": "Cytotoxic T and NK cells",
    "7": "Pro B cell",
    "8": "Neutrophil",
    "9": "B cell",
    "10": "Activated monocyte",
    "11": "Th2 CD4+ T cell",
    "12": "Activated macrophage",
    "13": "Erythroblast",
    "14": "Neutrophil",
    "15": "Other",
}

# Series.map with a dict leaves any cluster id that is not a key above as NaN.
mature_leiden_clusters = hvg_subset_mature_annot.obs["leiden"]
hvg_subset_mature_annot.obs["manual_cell_mapping"] = mature_leiden_clusters.map(manual_cell_mapping)

In [None]:
# Save the annotated mature-cell object in both loom and h5ad formats.
mature_loom_path = out_dir + "hvg_mature_X_scVI_based_annotated_220924.loom"
mature_h5ad_path = out_dir + "hvg_mature_X_scVI_based_annotated_220924.h5ad"
hvg_subset_mature_annot.write_loom(mature_loom_path)
hvg_subset_mature_annot.write_h5ad(mature_h5ad_path)

In [None]:
# Reload the annotated mature-cell object from disk, so later cells can start from the saved file.
mature_annotated_path = out_dir + "hvg_mature_X_scVI_based_annotated_220924.h5ad"
hvg_subset_mature_annot = sc.read_h5ad(mature_annotated_path)

In [None]:
# UMAP of the mature subset coloured by batch and by the manual annotation; the figure is also saved.
sc.set_figure_params(figsize=(8, 6))
mature_annotation_columns = ['batch', "manual_cell_mapping"]
sc.pl.embedding(
    hvg_subset_mature_annot,
    "umap",
    color=mature_annotation_columns,
    ncols=1,
    frameon=False,
    save="mature_x_scvi_manual_annotation_22092024.pdf",
)

In [None]:
# Build {sheet name -> list of signature genes} from the mature-cell annotation workbook.
# For every sheet, keep the values of the "name" column for rows with pvalue <= 0.05.
file_path = "/home/anna/ClusterProjects/SA_bone_marrow/annotated_clusters/Signature genes _ mature cluster annotation.xlsx"

mature_signature_genes = {}

# Both columns are needed: "name" supplies the genes, "pvalue" drives the filter.
required_columns = {"name", "pvalue"}

# The context manager closes the workbook handle once all sheets have been read.
with pd.ExcelFile(file_path) as excel_data:
    for sheet_name in excel_data.sheet_names:
        df = pd.read_excel(excel_data, sheet_name=sheet_name)

        # Skip (and report) sheets that lack a required column instead of raising
        # a KeyError half-way through the workbook.
        missing_columns = required_columns - set(df.columns)
        if missing_columns:
            print(f"Skipping sheet {sheet_name!r}: missing column(s) {sorted(missing_columns)}")
            continue

        filtered_df = df[df['pvalue'] <= 0.05]
        mature_signature_genes[sheet_name] = filtered_df['name'].tolist()

# Print the resulting dictionary
print(mature_signature_genes)

In [None]:
# Scale each cell to a total of 1e6 counts, then log1p-transform X.
# NOTE: both calls modify X in place, so re-running this cell transforms the data again.
sc.pp.normalize_total(hvg_subset_mature_annot, target_sum=1e6)
sc.pp.log1p(hvg_subset_mature_annot)

# Per-gene z-scores across cells, stored in the 'z_scores' layer.
# X is densified first because np.std does not work on scipy sparse matrices.
mature_expr = hvg_subset_mature_annot.X
mature_expr = mature_expr.toarray() if sparse.issparse(mature_expr) else np.asarray(mature_expr)
mature_gene_mean = mature_expr.mean(axis=0)
mature_gene_std = mature_expr.std(axis=0)
# Genes with zero variance would give 0/0 = NaN; dividing by 1 leaves their z-score at 0.
mature_gene_std[mature_gene_std == 0] = 1.0
hvg_subset_mature_annot.layers['z_scores'] = (mature_expr - mature_gene_mean) / mature_gene_std

In [None]:
# sc.set_figure_params(figsize = (8,6))
# for key in mature_signature_genes.keys():
#     print (key)
#     sc.pl.embedding(
#         hvg_subset_mature_annot,
#         basis="umap",
#         layer='z_scores',
#         color=mature_signature_genes[key],
#         frameon=False,
#         # ncols=1,
#         save = "mature_X_scvi_cluster_signature_genes_" + key + ".pdf"
#     )

In [None]:
# Flatten the per-sheet gene lists into one list (sheet order preserved, duplicates kept).
# chain.from_iterable is linear, whereas sum(lists, []) re-copies the accumulated list
# for every sheet.
combined_mature_signature_genes = list(
    itertools.chain.from_iterable(mature_signature_genes.values())
)
len(combined_mature_signature_genes)

In [None]:
# Heatmap of signature-gene z-scores grouped by manual annotation.
# Plot only the genes that exist in var_names, so one missing gene does not abort the
# whole figure; skipped genes are reported.
mature_known_genes = set(hvg_subset_mature_annot.var_names)
mature_heatmap_genes = [g for g in combined_mature_signature_genes if g in mature_known_genes]
mature_skipped_genes = [g for g in combined_mature_signature_genes if g not in mature_known_genes]
if mature_skipped_genes:
    print(f"{len(mature_skipped_genes)} signature gene(s) not found in var_names and skipped: {mature_skipped_genes}")

sc.pl.heatmap(
    hvg_subset_mature_annot,
    mature_heatmap_genes,
    groupby='manual_cell_mapping',
    layer='z_scores',
    save="mature_X_scvi_cluster_signature_genes_heatmap.pdf",
)

### Plotting all the cells

In [None]:
# import os
# os.chdir("python")

In [None]:
# Build `hvg_subset`: the cells of the integrated HVG object that come from the SA
# studies, with the scNym labels from the combined dataset attached.
# NOTE: these paths are relative to the notebook's working directory (unlike `out_dir`).
combined_datasets = sc.read_h5ad(filename = "../data/h5ad/TM_combined_datasets_SA_ann.h5ad")
# SECURITY: unpickling can execute arbitrary code; only load pickles from a trusted source.
obs_plk = pd.read_pickle("../data/h5ad/TM_combined_datasets_SA_ann_obs.plk")
combined_datasets.obs = obs_plk

hvg = sc.read_h5ad("../data/h5ad/hvg_integrated_170824.h5ad")

# Keep only the SA studies. Copy rather than keep a view: a view holds a reference to the
# full object, so `del combined_datasets` below would not release its memory.
is_sa_study = combined_datasets.obs["study_id"].isin(['SA_pilot', 'SA_final'])
SA_exps = combined_datasets[is_sa_study].copy()

del combined_datasets
gc.collect()

hvg_cells = hvg.obs_names
SA_exp_names = SA_exps.obs_names

# SA_exps names are matched to hvg.obs_names after dropping their last two characters
# (presumably a per-dataset suffix added when the datasets were combined - TODO confirm).
SUFFIX_LEN = 2
SA_hvg_cells = [x for x in SA_exp_names if x[:-SUFFIX_LEN] in hvg_cells]

# Keep only cells whose stripped name is unique, so the renaming below cannot collide.
SA_cellname_counts = Counter(x[:-SUFFIX_LEN] for x in SA_hvg_cells)
SA_unique_names = [x for x in SA_hvg_cells if SA_cellname_counts[x[:-SUFFIX_LEN]] == 1]

# Explicit copies: obs_names / obs are modified below, which on a view would trigger an
# implicit copy and a warning.
SA_exps_subset = SA_exps[SA_unique_names].copy()
SA_exps_subset.obs_names = [x[:-SUFFIX_LEN] for x in SA_exps_subset.obs_names]

hvg_subset = hvg[SA_exps_subset.obs_names].copy()

# Both objects now share the same obs_names, so the columns align by index.
hvg_subset.obs["scNym_condensed"] = SA_exps_subset.obs["scNym_condensed"]
hvg_subset.obs["scNym_condensed_confidence"] = SA_exps_subset.obs["scNym_condensed_confidence"]

In [None]:
# Stack the manual labels of the HSC and mature subsets into one Series indexed by cell name.
annotation_parts = [
    hvg_subset_HSC_annot.obs["manual_cell_mapping"],
    hvg_subset_mature_annot.obs["manual_cell_mapping"],
]
manual_annotation = pd.concat(annotation_parts)

In [None]:
# Attach the manual label of every cell in hvg_subset (looked up by cell name).
all_cell_ids = hvg_subset.obs_names
hvg_subset.obs["manual_annotation"] = manual_annotation.loc[all_cell_ids]

In [None]:
# Neighbour graph on the X_scVI representation, followed by UMAP and Leiden clustering.
# NOTE(review): when use_rep is given, n_pcs appears to restrict the representation to its
# first 30 columns - confirm this is intended for X_scVI.
neighbor_settings = dict(n_neighbors=10, n_pcs=30, use_rep="X_scVI", random_state=514)
sc.pp.neighbors(hvg_subset, **neighbor_settings)
sc.tl.umap(hvg_subset, random_state=423)
sc.tl.leiden(hvg_subset, resolution=0.3, n_iterations=-1)

In [None]:
# List the distinct manual labels; the colour palette defined below has one entry per label.
manual_annotation.unique()

In [None]:
# Fixed colour (hex) for every manual cell-type label, used as the palette in the UMAP plots.
color_mapping = dict([
    ('Pro B cell', '#023fa5'),
    ('B cell', '#7d87b9'),
    ('MC basophil', '#bec1d4'),
    ('CLP', '#d6bcc0'),
    ('GMP CMP', '#bb7784'),
    ('Dendritic cell', '#8e063b'),
    ('B cell progenitor', '#4a6fe3'),
    ('Granulocyte', '#8595e1'),
    ('Granulocyte progenitor', '#b5bbe3'),
    ('Erythroblast', '#e6afb9'),
    ('Proliferating GMP', '#e07b91'),
    ('Activated DC and macrophage', '#d33f6a'),
    ('Erythroid megakaryocyte progenitor', '#11c638'),
    ('Activated macrophage', '#8dd593'),
    ('Activated monocyte', '#c6dec7'),
    ('Th2 CD4+ T cell', '#ead3c6'),
    ('Early granulocyte', '#f0b98d'),
    ('Neutrophil', '#ff9999'),
    ('Other', '#000000'),
    ('Cytotoxic T and NK cells', '#0fcfc0'),
])

In [None]:
# UMAP of all cells coloured by manual annotation, with the labels drawn on the data.
sc.set_figure_params(figsize=(8, 6))
sc.pl.embedding(
    hvg_subset,
    "umap",
    color=['manual_annotation'],
    palette=color_mapping,
    size=60,
    legend_loc='on data',
    ncols=1,
    frameon=False,
    save="all_celltypes_X_scVI_reannotated_ondata.pdf",
)

In [None]:
# Same UMAP as the 'on data' version, but with the default legend placement.
sc.set_figure_params(figsize=(8, 6))
sc.pl.embedding(
    hvg_subset,
    "umap",
    color=['manual_annotation'],
    palette=color_mapping,
    size=60,
    ncols=1,
    frameon=False,
    save="all_celltypes_X_scVI_reannotated_legend_right.pdf",
)

In [None]:
# Inspect the per-cell metadata table; later cells read its 'batch' and 'sample' columns.
hvg_subset.obs

In [None]:
# UMAP of all cells coloured by batch.
sc.set_figure_params(figsize=(8, 6))
sc.pl.embedding(
    hvg_subset,
    "umap",
    color=['batch'],
    size=60,
    ncols=1,
    frameon=False,
    save="all_celltypes_X_scVI_batch_legend_right.pdf",
)

In [None]:
# Derive the condition from the sample name: names containing 'SA' are SA, all others PBS.
hvg_subset.obs["condition"] = [
    'SA' if 'SA' in sample_name else 'PBS'
    for sample_name in hvg_subset.obs['sample']
]

In [None]:
# UMAP of all cells coloured by condition (grey for PBS, orange for SA).
sc.set_figure_params(figsize=(8, 6))
condition_palette = {"PBS": "#BEBEBE", "SA": "#fcb13b"}
sc.pl.embedding(
    hvg_subset,
    "umap",
    color=["condition"],
    palette=condition_palette,
    size=60,
    ncols=1,
    frameon=False,
    save="all_celltypes_X_scVI_condition_legend_right.pdf",
)

In [None]:
# UMAP of all cells coloured by sample, with a fixed colour per HTO sample.
sc.set_figure_params(figsize=(8, 6))
sample_palette = {
    "HTO-PBS1": '#377eb8',
    "HTO-PBS2": '#ff7f00',
    "HTO-PBS3": '#4daf4a',
    "HTO-PBS4": '#f781bf',
    "HTO-SA1": '#a65628',
    "HTO-SA2": '#984ea3',
    "HTO-SA3": '#999999',
    "HTO-SA4": '#dede00',
}
sc.pl.embedding(
    hvg_subset,
    "umap",
    color=["sample"],
    palette=sample_palette,
    size=30,
    ncols=1,
    frameon=False,
    save="all_celltypes_X_scVI_sample_legend_right.pdf",
)

### KI67 expression across clusters

In [None]:
# Per-gene z-scores of X across all cells, stored in the 'z_scores' layer.
# X is densified first because np.std does not work on scipy sparse matrices.
all_expr = hvg_subset.X
all_expr = all_expr.toarray() if sparse.issparse(all_expr) else np.asarray(all_expr)
all_gene_mean = all_expr.mean(axis=0)
all_gene_std = all_expr.std(axis=0)
# Genes with zero variance would give 0/0 = NaN; dividing by 1 leaves their z-score at 0.
all_gene_std[all_gene_std == 0] = 1.0
hvg_subset.layers['z_scores'] = (all_expr - all_gene_mean) / all_gene_std

In [None]:
# Violin plot of Mki67 per manual annotation, using the values stored in .raw.
sc.pl.violin(
    hvg_subset,
    "Mki67",
    groupby="manual_annotation",
    use_raw=True,
    rotation=90,
    save="KI67_violin_all_cells.pdf",
)

In [None]:
# List the labels present in hvg_subset; the subset selection below must use these exact strings.
list(hvg_subset.obs["manual_annotation"].unique())

In [None]:
# Labels to keep; the strings must match values of obs["manual_annotation"] exactly.
cell_types_of_interest = [
    'Early granulocyte',
    'Proliferating GMP',
    "GMP CMP",
    "Neutrophil",
    "Granulocyte progenitor",
    "Activated monocyte",
    "Granulocyte",
]
is_cell_type_of_interest = hvg_subset.obs["manual_annotation"].isin(cell_types_of_interest)
hvg_subset_granulo_cells = hvg_subset[is_cell_type_of_interest]

In [None]:
# Violin plot of Mki67 per manual annotation for the selected cell types, using .raw values.
sc.pl.violin(
    hvg_subset_granulo_cells,
    "Mki67",
    groupby="manual_annotation",
    use_raw=True,
    rotation=90,
    save="KI67_violin_granulo_cells.pdf",
)

# Cell cycle info

In [None]:
# Load the object that carries the per-cell cell-cycle phase (path relative to the working directory).
cell_cycle_path = "../data/h5ad/hvg_integrated_170824_cell_cycle_phases.h5ad"
cell_cycle_h5ad = sc.read_h5ad(cell_cycle_path)

In [None]:
# Inspect the cell names; the next cell indexes this object with hvg_subset.obs_names.
cell_cycle_h5ad.obs_names

In [None]:
# Copy the cell-cycle phase onto hvg_subset, looked up by cell name.
phase_by_cell = cell_cycle_h5ad.obs.loc[hvg_subset.obs_names, "cell_cycle_phase"]
hvg_subset.obs["cell_cycle_phase"] = phase_by_cell

In [None]:
# Fixed colour (hex) per cell-cycle phase label, used as the palette in the plot below.
cell_cycle_colors = dict([
    ("G1-S", "#7d87b9"),
    ("G2-M", "#e07b91"),
    ("M", "#0fcfc0"),
    ("M-G1", "#c6dec7"),
    ("S", "#d33f6a"),
])

In [None]:
# UMAP of all cells coloured by cell-cycle phase.
sc.set_figure_params(figsize=(8, 6))
sc.pl.embedding(
    hvg_subset,
    "umap",
    color=['cell_cycle_phase'],
    palette=cell_cycle_colors,
    size=60,
    ncols=1,
    frameon=False,
    save="all_celltypes_X_scVI_cell_cycle.pdf",
)