## Loading the packages

In [None]:
import scvi
import numpy as np
import pandas as pd
import os

import scanpy as sc
from anndata import AnnData, concat
from matplotlib.transforms import Bbox

# from scvi.model.utils import mde


from matplotlib import pyplot as plt
from matplotlib import colors
from matplotlib import cm
import seaborn as sns

from scipy import sparse
import gc
from upsetplot import plot as UpSet
from itertools import product
import networkx as nx

from scipy.cluster import hierarchy

import re
import subprocess

import h5py

import itertools

from scnym.api import scnym_api

import pymn
import random 
from collections import Counter

In [None]:
# import tensorflow as tf
# import tensorboard as tb
# tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

pd.set_option('display.max_columns', 500)

In [None]:
def read_preprocess_data(input_h5):
    """Load a 10x Genomics HDF5 matrix and run a standard scanpy pipeline on it.

    Steps, in order: make gene names unique, normalise each cell to a total
    of 1e6, log1p-transform, flag the 3000 most variable genes, then compute
    PCA, the neighbour graph and a UMAP embedding.

    Parameters
    ----------
    input_h5
        Path to a file readable by ``sc.read_10x_h5``.

    Returns
    -------
    The processed AnnData object.
    """
    adata = sc.read_10x_h5(input_h5)
    adata.var_names_make_unique()

    # Normalisation and gene selection.
    sc.pp.normalize_total(adata, target_sum=1e6)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=3000)

    # Dimensionality reduction and embedding.
    sc.pp.pca(adata)
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)
    return adata

def full_extent(ax, pad=0.0):
    """Return the bounding box of an axes together with its tick labels and title.

    The figure is drawn first because text artists have no defined window
    extent before rendering. Axis labels are not included in the union.

    Parameters
    ----------
    ax
        The matplotlib axes to measure.
    pad
        Fractional enlargement; the box is expanded by ``1.0 + pad`` in both
        directions.

    Returns
    -------
    The union ``Bbox`` of all collected artists, expanded by the padding.
    """
    ax.figure.canvas.draw()
    artists = [*ax.get_xticklabels(), *ax.get_yticklabels(), ax, ax.title]
    extents = [artist.get_window_extent() for artist in artists]
    return Bbox.union(extents).expanded(1.0 + pad, 1.0 + pad)

def set_max_score(rec_arr, max_score=20):
    """Return a copy of ``rec_arr`` with every value above ``max_score`` capped.

    The input is never modified. The cap is applied with a boolean mask, so
    arrays of any dimensionality are handled (the previous nested loop only
    worked for 2-D input) and the dtype of the copy is preserved.

    Parameters
    ----------
    rec_arr
        Numeric array or rectangular nested sequence.
    max_score
        Upper bound; values strictly greater than it are replaced by it.

    Returns
    -------
    A new ``numpy.ndarray`` with the same shape as ``rec_arr``.
    """
    rec_mod = np.copy(rec_arr)
    # NaN compares False against the cap and is therefore left untouched,
    # exactly as in the element-wise version.
    rec_mod[rec_mod > max_score] = max_score
    return rec_mod


In [None]:
def _excel_sheet_name(label, used_names, max_len=31):
    """Turn ``label`` into a valid Excel sheet name that is not in ``used_names``."""
    # Excel forbids [ ] : * ? / \ in sheet names and limits them to 31 characters.
    base = re.sub(r"[\[\]:*?/\\]", "_", str(label))[:max_len] or "sheet"
    candidate = base
    counter = 1
    # Sheet names are compared case-insensitively by Excel.
    while candidate.lower() in used_names:
        suffix = f"_{counter}"
        candidate = base[: max_len - len(suffix)] + suffix
        counter += 1
    used_names.add(candidate.lower())
    return candidate


def write_rank_genes_groups_to_excel(adata, excel_path, groups = None):
    """Write the ``rank_genes_groups`` result of ``adata`` to an Excel workbook.

    One sheet is written per group, containing the table returned by
    ``sc.get.rank_genes_groups_df`` for that group.

    Parameters
    ----------
    adata
        AnnData object on which ``sc.tl.rank_genes_groups`` has been run.
    excel_path
        Destination ``.xlsx`` path.
    groups
        Iterable of group names to export. Defaults to every group stored in
        ``adata.uns['rank_genes_groups']['names']``.

    Notes
    -----
    Sheet names are sanitised: forbidden characters are replaced by ``_``,
    names are truncated to 31 characters, and a numeric suffix is added when
    truncation makes two names collide. Names that are already valid are
    written unchanged.
    """
    if groups is None:
        groups = adata.uns['rank_genes_groups']['names'].dtype.names

    used_sheet_names = set()
    # The context manager saves and closes the workbook even when a sheet
    # fails to write; ExcelWriter.save() no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(excel_path, engine = 'xlsxwriter') as writer:
        for groupid in groups:
            group_df = sc.get.rank_genes_groups_df(adata, group=groupid)
            sheet_name = _excel_sheet_name(groupid, used_sheet_names)
            group_df.to_excel(writer, sheet_name = sheet_name)


In [None]:
# Locations of the two 10x filtered count matrices, resolved relative to the
# current working directory.
bm_dir = f"{os.getcwd()}/../SA_data"
bmSA_pilot_h5 = f"{bm_dir}/MR_30_10X_SA_PBS_BM_22112021_transcriptome/filtered_feature_bc_matrix.h5"
bmSA_final_h5 = f"{bm_dir}/MR_33_10X_SA_PBS_BM_transcriptome/filtered_feature_bc_matrix.h5"


In [None]:
# bmSA_pilot = read_preprocess_data(bmSA_pilot_h5)
# bmSA_final = read_preprocess_data(bmSA_final_h5)

In [None]:
# Study identifier -> input file. The two SA entries are full paths to 10x
# .h5 files; the remaining entries are loom file names.
dataset_dict = {
    'SA_pilot': bmSA_pilot_h5,
    'SA_final': bmSA_final_h5,
    'Mouse_HSC': 'mouse_hsc_labeled.loom',
    'Erythroid_monocyte': 'erythroid_and_monocyte_lineage_adata_no_gaps.loom',
    'annotated': 'processed_droplet_data_no_OBSM.loom',
}

# Parallel lists kept for code that refers to names and files separately.
dataset_names = list(dataset_dict)
dataset_fns = list(dataset_dict.values())


In [None]:
raw_file_dir = "/users/anna.hakobyan/projects/bone_marrow_sc/mouse_bm_data/input_data/"

# The list must exist before the loop appends to it. Previously this line was
# commented out, so every append raised NameError, which the bare `except:`
# then reported as "<dataset> Not Found".
datasets = []
for dataset, dataset_fn in dataset_dict.items():
    print(dataset)
    try:
        if dataset.startswith("SA"):
            # SA entries are 10x .h5 matrices and go through the full
            # preprocessing pipeline.
            adata = read_preprocess_data(dataset_fn)
        else:
            # Every other entry is a loom file inside raw_file_dir.
            adata = sc.read_loom(raw_file_dir + dataset_fn)
            # adata.var_names = adata.var['var_names']
    except OSError as err:
        # Only missing or unreadable files are skipped (best effort); any
        # other error is a real bug and is allowed to propagate.
        print(dataset + ' Not Found')
        print(err)
        continue

    adata.obs['study_id'] = dataset
    datasets.append(adata)

In [None]:
# Merge every loaded dataset into one AnnData, using the last list entry as the base object.
# NOTE(review): later cells strip the last two characters of each obs name (x[:-2]); presumably
# that removes the "-<batch index>" suffix added by AnnData.concatenate — confirm.
combined_datasets = datasets[-1].concatenate(datasets[:-1])

In [None]:
# Give cells without an annotation an explicit 'Unlabeled' class.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column obtained via attribute access is chained assignment, which pandas
# does not guarantee to propagate to the parent frame (it is a no-op under
# Copy-on-Write).
cell_classes = combined_datasets.obs['cell_ontology_class']
if isinstance(cell_classes.dtype, pd.CategoricalDtype) and 'Unlabeled' not in cell_classes.cat.categories:
    # A categorical column rejects fill values that are not declared categories.
    cell_classes = cell_classes.cat.add_categories('Unlabeled')
combined_datasets.obs['cell_ontology_class'] = cell_classes.fillna('Unlabeled')

# Drop cells whose label is the literal string 'nan'. The copy turns the
# filtered view into a standalone AnnData so that later obs assignments do not
# trigger an implicit view-to-copy conversion.
combined_datasets = combined_datasets[combined_datasets.obs['cell_ontology_class'] != 'nan'].copy()

In [None]:
combined_datasets.obs["cell_ontology_class"].value_counts()

In [None]:
# Lookup table from the original cell_ontology_class labels to a condensed label set:
# the B-cell stages collapse to "B cell", promonocyte to "monocyte", proerythroblast to
# "erythroblast", granulocytopoietic cell to "granulocyte", and immature natural killer
# cell to "NK cell". All other listed labels map to themselves.
cell_mapping = {"Unlabeled" : "Unlabeled",
                "granulocyte": "granulocyte",
                "hematopoietic precursor cell" : "hematopoietic precursor cell",
                "monocyte" : "monocyte",
                "naive B cell" : "B cell",
                "late pro-B cell" : "B cell",
                "pro-B cell" : "B cell",
                "immature B cell" : "B cell",
                "macrophage" : "macrophage",
                "proerythroblast" : "erythroblast",
                "T cell" : "T cell",
                "erythroblast" : "erythroblast",
                "basophil" : "basophil",
                "immature natural killer cell" : "NK cell",
                "promonocyte" : "monocyte",
                "early pro-B cell" : "B cell",
                "Fraction A pre-pro B cell" : "B cell",
                "granulocytopoietic cell" : "granulocyte",
                "B cell" : "B cell",
                "Slamf1-positive multipotent progenitor cell" : "Slamf1-positive multipotent progenitor cell"}

In [None]:
# Translate each cell's ontology class into its condensed label via cell_mapping.
# NOTE(review): Series.map produces NaN for any class that has no key in cell_mapping —
# confirm that every class present in the data is covered.
combined_datasets.obs["cell_ontology_condensed"] = combined_datasets.obs["cell_ontology_class"].map(cell_mapping)

In [None]:
%%time
# Train an scNym model on the condensed labels; outputs go to ./scnym_output_condensed.
# 'study_id' is passed as the domain grouping variable.
# NOTE(review): presumably cells labelled 'Unlabeled' serve as the unlabeled target data —
# confirm against the scNym documentation.
scnym_api(adata=combined_datasets,
          task='train',
          groupby='cell_ontology_condensed',
          out_path='./scnym_output_condensed',
          config={
              'domain_groupby': 'study_id',
              'dan_use_conf_pseudolabels': False,
              'pseudolabel_min_confidence':.9
          })

In [None]:
%%time
# Predict labels for all cells with the model stored in ./scnym_output_condensed.
# Results are added under the 'scNym_condensed' key; later cells also read a
# 'scNym_condensed_confidence' column from obs.
scnym_api(
    adata=combined_datasets,
    task='predict',
    key_added='scNym_condensed',
    trained_model='./scnym_output_condensed',
    out_path='./scnym_output_condensed',
    config='new_identity_discovery'
)


### Writing the annotations

In [None]:
# Save the complete obs table as a pickle, then write the AnnData with only two obs columns.
# NOTE(review): the second line replaces combined_datasets.obs in memory as well, so every
# other obs column is missing from the live object until the pickle is loaded back.
combined_datasets.obs.to_pickle("TM_combined_datasets_SA_ann_obs.plk")
combined_datasets.obs = combined_datasets.obs[["study_id", "cell_ontology_class"]]
combined_datasets.write(filename = "TM_combined_datasets_SA_ann.h5ad")

### Reading the annotations

In [None]:
# Reload the AnnData from the h5ad file and replace its reduced obs table with the
# complete one stored in the pickle.
# NOTE(review): assumes the pickle and the h5ad were written from the same object so that
# their row order and index agree — confirm.
combined_datasets = sc.read_h5ad(filename = "TM_combined_datasets_SA_ann.h5ad")
obs_plk = pd.read_pickle("TM_combined_datasets_SA_ann_obs.plk")
combined_datasets.obs = obs_plk

In [None]:
# combined_datasets.X[:4, :4].todense()

In [None]:
# del(combined_datasets)

In [None]:
combined_datasets.obs.boxplot(column = ["scNym_condensed_confidence"], by = ["scNym_condensed"])
plt.xticks(rotation=90)

In [None]:
combined_datasets.obs["scNym_condensed"].value_counts()

### Comparing with available annotations

In [None]:
# Load the integrated object and keep only the cells of the two SA experiments
# from the combined, scNym-annotated data.
hvg = sc.read("../output/hvg_integrated.h5ad")
SA_exps = combined_datasets[combined_datasets.obs["study_id"].isin(['SA_pilot', 'SA_final'])]

In [None]:
hvg.shape

In [None]:
hvg_cells = hvg.obs_names
SA_exp_names = SA_exps.obs_names

# Keep the SA cells whose name, without its last two characters, also exists in
# the integrated object.
SA_hvg_cells = [x for x in SA_exp_names if x[:-2] in hvg_cells]

# Drop names that become ambiguous once the two-character suffix is removed.
SA_cellname_counts = Counter([x[:-2] for x in SA_hvg_cells])
SA_unique_names = [x for x in SA_hvg_cells if SA_cellname_counts[x[:-2]] == 1]

# Subsetting an AnnData returns a view. Both subsets are modified right below
# (obs_names / obs columns), so take explicit copies instead of relying on the
# implicit view-to-copy conversion that AnnData performs with a warning.
SA_exps_subset = SA_exps[SA_unique_names].copy()

SA_exps_subset.obs_names = [x[:-2] for x in SA_exps_subset.obs_names]

hvg_subset = hvg[SA_exps_subset.obs_names].copy()

# Both objects now share the same obs names, so the assignments align by index.
hvg_subset.obs["scNym_condensed"] = SA_exps_subset.obs["scNym_condensed"]
hvg_subset.obs["scNym_condensed_confidence"] = SA_exps_subset.obs["scNym_condensed_confidence"]

In [None]:
SA_exps_subset.obs_names

In [None]:
sc.pl.umap(SA_exps_subset,
    color= ['scNym_condensed', 'study_id']
          )

In [None]:
SA_exps.obs['scNym_condensed'].value_counts()

In [None]:
SA_exps.obs.boxplot(column = ["scNym_condensed_confidence"], by = ["scNym_condensed"])
plt.xticks(rotation=90)

In [None]:
hsc_confidence = SA_exps.obs[SA_exps.obs["scNym_condensed"] == "hematopoietic precursor cell"]["scNym_condensed_confidence"]

In [None]:
len(hsc_confidence)

In [None]:
# Derive the condition from the HTO classification string by dropping its first four
# characters and its last character.
# NOTE(review): assumes every HTO_classification value wraps the condition name in a
# fixed-width prefix and a one-character suffix (later cells compare against "PBS" and
# "SA") — confirm.
hvg_subset.obs["condition"] = [x[4:-1] for x in hvg_subset.obs["HTO_classification"]]

In [None]:
hvg_subset.obs["scNym_condensed"]

In [None]:
hvg_subset.obs.boxplot(column = ["scNym_condensed_confidence"], by = ["scNym_condensed"])
plt.xticks(rotation=90)

In [None]:
sc.tl.umap(hvg_subset)

In [None]:
os.getcwd()

In [None]:
sc.pl.embedding(
    hvg_subset,
    basis="umap",
    color=['scNym_condensed', 'condition'],
    frameon=False,
    ncols=1,
    save = "scNym_umap.pdf"
)

In [None]:
sc.pl.embedding(
    hvg_subset,
    basis="umap",
    color=['batch'],
    frameon=False,
    ncols=1,
    save = "batch_all_cells_integrated.pdf"
)

In [None]:
sc.pl.embedding(
    hvg_subset,
    basis="umap",
    color=['scNym_condensed', 'condition'],
    frameon=False,
    ncols=1,
    legend_loc='on data',
    save = "scNym_umap_labeled.pdf"
)

In [None]:
sc.pl.embedding(
    hvg_subset,
    basis="X_mde",
    color=['scNym_condensed', 'condition'],
    frameon=False,
    ncols=1,
    save = "scNym_mde.pdf"
)

In [None]:
sc.pl.embedding(
    hvg_subset,
    basis="X_mde",
    color=['scNym_condensed', 'condition'],
    frameon=False,
    ncols=1,
    legend_loc='on data',
    save = "scNym_mde_labeled.pdf"
)


In [None]:
# Two stacked panels of the X_mde embedding, one per condition, coloured by the
# scNym condensed label.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(7, 10))
panels = []
for ax, condition in ((ax1, "PBS"), (ax2, "SA")):
    panels.append(
        sc.pl.embedding(
            hvg_subset[hvg_subset.obs["condition"] == condition],
            basis="X_mde",
            title=condition,
            color='scNym_condensed',
            size=20,
            frameon=False,
            ncols=1,
            show=False,
            ax=ax,
        )
    )
# Kept under their original names for any code that refers to them.
p1, p2 = panels

# extent = full_extent(ax2).transformed(fig.dpi_scale_trans.inverted())
# Figure.savefig does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/facet_SA_mde_scNym.pdf", bbox_inches="tight")

In [None]:
# Two stacked panels of the UMAP embedding, one per condition, coloured by the
# scNym condensed label.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(7, 10))
panels = []
for ax, condition in ((ax1, "PBS"), (ax2, "SA")):
    panels.append(
        sc.pl.embedding(
            hvg_subset[hvg_subset.obs["condition"] == condition],
            basis="umap",
            title=condition,
            color='scNym_condensed',
            size=20,
            frameon=False,
            ncols=1,
            show=False,
            ax=ax,
        )
    )
# Kept under their original names for any code that refers to them.
p1, p2 = panels

# extent = full_extent(ax2).transformed(fig.dpi_scale_trans.inverted())
# Figure.savefig does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/facet_SA_umap_scNym.pdf", bbox_inches="tight")

### Rank genes groups

In [None]:
# sc.pp.normalize_total(hvg_subset, target_sum=1e6)
# Log-transform hvg_subset in place; the per-cell normalisation above is left disabled.
# NOTE(review): whether hvg_subset.X is already normalised or log-transformed depends on how
# hvg_integrated.h5ad was produced — confirm before relying on the DEG results below.
sc.pp.log1p(hvg_subset)

In [None]:
hvg_subset.X.toarray().max(axis = 0).max()

In [None]:
len(hvg_subset)

In [None]:
hvg_subset.obs['scNym_condensed'].value_counts()

In [None]:
sc.tl.rank_genes_groups(hvg_subset, 'scNym_condensed', method='wilcoxon', use_raw = False)

In [None]:
sc.pl.rank_genes_groups_heatmap(hvg_subset, n_genes = 5, use_raw = False, save = "scNym_annotation_all_classes_no_genenames.pdf")

In [None]:
sc.pl.rank_genes_groups_heatmap(hvg_subset, n_genes = 5, use_raw = False, show_gene_labels=True,save = "scNym_annotation_all_classes_with_genenames.pdf")

In [None]:
# hvg_subset.uns['rank_genes_groups']

In [None]:
# to visualize the results
sc.pl.rank_genes_groups(hvg_subset, ncol = 3, save = "scNym_all_celltype_diff_exps_genes.pdf")

### Checking the UMAP for batches with log transform

In [None]:
sc.tl.pca(hvg_subset)
sc.pp.neighbors(hvg_subset)
sc.tl.umap(hvg_subset, random_state=1)

In [None]:
# sc.set_figure_params(figsize = (5,4))
sc.pl.umap(hvg_subset, color=[ 'scNym_condensed', 'batch' ], size = 30
           # ,save = "all_HSC_umap.pdf"
          )

### Writing DEG genes to an excel file

In [None]:
sc.tl.rank_genes_groups(hvg_subset, 'scNym_condensed', method='wilcoxon', use_raw = False)

In [None]:
write_rank_genes_groups_to_excel(adata = hvg_subset, excel_path = "all_BM_cells_scNym_DEGs.xlsx")

### Annotation of hematopoietic precursor cells based on FACS data

In [None]:
SA_HPC = SA_exps[ SA_exps.obs["scNym_condensed"] == "hematopoietic precursor cell" ]

In [None]:
SA_HPC.obs_names

In [None]:
SA_HPC.shape

In [None]:
# Select the reference dataset by its study_id instead of by list position:
# datasets[2] is only 'Mouse_HSC' when every earlier dataset loaded successfully.
mouse_hsc = next(
    (d for d in datasets if (d.obs["study_id"] == "Mouse_HSC").any()),
    None,
)
if mouse_hsc is None:
    raise LookupError("Mouse_HSC dataset was not loaded; cannot build combined_HSC")

# Merge the reference with the SA cells predicted as hematopoietic precursors.
combined_HSC = mouse_hsc.concatenate(SA_HPC)

In [None]:
# Give cells without a FACS label an explicit 'Unlabeled' value.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column obtained via attribute access is chained assignment, which pandas
# does not guarantee to propagate to the parent frame (it is a no-op under
# Copy-on-Write).
facs_labels = combined_HSC.obs['FACS_labels']
if isinstance(facs_labels.dtype, pd.CategoricalDtype) and 'Unlabeled' not in facs_labels.cat.categories:
    # A categorical column rejects fill values that are not declared categories.
    facs_labels = facs_labels.cat.add_categories('Unlabeled')
combined_HSC.obs['FACS_labels'] = facs_labels.fillna('Unlabeled')

# Drop cells whose label is the literal string 'nan'. The copy turns the
# filtered view into a standalone AnnData so that later modifications do not
# trigger an implicit view-to-copy conversion.
combined_HSC = combined_HSC[combined_HSC.obs['FACS_labels'] != 'nan'].copy()

In [None]:
combined_HSC.obs.FACS_labels

In [None]:
os.getcwd()

In [None]:
%%time
# Train an scNym model on the FACS labels; outputs go to ./scnym_output_HPC.
# 'study_id' is passed as the domain grouping variable.
# NOTE(review): presumably cells labelled 'Unlabeled' serve as the unlabeled target data —
# confirm against the scNym documentation.
scnym_api(adata=combined_HSC,
          task='train',
          groupby='FACS_labels',
          out_path='./scnym_output_HPC',
          config={
              'domain_groupby': 'study_id',
              'dan_use_conf_pseudolabels': False,
              'pseudolabel_min_confidence':.9
          })

In [None]:
%%time
# Predict labels for all cells with the model stored in ./scnym_output_HPC.
# Results are added under the 'scNym_HPC' key, which a later cell reads from obs.
scnym_api(
    adata=combined_HSC,
    task='predict',
    key_added='scNym_HPC',
    trained_model='./scnym_output_HPC',
    out_path='./scnym_output_HPC',
    config='new_identity_discovery'
)

### Writing combined_HSC

In [None]:
# Save the complete obs table as a pickle, then write the AnnData (h5ad and loom) with only
# two obs columns.
# NOTE(review): the second line replaces combined_HSC.obs in memory as well, so both files —
# and the live object — lack every other obs column (e.g. the scNym_HPC predictions) until
# the pickle is loaded back.
combined_HSC.obs.to_pickle("TM_combined_HSC_SA_obs.plk")
combined_HSC.obs = combined_HSC.obs[["study_id", "cell_ontology_class"]]
combined_HSC.write(filename = "TM_combined_HSC_SA.h5ad")
combined_HSC.write_loom(filename = "TM_combined_HSC_SA.loom")


### Reading combined_HSC

In [None]:
# Reload combined_HSC from the h5ad file and restore its complete obs table
# from the pickle.
combined_HSC = sc.read_h5ad(filename = "TM_combined_HSC_SA.h5ad")
obs_plk = pd.read_pickle("TM_combined_HSC_SA_obs.plk")
combined_HSC.obs = obs_plk
# This cell previously ended by re-reading TM_combined_HSC_SA.loom into
# combined_HSC. That file was written with only the 'study_id' and
# 'cell_ontology_class' obs columns, so the re-read discarded the obs table
# restored above, including the scNym_HPC column that later cells require.


### Writing scNym annotated loom file for HSCs

In [None]:
combined_HSC.write_loom(filename = "TM_combined_HSC_SA_ann.loom", write_obsm_varm=True)

### Reading scNym annotated loom file for HSCs

In [None]:
os.getcwd()

In [None]:
combined_HSC = sc.read_loom(filename = "TM_combined_HSC_SA_ann.loom")

In [None]:
annotated_SA_HSC = combined_HSC[ combined_HSC.obs["study_id"].isin(["SA_final", "SA_pilot"])]

In [None]:
annotated_SA_HSC.obs_names

In [None]:
hvg_subset.obs_names

In [None]:
# Count how often each name occurs once its last four characters are removed,
# and keep only the cells whose shortened name is unique.
ann_counts = Counter([x[:-4] for x in annotated_SA_HSC.obs_names])
unique_names = [x for x in annotated_SA_HSC.obs_names if ann_counts[x[:-4]] == 1]

# Subsetting returns a view; copy it before renaming the cells so the
# assignment does not rely on AnnData's implicit view-to-copy conversion.
annotated_SA_HSC = annotated_SA_HSC[unique_names].copy()
annotated_SA_HSC.obs_names = [x[:-4] for x in annotated_SA_HSC.obs_names]

In [None]:
annotated_SA_HSC.shape

In [None]:
common_cells = annotated_SA_HSC.obs_names.intersection(hvg_subset.obs_names)

In [None]:
len(set(common_cells))

In [None]:
hvg_subset_HSC = hvg_subset[common_cells]

In [None]:
annotated_SA_HSC_hvg = annotated_SA_HSC[common_cells]

In [None]:
hvg_subset_HSC.obs["HSC_annotation"] = annotated_SA_HSC_hvg.obs["scNym_HPC"]

In [None]:
hvg_subset_HSC.obs["HSC_annotation"].value_counts()

In [None]:
# Two stacked panels of the X_mde embedding, one per condition, coloured by the
# HSC annotation.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(7, 10))
panels = []
for ax, condition in ((ax1, "PBS"), (ax2, "SA")):
    panels.append(
        sc.pl.embedding(
            hvg_subset_HSC[hvg_subset_HSC.obs["condition"] == condition],
            basis="X_mde",
            title=condition,
            color='HSC_annotation',
            size=20,
            frameon=False,
            ncols=1,
            show=False,
            ax=ax,
        )
    )
# Kept under their original names for any code that refers to them.
p1, p2 = panels

# extent = full_extent(ax2).transformed(fig.dpi_scale_trans.inverted())
# Figure.savefig does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/facet_SA_mde_HSC_FACS_annotation.pdf", bbox_inches="tight")

### Writing hvg_subset_HSC

In [None]:
# Save the complete obs table as a pickle, then write the AnnData with only the
# HSC_annotation column.
# NOTE(review): the second line replaces hvg_subset_HSC.obs in memory as well, so columns such
# as 'condition' and 'batch' are missing from the live object until the pickle is loaded back.
hvg_subset_HSC.obs.to_pickle("hvg_lineage_annotation.plk")
hvg_subset_HSC.obs = hvg_subset_HSC.obs[["HSC_annotation"]]
hvg_subset_HSC.write(filename = "hvg_lineage_annotation.h5ad")

### Reading hvg_subset_HSC

In [None]:
# Reload the AnnData from the h5ad file and replace its reduced obs table with the
# complete one stored in the pickle.
# NOTE(review): assumes the pickle and the h5ad were written from the same object so that
# their row order and index agree — confirm.
hvg_subset_HSC = sc.read_h5ad(filename = "hvg_lineage_annotation.h5ad")
obs_plk = pd.read_pickle("hvg_lineage_annotation.plk")
hvg_subset_HSC.obs = obs_plk

In [None]:
hvg_subset_HSC.raw


In [None]:
# Recompute PCA, the neighbour graph, a UMAP embedding and Leiden clusters for the HSC subset.
# NOTE(review): random.seed only seeds Python's built-in `random` module; the scanpy calls
# below take their own random_state arguments — confirm whether this seed has any effect on
# the results.
random.seed(1342465)
sc.tl.pca(hvg_subset_HSC)
sc.pp.neighbors(hvg_subset_HSC)
sc.tl.umap(hvg_subset_HSC, random_state=10)
sc.tl.leiden(hvg_subset_HSC, resolution=0.48, n_iterations=100)

In [None]:
sc.set_figure_params(figsize = (5,4))
sc.pl.umap(hvg_subset_HSC, color=['HSC_annotation', 'leiden'], size = 30
           # ,save = "all_HSC_umap.pdf"
          )

In [None]:
sc.set_figure_params(figsize = (5,4))
sc.pl.umap(hvg_subset_HSC, color='batch', size = 30
           # ,save = "all_HSC_umap.pdf"
          )

In [None]:
# Normalise each cell to a total of 1e6 and log1p-transform, in place.
# NOTE(review): hvg_subset_HSC is a subset of hvg_subset, on which sc.pp.log1p has already
# been called, and the same two calls are repeated further down on this object — confirm
# that the repeated transformation is intended.
sc.pp.normalize_total(hvg_subset_HSC, target_sum=1e6)
sc.pp.log1p(hvg_subset_HSC)

In [None]:
sc.tl.pca(hvg_subset_HSC)
sc.pp.neighbors(hvg_subset_HSC)
sc.tl.umap(hvg_subset_HSC, random_state=1)


In [None]:
sc.set_figure_params(figsize = (5,4))
sc.pl.umap(hvg_subset_HSC, color='HSC_annotation', size = 30
           # ,save = "all_HSC_umap.pdf"
          )

### Heatmap of differentially expressed genes

In [None]:
hvg_subset_HSC = hvg_subset_HSC[hvg_subset_HSC.obs['HSC_annotation'] != "Unknown"]

In [None]:
hvg_subset_HSC.obs["HSC_annotation"].value_counts()

In [None]:
# sc.tl.pca(hvg_subset_HSC, svd_solver='arpack')
# sc.pp.neighbors(hvg_subset_HSC, n_neighbors=10, n_pcs=40)
# sc.tl.umap(hvg_subset_HSC)

In [None]:
# sc.tl.leiden(hvg_subset_HSC)

In [None]:
# sc.tl.rank_genes_groups(hvg_subset_HSC, 'leiden', method='wilcoxon', use_raw = False)

In [None]:
# Normalise and log-transform only if that has not happened yet. The same two
# calls already run on hvg_subset_HSC earlier in the notebook; repeating them
# would normalise and log-transform data that is already on a log scale before
# the differential-expression test. sc.pp.log1p records its run in
# uns["log1p"], which serves as the marker here.
if "log1p" in hvg_subset_HSC.uns:
    print("hvg_subset_HSC is already log-transformed; skipping normalize_total/log1p")
else:
    sc.pp.normalize_total(hvg_subset_HSC, target_sum=1e6)
    sc.pp.log1p(hvg_subset_HSC)

In [None]:
sc.tl.rank_genes_groups(hvg_subset_HSC, 'HSC_annotation', method='wilcoxon', use_raw = False)

In [None]:
sc.pl.rank_genes_groups_heatmap(hvg_subset_HSC, use_raw = False, n_genes = 5, save = 'HSC_top5_genes_heatmap.pdf')

In [None]:
sc.pl.rank_genes_groups_heatmap(hvg_subset_HSC, use_raw = False, save = 'HSC_genes_heatmap.pdf')

### Writing DEG genes to an excel file

In [None]:
write_rank_genes_groups_to_excel(adata = hvg_subset_HSC, excel_path = "HSC_DEGs_scnym.xlsx")