In [2]:
# Print the current date/time via the shell `date` command.
!date

Tue Dec  1 12:16:34 PST 2020


# Make 10x gene matrix and perform cluster comparison

In [3]:
import anndata
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as mpatches
import scanpy as scanp
from scipy.stats import ks_2samp, ttest_ind
from scipy import stats
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from matplotlib import cm

import sys
# Put the project-local `trackfig` checkout on the import path so the two imports below resolve.
# NOTE(review): absolute, user-specific path — this cell will fail on another machine; confirm before sharing.
sys.path.append('/home/sina/projects/mop/BYVSTZP_2020/trackfig')
from trackfig.utils import get_notebook_name
from trackfig.trackfig import trackfig 

# trackfig settings: a text-file path and this notebook's name.
# Presumably consumed by `trackfig(...)` when figures are saved; neither is used in the cells shown here.
TRACKFIG = "/home/sina/projects/mop/BYVSTZP_2020/trackfig.txt"
NB = get_notebook_name()

import warnings
# Blanket filter: every warning raised after this point is suppressed.
warnings.filterwarnings('ignore')

# Font size applied to matplotlib's global rcParams below.
fsize=20

plt.rcParams.update({'font.size': fsize})
# Ask the inline backend to render figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format = 'retina'

In [4]:
# Colour lookup keyed by cell-type label (presumably the plotting palette;
# it is not used in the cells shown here).
# Each value is an RGB tuple with components in [0, 1] (hex equivalent noted
# in the trailing comment), except "L5/6 NP", which is given as a hex string.
cluster_cmap = {
    "Astro":       (0.38823529411764707, 0.4745098039215686, 0.2235294117647059),    # 637939
    "Endo":        (0.5490196078431373, 0.6352941176470588, 0.3215686274509804),     # 8ca252
    "SMC":         (0.7098039215686275, 0.8117647058823529, 0.4196078431372549),     # b5cf6b
    "VLMC":        (0.807843137254902, 0.8588235294117647, 0.611764705882353),       # cedb9c
    "Low Quality": (0, 0, 0),
    "L2/3 IT":     (0.9921568627450981, 0.6823529411764706, 0.4196078431372549),     # fdae6b
    "L5 PT":       (0.9921568627450981, 0.8156862745098039, 0.6352941176470588),     # fdd0a2
    "L5 IT":       (0.5176470588235295, 0.23529411764705882, 0.2235294117647059),    # 843c39
    "L5/6 NP":     "#D43F3A",
    "L6 CT":       (0.8392156862745098, 0.3803921568627451, 0.4196078431372549),     # d6616b
    "L6 IT":       (0.9058823529411765, 0.5882352941176471, 0.611764705882353),      # e7969c
    "L6b":         (1.0, 0.4980392156862745, 0.054901960784313725),                  # ff7f0e
    "L6 IT Car3":  (1.0, 0.7333333333333333, 0.47058823529411764),                   # ffbb78
    "Lamp5":       (0.19215686274509805, 0.5098039215686274, 0.7411764705882353),    # 3182bd  (blues)
    "Sncg":        (0.4196078431372549, 0.6823529411764706, 0.8392156862745098),     # 6baed6
    "Vip":         (0.6196078431372549, 0.792156862745098, 0.8823529411764706),      # 9ecae1
    "Sst":         (0.7764705882352941, 0.8588235294117647, 0.9372549019607843),     # c6dbef
    "Pvalb":       (0.7372549019607844, 0.7411764705882353, 0.8627450980392157),     # bcbddc
}

In [None]:
# Locate the filtered-count AnnData file of every `gene_out*` run directory
# under the 10xv3 data folder (glob order is arbitrary).
path = "../../data/10xv3"
h5ad_pattern = path + "/gene_out*/counts_filtered/adata.h5ad"
file_paths = glob.glob(h5ad_pattern)

In [None]:
# Put the input files in lexicographic order so the load order is deterministic.
file_paths = sorted(file_paths)

In [None]:
# Display the sorted list of input files.
file_paths

In [None]:
# One batch label per input file, passed as `batch_categories` when the
# files are concatenated.
# NOTE(review): the order presumably has to match the sorted `file_paths`
# one-to-one — confirm against the directory names.
batch_id = [
    "3L8TX_181211_01_A01",
    "4L8TX_181211_01_B01",
    "5L8TX_181211_01_C01",
    "6L8TX_181211_01_D01",
    "7L8TX_181211_01_E01",
    "8L8TX_181211_01_F01",
    "1L8TX_181211_01_G12",
    "2L8TX_181211_01_H12",
    "9L8TX_190430_01_A08",
    "10L8TX_190430_01_B08",
    "11L8TX_190430_01_F08",
    "12L8TX_190430_01_G08",
]

In [None]:
%%time
files = []
for fidx, f in enumerate(file_paths):
    print("{} of {}".format(fidx+1, len(file_paths)))
    files.append(anndata.read_h5ad(f))

In [None]:
# Stack all loaded batches into one AnnData, labelling each input with the
# corresponding entry of `batch_id`.
first_batch, remaining_batches = files[0], files[1:]
adata = first_batch.concatenate(*remaining_batches, batch_categories=batch_id)

In [None]:
# Display the summary of the concatenated AnnData.
adata

In [None]:
# Peek at the first rows of the per-cell metadata.
adata.obs.head()

In [None]:
# Reference cluster tables, both indexed by their first CSV column:
#   ca — cluster annotation (the later cells read its cluster_label,
#        subclass_label and class_label columns)
#   cm — cluster membership (the later cells read its "x" column)
# NOTE(review): `cm` shadows the `from matplotlib import cm` import at the
# top of the notebook; the colormap module is unreachable under that name
# after this cell runs.
annotation_csv = "../../reference/10xv3_cluster_labels/cluster.annotation.csv"
membership_csv = "../../reference/10xv3_cluster_labels/cluster.membership.csv"

ca = pd.read_csv(annotation_csv, index_col=0)
cm = pd.read_csv(membership_csv, index_col=0)

In [None]:
# Keep only the cells whose index appears in the cluster-membership table.
# Boolean indexing returns an AnnData *view*; the following cells add columns
# to `adata.obs`, so take an explicit copy here instead of relying on
# anndata's implicit copy-on-write (whose warning is silenced by the blanket
# `warnings.filterwarnings('ignore')` in the setup cell).
adata = adata[adata.obs.index.isin(cm.index)].copy()

In [None]:
# Look up each cell's cluster id in column "x" of the membership table,
# keyed by the cell's obs index.
membership_lookup = cm["x"]
adata.obs["cluster_id"] = adata.obs.index.map(membership_lookup)

In [None]:
# Translate each cell's cluster id into its cluster / subclass / class label
# using the matching column of the annotation table.
for annotation_column in ("cluster_label", "subclass_label", "class_label"):
    adata.obs[annotation_column] = adata.obs["cluster_id"].map(ca[annotation_column])

In [None]:
# Integer-encode the three label columns with a fresh LabelEncoder each.
# Note: this overwrites the "cluster_id" column that was filled from the
# membership table, so the label-mapping cell above must not be re-run
# after this one.
for level in ("cluster", "subclass", "class"):
    le = LabelEncoder()
    adata.obs[level + "_id"] = le.fit_transform(adata.obs[level + "_label"].values)

In [None]:
# Number of distinct cluster labels among the retained cells.
adata.obs.cluster_label.nunique()

In [None]:
# Display the AnnData summary after the label columns were added.
adata

In [None]:
# Build per-gene identifier columns:
#   gene_id         — first 18 characters of the current var index
#   gene_short_name — the incoming gene_name column
#   gene_name       — "<gene_short_name>_<gene_id>"
# NOTE(review): the 18-character slice presumably keeps an Ensembl gene ID
# and drops a version suffix — confirm.
# NOTE(review): not idempotent — re-running this cell appends the id to
# gene_name a second time.
var_index_prefix = adata.var.index.str.slice(0, 18).values
adata.var["gene_id"] = var_index_prefix
adata.var["gene_short_name"] = adata.var["gene_name"]

short_names = adata.var["gene_short_name"].astype(str)
gene_ids = adata.var["gene_id"].astype(str)
adata.var["gene_name"] = short_names + "_" + gene_ids

In [None]:
# Use the combined "<short name>_<gene id>" strings as the var index.
new_var_index = adata.var["gene_name"].values
adata.var.index = new_var_index

In [None]:
def change(x):
    """Map the label "L5 ET" to "L5 PT"; return any other value unchanged.

    Only an exact match is rewritten.
    """
    return "L5 PT" if x == "L5 ET" else x

In [None]:
# Cells per subclass label before the "L5 ET" -> "L5 PT" relabel below.
adata.obs.subclass_label.value_counts()

In [None]:
# Rename "L5 ET" to "L5 PT" in the cluster and subclass label columns.
# NOTE(review): `change` matches the exact string only, so cluster labels
# that carry a suffix (e.g. "L5 ET_1", if any exist) are left as they are —
# confirm this is intended.
for label_column in ("cluster_label", "subclass_label"):
    adata.obs[label_column] = adata.obs[label_column].apply(change).values

In [None]:
# Cells per subclass label after the relabel, for comparison with the counts shown before it.
adata.obs.subclass_label.value_counts()

In [None]:
# Persist the annotated gene-level matrix for later use.
output_h5ad = "../../data/notebook/revision/10xv3_gene.h5ad"
adata.write_h5ad(output_h5ad)

In [None]:
# Display the summary of the AnnData object that was just written.
adata