In [None]:
%matplotlib inline
import scanpy as sc
import numpy as np
import pandas as pd

import cuml
import scrna

from micron2 import cluster_leiden_cu

from matplotlib import pyplot as plt
from matplotlib import rcParams
# Opaque white figure background (RGBA) for every figure in this notebook.
rcParams.update({'figure.facecolor': (1, 1, 1, 1)})

In [None]:
# Load the merged dataset; the final cell of this notebook writes back to the same path.
overall_adata = sc.read_h5ad(
    "/storage/codex/datasets_v1/merged_v3.h5ad"
)
overall_adata

In [None]:
# Number of observations per existing celltype label.
overall_adata.obs['celltype'].value_counts()

In [None]:
# Keep only the observations whose celltype is one of the immune labels,
# and copy so later in-place preprocessing does not touch overall_adata.
immune_celltypes = ['Immune', 'CD4T', 'CD8T']
is_immune = overall_adata.obs.celltype.isin(immune_celltypes)
adata = overall_adata[is_immune].copy()
adata

In [None]:
# sc.pp.log1p(adata)
# sc.pp.scale(adata, zero_center=False, max_value=3)
# help(sc.pp.scale)

In [None]:
# Feature selection by substring match on the variable names:
# a variable is kept when it contains at least one `include` token
# and none of the `exclude` tokens.
# Earlier candidate list of statistics:
# include = ['mean', 'percent', 'q01', 'q10', 'q25', 'q50', 'q75', 'q95', 'q99', 'std']
include = ['membrane_mean']
exclude = [
    'DAPI', 'PanCytoK', 'PDGFRb', 'aSMA', 'IgG', 'IgA', 'C1q', 'GZMB', 'PNaD', 'CD45_', 'CD31',
    'CD80',
]
features = [
    v for v in adata.var_names
    if not any(e in v for e in exclude) and any(i in v for i in include)
]
print(len(features))

In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler

In [None]:
# Restrict to the selected features, then normalise each observation to a
# total of 10,000 and log1p-transform. Both scanpy calls modify `adata`
# in place, so re-running this cell alone applies them a second time.
adata = adata[:, features].copy()
sc.pp.normalize_total(
    adata,
    target_sum=10_000,
)
sc.pp.log1p(adata)

In [None]:
# Dense copy of the selected feature matrix (.toarray() implies adata.X is sparse).
feature_view = adata[:, features]
X = feature_view.X.toarray()
# Alternatives tried previously, kept for reference:
# X = MinMaxScaler().fit_transform(X)
# Xextra = np.array(adata.obs.loc[:, adata.obs.columns.str.contains('ring')].values)
# X = np.concatenate([X, Xextra], axis=1)
print(X.shape)

In [None]:
# Display the selected feature names.
list(features)

In [None]:
# Xpc = cuml.PCA(n_components=30, whiten=False).fit_transform(X)

In [None]:
# 2-D UMAP embedding of the feature matrix, computed with cuML.
umap_model = cuml.UMAP(n_neighbors=30)
emb = umap_model.fit_transform(X)

In [None]:
# Scatter of the embedding, clipped to a fixed [-30, 30] window on both axes.
fig, ax = plt.subplots()
ax.scatter(emb[:, 0], emb[:, 1], s=1)
ax.set_xlim([-30, 30])
ax.set_ylim([-30, 30])

In [None]:
# Cluster the feature matrix with the project's Leiden helper.
clusters = cluster_leiden_cu(
    X,
    neighbors=50,
    nn_metric='cosine',
    resolution=0.8,
)

# Alternative tried previously:
# clusters = cuml.DBSCAN(min_samples=50, eps=0.1, verbose=2).fit_predict(X, out_dtype='int32')
n_clusters = len(np.unique(clusters))
print(n_clusters)

In [None]:
# Features to display: every 'membrane_mean' variable except DAPI.
include = ['membrane_mean']

# Wider exclusion list used for clustering, kept for reference:
# exclude = ['DAPI', 'PanCytoK', 'PDGFRb', 'aSMA', 'IgG', 'IgA', 'C1q', 'GZMB', 'PNaD', 'CD45_', 'CD31']
exclude = ['DAPI']
show_features = [
    v for v in adata.var_names
    if not any(e in v for e in exclude) and any(i in v for i in include)
]

print(len(show_features))

# Attach the cluster assignment and show per-cluster marker expression.
adata.obs['leiden'] = pd.Categorical(clusters)
sc.pl.dotplot(
    adata,
    show_features,
    groupby='leiden',
    standard_scale='var',
    # expression_cutoff=0.2,
)
# Cluster sizes, ordered by cluster id.
adata.obs.leiden.value_counts().sort_index()

In [None]:
import os

# Write an empty annotation template with one `<cluster id>,""` row per cluster.
# The next cell reads this file back, so its second column is meant to be
# filled in by hand. Never overwrite an existing file: re-running the notebook
# would otherwise silently wipe the hand-entered labels. Delete the file
# manually to regenerate the template (e.g. after re-clustering).
annotation_csv = '/storage/tmp-outgoing/2021-feb-25.csv'
if os.path.exists(annotation_csv):
    print(f'{annotation_csv} already exists; leaving it untouched')
else:
    with open(annotation_csv, 'w') as f:
        for c in np.unique(clusters):
            f.write(f'{c},""\n')

In [None]:
# Read the annotation table: column 0 (the index) holds the cluster id,
# column 1 the label assigned to that cluster.
labels = pd.read_csv('/storage/tmp-outgoing/2021-feb-25.csv', index_col=0, header=None)

# Look up each observation's label from its cluster id in one vectorised step.
# Clusters with a blank label, or missing from the CSV, become NaN instead of
# a stray integer 0, so the array never mixes ints with strings.
annot = pd.Series(np.asarray(clusters)).map(labels[1]).to_numpy(dtype=object)

# Per-label counts. value_counts(dropna=False) also reports unlabeled
# observations; np.unique on an object array raises TypeError when NaN and
# str are mixed, and `annot == nan` never matches.
label_counts = pd.Series(annot, dtype=object).value_counts(dropna=False).sort_index()
for a, n_cells in label_counts.items():
    print(a, n_cells)

In [None]:
# Sanity check: one annotation per observation.
np.shape(annot)

In [None]:
# Store the per-observation labels on the immune subset as a categorical column.
immune_categories = pd.Categorical(annot)
adata.obs['immune_annotation'] = immune_categories

In [None]:
# Width/height ratio of the spatial coordinates, used to size the figure.
# Computed here because this is the first cell that needs `r`; relying on
# the later cell that defines it breaks Restart & Run All with a NameError.
coord_extent = np.max(np.abs(adata.obsm['coordinates_shift']), axis=0)
r = coord_extent[0] / coord_extent[1]

# Spatial map of the immune subset, coloured by the manual annotation.
plt.figure(figsize=(r*6, 6))
sc.pl.embedding(adata, basis='coordinates_shift', color='immune_annotation',
                s=1, ax=plt.gca())

In [None]:
from matplotlib import rcParams
import numpy as np

rcParams['figure.dpi'] = 100

# Aspect ratio (first axis extent over second) of the spatial coordinates;
# `r` is reused by the plotting cells below to size their figures.
coord_extent = np.max(np.abs(adata.obsm['coordinates_shift']), axis=0)
r = coord_extent[0] / coord_extent[1]

# Spatial map coloured by biopsy, with the legend drawn on the data.
fig = plt.figure(figsize=(r * 6, 6))
sc.pl.embedding(
    adata,
    basis='coordinates_shift',
    color='biopsy',
    s=1,
    ax=fig.gca(),
    legend_loc='on data',
)

In [None]:
# Spatial map coloured by sample id, with the legend drawn on the data.
fig = plt.figure(figsize=(r * 6, 6))
sc.pl.embedding(
    adata,
    basis='coordinates_shift',
    color='sample_id_printing',
    s=1,
    ax=fig.gca(),
    legend_loc='on data',
)

In [None]:
# Project helper called with the 'immune_annotation' and 'biopsy' columns of
# the immune subset, sorted by the 'CD8T_Trm' annotation.
# NOTE(review): presumably plots the share of each annotation per biopsy;
# the helper's exact semantics are not visible in this notebook - confirm.
scrna.plot_group_percents(adata, 'immune_annotation', 'biopsy', sort_by='CD8T_Trm')

In [None]:
# Start from the existing celltype label of every observation ...
subtype = pd.DataFrame(
    {'subtype': np.array(overall_adata.obs['celltype'])},
    index=overall_adata.obs_names,
)
# ... and overwrite the immune subset with its finer manual annotation.
subtype.loc[adata.obs_names, 'subtype'] = adata.obs.immune_annotation
# subtype.fillna('x', inplace=True)

overall_adata.obs['subtype'] = subtype
# overall_adata.obs.fillna('x', inplace=True)
# overall_adata.obs.fillna('x', inplace=True)

In [None]:
# Spatial map of the full dataset, coloured by the merged subtype label.
fig = plt.figure(figsize=(r * 6, 6))
sc.pl.embedding(
    overall_adata,
    basis='coordinates_shift',
    color='subtype',
    s=1,
    ax=fig.gca(),
)

In [None]:
# Same project helper as used for the immune subset, now on the full dataset
# with the merged 'subtype' column against 'biopsy', sorted by 'CD8T_Trm'.
# NOTE(review): exact plot semantics live in the scrna module - confirm there.
scrna.plot_group_percents(overall_adata, 'subtype', 'biopsy', sort_by='CD8T_Trm')

In [None]:
# Compare the original 'celltype' labels against the merged 'subtype' labels.
# NOTE(review): rm_groups=['x'] presumably drops a group named 'x'; the only
# place 'x' appears in this notebook is a commented-out fillna('x'), so this
# argument may be stale - confirm against scrna.plot_group_percents.
scrna.plot_group_percents(overall_adata, 'celltype', 'subtype', 
                          rm_groups=['x'],
                          annotate_total=True)

In [None]:
# Features to display: every 'membrane_mean' variable except DAPI.
include = ['membrane_mean']

# Wider exclusion list used for clustering, kept for reference:
# exclude = ['DAPI', 'PanCytoK', 'PDGFRb', 'aSMA', 'IgG', 'IgA', 'C1q', 'GZMB', 'PNaD', 'CD45_', 'CD31']
exclude = ['DAPI']
show_features = [
    v for v in overall_adata.var_names
    if not any(e in v for e in exclude) and any(i in v for i in include)
]

print(len(show_features))

# Marker expression per merged subtype across the full dataset.
sc.pl.dotplot(
    overall_adata,
    show_features,
    groupby='subtype',
    standard_scale='var',
    # expression_cutoff=0.2,
)

In [None]:
# Fraction of each biopsy's cells that belong to each subtype.
# Rows are biopsies, columns are subtypes; 'not_a_cell' and 'neg' are
# excluded both from the denominator and from the resulting columns.
obs = overall_adata.obs
non_cell_labels = ['not_a_cell', 'neg']

# Denominator: number of real cells per biopsy.
biopsy_totals = obs.loc[~obs.subtype.isin(non_cell_labels), 'biopsy'].value_counts()

# One Series of per-biopsy fractions per subtype. The frame is built in a
# single step with an explicit float dtype: filling an empty (object-dtype)
# frame column by column via .loc can leave the columns as object, which
# breaks the numeric .corr() / clustermap calls that consume percent_df.
subtype_fractions = {
    s: obs.loc[obs.subtype == s, 'biopsy'].value_counts() / biopsy_totals
    for s in np.unique(obs.subtype)
    if s not in non_cell_labels
}
percent_df = pd.DataFrame(subtype_fractions, index=np.unique(obs.biopsy), dtype=float)

In [None]:
import seaborn as sns

# Hierarchically clustered heatmap of subtype fractions per biopsy;
# standard_scale=1 rescales each column (subtype) to the 0-1 range.
# sns.clustermap(percent_df.corr())
clustermap_options = dict(
    standard_scale=1,
    cmap='Reds',
    figsize=(5, 5),
    yticklabels=True,
)
sns.clustermap(percent_df, **clustermap_options)

In [None]:
# Clustered heatmap of the pairwise correlation between subtype columns
# of percent_df, on a diverging colour map centred at zero.
subtype_corr = percent_df.corr()
sns.clustermap(
    subtype_corr,
    figsize=(5, 5),
    cmap='RdBu_r',
    center=0,
)

In [None]:
# Persist the dataset (now carrying the 'subtype' column) back to the same
# file it was loaded from; this overwrites the input in place.
overall_adata.write_h5ad(
    "/storage/codex/datasets_v1/merged_v3.h5ad"
)