# Clustering the subsampled 1.3 M cells

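The data consists of 20K neurons, downsampled from *1.3 Million Brain Cells from E18 Mice*, and is freely available from 10x Genomics ([here](http://cf.10xgenomics.com/samples/cell-exp/1.3.0/1M_neurons/1M_neurons_neuron20k.h5)). Note that the code cells below do not download this file; they read a local orangutan organoid dataset (see the file paths in the cells).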

In [1]:
import numpy as np
import scanpy.api as sc
import os
import velocyto as vcy
import loompy
import scanpy
import re
# Logging level: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Resolution of inline figures.
sc.settings.set_figure_params(dpi=70)  # dots (pixels) per inch determine size of inline figures
# Print the installed package versions so the run can be reproduced.
sc.logging.print_versions()


scanpy==1.1a1+131.gb09db5e anndata==0.6.5 numpy==1.14.2 scipy==1.0.1 pandas==0.22.0 scikit-learn==0.19.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [None]:
# Match the genes and cells of the velocyto loom file against `adata`.
#
# NOTE: this cell reads `adata`, which is only created by the
# `sc.read_10x_h5` cell further down -- run that cell first.
from anndata.base import _normalize_index  # private anndata helper

ds = loompy.connect(os.path.expanduser('~/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/velocyto/orangutanorganoid_Out.loom'))
try:
    # Copy the attributes into plain dicts while the file is open
    # (assumes the values are loaded into memory on access -- TODO confirm
    # for the installed loompy version).
    row_attrs = dict(ds.row_attrs.items())
    col_attrs = dict(ds.col_attrs.items())
finally:
    # Always release the file handle, even if reading the attributes fails.
    ds.close()
print(col_attrs)

# Reduce the loom cell IDs to the bare barcode: drop the trailing "x" and
# the leading "<sample>:" prefix. The patterns are anchored so that only
# the suffix/prefix is touched.
col_attrs['CellID'] = [re.sub(r"x$", "", x) for x in col_attrs['CellID']]
col_attrs['CellID'] = [re.sub(r"^[a-zA-Z0-9_]+:", "", x) for x in col_attrs['CellID']]

# Strip the trailing "-<number>" suffix from the AnnData barcodes so that
# both naming schemes line up.
adata.obs_names = [re.sub(r"-[0-9]+$", "", x) for x in adata.obs_names]
#print(adata.obs_names)

# Keep only the genes / cells that are present in both data sets.
gene_names = [gene for gene in row_attrs['Gene'] if gene in adata.var_names]
cell_names = [cell for cell in col_attrs['CellID'] if cell in adata.obs_names]

# subset the s and u matrices to the genes in adata
gene_index = _normalize_index(gene_names, adata.var_names)
cell_index = _normalize_index(cell_names, adata.obs_names)


In [None]:
# Load the filtered 10x count matrix and add the velocyto counts to it.
#adata=sc.read_loom(velocityFile)
# Use `~` expansion like every other cell instead of a hard-coded home
# directory, so the notebook is not tied to one user account.
adata = sc.read_10x_h5(
    os.path.expanduser('~/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/outs/filtered_gene_bc_matrices_h5.h5'),
    'refdata-celranger-Pabe2-toplevel')
# NOTE(review): scanpy tools by convention modify `adata` in place and return
# None unless `copy=True`, so the result is deliberately not assigned back to
# `adata` (that would presumably replace it with None) -- confirm against the
# installed scanpy version.
sc.tl.rna_velocity(adata, os.path.expanduser('~/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/velocyto/orangutanorganoid_Out.loom'))

In [None]:
# Ad-hoc inspection of the attributes stored on `adata` and of the
# `scanpy.utils` module.
vars(adata)
scanpy.utils

In [None]:
# Display the `U` matrix of the subsampled object. `downSampled` is only
# created by the `vcy.load_velocyto_hdf5` cell further down, so that cell
# has to be run before this one.
downSampled.U

Run the standard preprocessing steps, written out explicitly below; they follow scanpy's `recipe_zheng17` (documented [here](https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.recipe_zheng17.html#scanpy.api.pp.recipe_zheng17)).

In [None]:
# Step-by-step preprocessing of `adata`: gene filtering, per-cell
# normalization, selection of highly variable genes, optional log transform
# and scaling.
n_top_genes = 3000  # how many highly variable genes to keep
log = False         # switch for the dispersion filter and the log1p step

# Keep genes with at least one count.
sc.pp.filter_genes(adata, min_counts=1)

# Normalize with the total UMI count per cell.
sc.pp.normalize_per_cell(adata, key_n_counts='n_counts_all')

# Select the highly variable genes.
filter_result = sc.pp.filter_genes_dispersion(
    adata.X,
    flavor='cell_ranger',
    n_top_genes=n_top_genes,
    log=log,
)

# Restrict the data to the selected genes, then renormalize.
adata = adata[:, filter_result.gene_subset]
sc.pp.normalize_per_cell(adata)

# Optional log transform: adata.X = log(adata.X + 1).
if log:
    sc.pp.log1p(adata)

# Scale to unit variance and shift to zero mean.
sc.pp.scale(adata)


In [None]:
# Run PCA on the preprocessed data, then compute a t-SNE embedding
# (both with scanpy's default parameters).
sc.pp.pca(adata)
sc.tl.tsne(adata)

In [None]:
# Compute the neighborhood graph of the cells with scanpy's default parameters.
sc.pp.neighbors(adata)

In [None]:
# Louvain clustering; the t-SNE plot in the next cell colors cells by 'louvain'.
sc.tl.louvain(adata)

In [None]:
# List the per-cell annotation keys. `obs_keys` is a method, so it has to be
# called; printing the attribute itself only shows the bound-method repr.
print(adata.obs_keys())
# t-SNE embedding colored by the 'louvain' annotation.
sc.pl.tsne(adata, color='louvain')

In [None]:
# Path of the velocyto loom file (`~` is expanded to the user's home directory).
velocityFile = os.path.expanduser('~/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/velocyto/orangutanorganoid_Out.loom')
# Load the loom file into a velocyto analysis object.
vlm = vcy.VelocytoLoom(velocityFile)

In [None]:
# Truncate the velocyto object to its first `numG` genes and first `numC`
# cells, dump the resulting attributes for inspection and save the subsample.
numC = 6000
numG = 6000
gene_slice = slice(0, numG)
cell_slice = slice(0, numC)

# The A, U and S matrices are all cut to the same (genes, cells) window.
for layer in ("A", "U", "S"):
    setattr(vlm, layer, getattr(vlm, layer)[gene_slice, cell_slice])

# Column attributes are cut with the cell window, row attributes with the
# gene window, so they stay aligned with the matrices.
for ca_key in vlm.ca.keys():
    vlm.ca[ca_key] = vlm.ca[ca_key][cell_slice]
for ra_key in vlm.ra.keys():
    vlm.ra[ra_key] = vlm.ra[ra_key][gene_slice]

# The per-cell size vectors are cut with the cell window as well.
vlm.initial_Ucell_size = vlm.initial_Ucell_size[cell_slice]
vlm.initial_cell_size = vlm.initial_cell_size[cell_slice]

# Print every attribute name followed by its value.
for attr_name in vars(vlm):
    print(attr_name)
    print(getattr(vlm, attr_name))

vlm.to_hdf5(os.path.expanduser('~/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/velocyto/orangutanorganoid_Subsample.hdf5'))

In [None]:
# Reload the subsampled object from the HDF5 file that the subsampling cell
# writes with `vlm.to_hdf5` (same path).
downSampled= vcy.load_velocyto_hdf5(os.path.expanduser('~/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/velocyto/orangutanorganoid_Subsample.hdf5'))

In [None]:
# Ad-hoc inspection of the attributes stored on the reloaded object.
vars(downSampled)
# Leftover experiments, kept for reference:
#vlm.plot_fractions()
#'TBR1' in .ra['Gene']


In [None]:
# Velocity workflow on the subsampled object: normalization, cell and gene
# filtering, PCA, kNN imputation, gamma fit, and finally phase portraits for
# SOX2 and TBR1. The calls mutate `downSampled` in sequence, so order matters.

# Size- and log-normalize the "S" layer.
downSampled.normalize("S", size=True, log=True)
# Bare expression with no effect in the middle of a cell; left as a pointer
# to where the normalized values live.
downSampled.S_norm  # contains log normalized
# Keep cells whose initial_Ucell_size lies above the 0.5th percentile
# (np.percentile takes q in percent, so 0.5 means half a percent, not the median).
downSampled.filter_cells(bool_array=downSampled.initial_Ucell_size > np.percentile(downSampled.initial_Ucell_size, 0.5))
#adata=sc.read_h5ad(os.path.expanduser('~/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/velocyto/orangutanorganoid_Subsample.hdf5'))

#downSampled.set_clusters(downSampled.ca["louvain"])
# Gene filter 1: score detection levels (thresholds: 10 counts, 10 cells),
# then filter on that score.
downSampled.score_detection_levels(min_expr_counts=10, min_cells_express=10)
downSampled.filter_genes(by_detection_levels=True)
# Gene filter 2: score CV vs. mean, then filter on that score.
# NOTE(review): the first positional argument is presumably the number of
# genes to keep; 20 looks very small -- confirm this is intended.
downSampled.score_cv_vs_mean(20, plot=True, max_expr_avg=15)
downSampled.filter_genes(by_cv_vs_mean=True)
# Rescale S and U with their per-column totals (sum over axis 0; columns are
# presumably cells), using the mean total as the target size.
# NOTE(review): these are underscore-prefixed (private) velocyto methods.
downSampled._normalize_S(relative_size=downSampled.S.sum(0),
             target_size=downSampled.S.sum(0).mean())
downSampled._normalize_U(relative_size=downSampled.U.sum(0),
             target_size=downSampled.U.sum(0).mean())
downSampled.perform_PCA()
# kNN imputation on the first 20 PCA dimensions with k=500 neighbours.
# NOTE(review): n_jobs=16 is hard-coded; adjust to the machine's core count.
downSampled.knn_imputation(n_pca_dims=20, k=500, balanced=True, b_sight=3000, b_maxl=1500, n_jobs=16)
downSampled.fit_gammas()
# Phase portraits for the two marker genes.
downSampled.plot_phase_portraits(["SOX2", "TBR1"])

