In [1]:
import graphtools as gt
import louvain
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import phate
import phenograph
import scprep
import sklearn.cluster
import sklearn.manifold
import tasklogger
import umap

  # Remove the CWD from sys.path while we load stuff.


In [5]:
# Read the Cell Ranger filtered feature-barcode matrix (10X format) into `data`.
counts_dir = "/home/nnp9/project/podocoryna/results/cellranger/counts/podocoryna_carnea/outs/filtered_feature_bc_matrix"
data = scprep.io.load_10X(counts_dir)
# Show (rows, columns); the recorded run reported 3,259 cells and 41,678 genes.
data.shape

(3259, 41678)

In [None]:
# Start an empty metadata table that shares its index with `data`.
metadata = pd.DataFrame(index=data.index)
# Library-size histogram on a linear (log=False) axis.
scprep.plot.plot_library_size(
    data,
    title='Library Size of Filtered Cells',
    log=False,
)

In [None]:
# Mitochondrial expression before filtering. The gene set is matched by exact
# name and contains a single gene (TRINITY_DN33980_c0_g1). The value 125 is the
# candidate cutoff passed to the plot.
scprep.plot.plot_gene_set_expression(
    data,
    exact_word="TRINITY_DN33980_c0_g1",
    cutoff=125,
    title="Mitochondrial expression before filtering",
    library_size_normalize=True,
)

In [None]:
# ================
# Chosen mitochondrial-expression cutoff (same value as used in the plot call
# for the unfiltered data).
cutoff = 125
# ================
# Keep the cells whose library-size-normalized expression of
# TRINITY_DN33980_c0_g1 falls below `cutoff`; `metadata` is passed through the
# same call and rebound to the returned, filtered table.
data_filt, metadata = scprep.filter.filter_gene_set_expression(
    data,
    metadata,
    exact_word="TRINITY_DN33980_c0_g1",
    library_size_normalize=True,
    keep_cells='below',
    cutoff=cutoff,
)
# Author's note from the recorded run: this step removes 187 cells.
data_filt.shape

In [None]:
# Same single-gene mitochondrial set (TRINITY_DN33980_c0_g1) as in the
# pre-filter plot, now drawn for the filtered cells in `data_filt`.
scprep.plot.plot_gene_set_expression(
    data_filt,
    exact_word="TRINITY_DN33980_c0_g1",
    title="Mitochondrial expression after filtering",
    library_size_normalize=True,
)

In [None]:
# Gene capture counts for the mitochondria-filtered data, computed once and
# then drawn as a log-scaled histogram (before rare genes are removed).
gene_capture = scprep.measure.gene_capture_count(data_filt)
scprep.plot.histogram(
    gene_capture,
    log=True,
    title="Gene capture before filtering",
    xlabel='# of cells with nonzero expression',
    ylabel='# of genes',
)

In [None]:
# ================
# Rare-gene threshold, passed as `min_cells` below.
# NOTE: this rebinds `cutoff`, which previously held the mitochondrial cutoff.
cutoff = 20
data_filt = scprep.filter.filter_rare_genes(
    data_filt,
    min_cells=cutoff,
)
# ================

In [None]:
# Gene capture counts after rare-gene filtering; `cutoff=20` is passed so the
# histogram is annotated with the threshold that was applied.
gene_capture = scprep.measure.gene_capture_count(data_filt)
scprep.plot.histogram(
    gene_capture,
    log=True,
    cutoff=20,
    title="Gene capture after filtering",
    xlabel='# of cells with nonzero expression',
    ylabel='# of genes',
)

In [None]:
# Report the filtered matrix dimensions. A bare `data_filt.shape` is only
# displayed when it is the last expression in a cell, so print it explicitly.
print(data_filt.shape)
# Library-size distribution of the filtered counts, before normalization.
scprep.plot.plot_library_size(data_filt, title='Library size before normalization')

In [None]:
# Library-size normalize the filtered counts. With return_library_size=True the
# call returns a pair: the normalized matrix and the library sizes.
data_norm, library_size = scprep.normalize.library_size_normalize(
    data_filt,
    return_library_size=True,
)

In [None]:
# Library-size distribution of the normalized matrix.
scprep.plot.plot_library_size(data_norm, title='Library size after normalization')
# Store the library sizes that were returned together with `data_norm`.
# Previously this line re-ran the normalization and rebound `data_filt` to the
# normalized result, which discarded the filtered counts and made the cell
# non-idempotent (a second run would record library sizes of already-normalized
# data). `data_filt` is now left untouched; use `data_norm` for normalized values.
metadata['library_size'] = library_size

In [None]:
# Distribution of the per-gene mean (mean over axis 0 of `data_norm`) before the
# square-root transform, with a log-scaled y axis. The x-axis label now says
# "mean" because the plotted values are means, not totals.
scprep.plot.histogram(data_norm.mean(axis=0), log='y',
                      title="Gene counts before transformation",
                      xlabel='mean normalized expression per gene',
                      ylabel='# of genes')

In [None]:
# Square-root transform the normalized matrix.
data_sqrt = scprep.transform.sqrt(data_norm)
# Distribution of the per-gene mean after the transform, with a log-scaled
# y axis. The x-axis label now says "mean" because the plotted values are
# means, not totals.
scprep.plot.histogram(data_sqrt.mean(axis=0), log='y',
                      title="Gene counts after transformation",
                      xlabel='mean sqrt-transformed expression per gene',
                      ylabel='# of genes')

In [None]:
# Persist the preprocessed matrix together with its metadata.
# The original pickled `data`, the raw unfiltered matrix as loaded, while
# `metadata` had already been filtered, so the two saved tables did not
# describe the same rows. Saving `data_sqrt` keeps them consistent.
# NOTE(review): confirm no downstream consumer of data.pickle.gz expects the
# raw counts.
data_sqrt.to_pickle("data.pickle.gz")
metadata.to_pickle("metadata.pickle.gz")
# Gene variability plot for the transformed data at the 90th percentile.
scprep.plot.plot_gene_variability(data_sqrt, percentile=90)

In [None]:
# Select highly variable genes from the transformed data, using the same
# 90th-percentile setting as the variability plot call.
data_hvg = scprep.select.highly_variable_genes(
    data_sqrt,
    percentile=90,
)
# Display the dimensions of the reduced matrix.
data_hvg.shape

In [None]:
# Dimensionality reduction for visualization: PHATE embedding of the highly
# variable genes. A fixed random_state makes the embedding reproducible across
# kernel restarts.
data_phate = phate.PHATE(verbose=False, random_state=42).fit_transform(data_hvg)

In [None]:
# 2-D scatter of the PHATE embedding: 8x8 inch figure, no tick marks, and axis
# labels built from the 'PHATE' prefix.
scprep.plot.scatter2d(
    data_phate,
    figsize=(8, 8),
    label_prefix='PHATE',
    ticks=False,
    title='Podocoryna Whole Body Jellyfish Cells',
)

In [None]:
# Reduce the highly variable genes to 100 principal components using the
# 'dense' method.
data_pca = scprep.reduce.pca(
    data_hvg,
    method='dense',
    n_components=100,
)
# Scatter plot of the PCA result. All 100 components are passed in;
# presumably scatter2d draws the first two (confirm against the scprep docs).
scprep.plot.scatter2d(
    data_pca,
    figsize=(8, 8),
    label_prefix='PC',
    legend_anchor=(1, 1),
    ticks=False,
)

In [None]:
# t-SNE embedding computed on the PCA-reduced data. t-SNE is stochastic, so a
# fixed random_state is set to make the embedding reproducible on re-run.
tsne_op = sklearn.manifold.TSNE(random_state=42)
data_tsne = tsne_op.fit_transform(data_pca)
scprep.plot.scatter2d(data_tsne,
                      figsize=(8,8), legend_anchor=(1,1),
                      ticks=False, label_prefix='t-SNE')

In [None]:
# UMAP embedding computed on the PCA-reduced data. UMAP is stochastic, so a
# fixed random_state is set to make the embedding reproducible on re-run.
# NOTE(review): umap-learn documents that a fixed seed may limit parallelism;
# acceptable for a dataset of this size, but confirm if runtime matters.
umap_op = umap.UMAP(random_state=42)
data_umap = umap_op.fit_transform(data_pca)
scprep.plot.scatter2d(data_umap,
                      figsize=(8,8), legend_anchor=(1,1), ticks=False, label_prefix='UMAP')

In [None]:
# PhenoGraph clustering on the PCA-reduced data; only the first of the three
# returned values (the cluster labels) is kept.
phenograph_clusters, _, _ = phenograph.cluster(data_pca)
# KMeans with 12 clusters, timed via tasklogger. A fixed random_state makes the
# cluster assignment reproducible. `sklearn.cluster` must be imported
# explicitly at the top of the notebook; `import sklearn.manifold` alone does
# not guarantee that the submodule is loaded.
with tasklogger.log_task("KMeans"):
    kmeans_clusters = sklearn.cluster.KMeans(n_clusters=12, random_state=42).fit_predict(data_pca)

# Build a graphtools graph from the PCA-reduced data and convert it to igraph form.
G = gt.Graph(data_pca)
G_igraph = G.to_igraph()