In [None]:
# Install the Python packages this notebook uses by shelling out to pip.
# NOTE(review): versions are unpinned; consider pinning them, and consider
# `%pip` (if the runtime provides it) so the install targets the kernel's environment.
!pip install scprep phate magic-impute anndata scanpy

### Imports

In [None]:
import scprep

In [None]:
# Fetch an archive from Google Drive (by file id) and save it as r_packages.tar.gz.
scprep.io.download.download_google_drive("10FpYcwqXBrG_K0p5Cq936lmH6F9sO70x", 'r_packages.tar.gz')
# Unpack the archive into the system R site-library, then delete the archive.
# NOTE(review): the hard-coded /usr/local/lib/R path assumes a specific hosted runtime layout — confirm.
!tar xzf r_packages.tar.gz  -C /usr/local/lib/R/site-library/ && rm r_packages.tar.gz
# Install the GSL development package at a pinned version.
!apt-get install -yqq libgsl-dev=2.4+dfsg-6
# Upgrade rpy2 (presumably the Python-to-R bridge used by scprep.run — verify).
!pip install --upgrade rpy2
# Install the slingshot R package through BiocManager.
!R -e "BiocManager::install('slingshot')"

In [None]:
import pandas as pd
import numpy as np
import phate
import magic
import tasklogger

import matplotlib.pyplot as plt

import os

# matplotlib settings for Jupyter notebooks only
%matplotlib inline

# Running Slingshot and Diffusion Pseudotime on EB data

In the last exercise, we ran both pseudotime methods on a relatively small dataset of fewer than 400 cells. Here, we're going to run both algorithms on a more complex dataset of human embryonic stem cells (hESCs) grown as Embryoid Bodies (EBs).

The full dataset contains 16,000 cells, which is still a large dataset for Slingshot. To save time, we're going to subsample the data to 6,000 cells.

## 1. Load EB Data (and download if needed)



In [None]:
# Keep the counts file in the user's home directory and download it only
# when it is not already present there.
download_dir = os.path.expanduser('~')
file_path = os.path.join(download_dir, 'EBT_counts.pkl.gz')
if not os.path.exists(file_path):
    scprep.io.download.download_google_drive(id='1Xz0ONnRWp2MLC_R6r74MzNwaZ4DkQPcM',
                                             destination=file_path)
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if the download source is trusted.
data = scprep.utils.SparseDataFrame(pd.read_pickle(file_path))

#### Subsample the data

In [None]:
# Fix the global NumPy seed so later stochastic steps start from a known state.
np.random.seed(42)
# Also hand the seed to `subsample` itself: it accepts its own `seed`
# argument, so the draw of 6,000 cells does not hinge on global RNG state
# and the same cells are selected on every run.
data = scprep.select.subsample(data, n=6000, seed=42)

In [None]:
# Preview the first rows of the subsampled data.
data.head()

#### Parse the index names to grab the sample information

In [None]:
# The sample label is the second "_"-separated field of each index entry.
sample = [cell_id.split('_')[1] for cell_id in data.index]
# One-column metadata table that shares the row index of `data`.
metadata = pd.DataFrame({'sample': sample}, index=data.index)
metadata.head()

### Running PHATE



In [None]:
# Fit PHATE with default parameters and keep the embedding as a DataFrame
# that shares the row index of `data`.
phate_op = phate.PHATE()
data_phate = pd.DataFrame(phate_op.fit_transform(data), index=data.index)

# Cluster with k-means (12 clusters) through the fitted PHATE operator, then
# relabel the clusters by the negated first PHATE coordinate.
# NOTE(review): presumably this makes cluster ids increase along -PHATE1 — confirm.
clusters = phate.cluster.kmeans(phate_op, n_clusters=12)
metadata['clusters_for_slingshot'] = scprep.utils.sort_clusters_by_values(
    clusters, -data_phate.iloc[:,0])

In [None]:
# Scatter plot of the PHATE embedding, colored by the clusters that will be passed to Slingshot.
scprep.plot.scatter2d(data_phate, c=metadata['clusters_for_slingshot'], legend_anchor=(1,1))

## 2. Running trajectory inference methods

### Running Slingshot

Here we'll use default parameters for Slingshot, but set the start cluster.

In [None]:
# =======
# What is the command to run Slingshot using `scprep`?
# Make sure to set the `start_cluster`
# Exercise: supply a value after `start_cluster =`; the cell will not run until you do.
with tasklogger.log_task("Slingshot"):
    results = scprep.run.Slingshot(data_phate, metadata['clusters_for_slingshot'], start_cluster = )
# =======

In [None]:
# Draw the PHATE embedding colored by Slingshot input cluster, then overlay
# every curve in results['curves'] as a black line on the same axes.
ax = scprep.plot.scatter2d(data_phate, c = metadata['clusters_for_slingshot'], legend=False,
                           ticks=False, label_prefix="PHATE")

for curve in results['curves']:
    # The first two columns of each curve are used as the x/y coordinates.
    ax.plot(curve[:,0], curve[:,1], c='k')

### Running Diffusion Pseudotime

Here, we're going to use the diffusion pseudotime implementation from the original authors as provided in `scanpy`. This is faster than the implementation we derived in the previous exercise, but the results are comparable.

You will need to select the root cell using one of the methods previously described, and choose the number of branching events you think DPT should identify.

In [None]:
import anndata, scanpy

# Create AnnData object
adata = anndata.AnnData(data)

# ========
# Select the root
# Exercise: replace ??? with your chosen root cell.
# NOTE(review): presumably the integer position of the cell in `adata` — confirm in the scanpy DPT docs.
adata.uns['iroot'] = ???
# ========

with tasklogger.log_task("Diffusion Pseudotime"):
    # Run PCA
    scanpy.pp.pca(adata)

    # Identify nearest neighbors
    scanpy.pp.neighbors(adata)

    # Create the diffusion map
    scanpy.tl.diffmap(adata)

    # =======
    # Run Diffusion Pseudotime, select a number of branchings to detect
    # Exercise: replace ??? with the number of branchings.
    scanpy.tl.dpt(adata, n_branchings=???)
    # =======

# Copy the DPT outputs from `adata.obs` into `metadata`.
# NOTE(review): `dpt_groups` is presumably only written when n_branchings > 0 — confirm.
metadata['dpt'] = adata.obs['dpt_pseudotime']
metadata['dpt_branch'] = adata.obs['dpt_groups'].astype(int)
metadata.head()

### Plotting diffusion pseudotime

In two plots, visualize both the `dpt_branch` and `dpt` metadata values.

In [None]:
# =======
# Write code to plot the `dpt_branch` output


# =======



In [None]:
# =======
# Write code to plot the `dpt` output



# =======



### Discussion

With your groups:

1. Rerun DPT, but with a different number of branchings
2. How many branchings produce a "good" set of trajectories?
3. How does this compare to the number of branchings observed in Slingshot?
4. Try setting `end_cluster` in Slingshot. How does this affect the output?

## Comparing the resolution of DPT and Slingshot

Here, we're going to dive deeper into how well DPT and Slingshot resolve later developmental transitions.

#### Rename the slingshot branches to be neater and concatenate them to `metadata`

In [None]:
# Give the Slingshot pseudotime columns 1-based names: slingshot1, slingshot2, ...
results['pseudotime'].columns = [
    'slingshot{}'.format(branch_idx)
    for branch_idx in range(1, results['pseudotime'].shape[1] + 1)
]

In [None]:
# Append the Slingshot pseudotime columns to `metadata`. Columns with the same
# names left over from an earlier run of this cell are dropped first, so
# re-running the cell replaces them instead of creating duplicate columns
# (duplicates would make `metadata[<branch>]` return a DataFrame downstream).
metadata = pd.concat(
    (metadata.drop(columns=results['pseudotime'].columns, errors='ignore'),
     results['pseudotime']),
    axis=1)
metadata.head()

#### Plotting Slingshot pseudotime vs DPT

Try this with each of the Slingshot branches.

In [None]:
# ========
# Get all the cells that are on the desired branch.
# Exercise: replace ??? with the name of a Slingshot pseudotime column in
# `metadata` (the columns added above are named 'slingshot1', 'slingshot2', ...).
curr_branch = ???
# ========

# create a mask to hide all the cells not on this branch
mask = np.isfinite(metadata[curr_branch])

# Create a scatter plot with slingshot's `curr_branch` on the x-axis
# Because we only want to plot the cells that are not null,
# we can pass the mask created above to the `mask` argument of
# `scprep.plot.scatter()`
scprep.plot.scatter(metadata[curr_branch], metadata['dpt'],
                    c=metadata['clusters_for_slingshot'],
                    mask=mask,
                    title='Slingshot vs. DPT - {}'.format(curr_branch),
                    legend_anchor=(1,1))

#### Plot pseudotime values per cluster

In [None]:
# Choose the Slingshot branch to inspect and keep only cells with a finite
# pseudotime on that branch.
branch_number = '1'
curr_branch = 'slingshot{}'.format(branch_number)
mask = np.isfinite(metadata[curr_branch])

fig, axes = plt.subplots(1,2, figsize=(12,5))

# One panel per method: (pseudotime column, title template).
# Both panels share the same cluster grouping, coloring and branch mask.
panel_specs = (
    (curr_branch, 'Slingshot - Branch {}'),
    ('dpt', 'DPT - Branch {}'),
)
for panel_ax, (pseudotime_col, title_template) in zip(axes, panel_specs):
    scprep.plot.jitter(metadata['clusters_for_slingshot'], metadata[pseudotime_col],
                       c=metadata['clusters_for_slingshot'], mask=mask,
                       title=title_template.format(branch_number),
                       legend_anchor=(1,1), ax=panel_ax)

fig.tight_layout()

### Discussion

In your groups, answer the following questions:

1. Which method does better at finely resolving long-distance pseudotime relationships?
2. Is this different for different branches?
3. Based on what you know about each method, why do you think this is?

## Bonus

If you have extra time, go back to the top of this notebook and change the number of clusters passed to Slingshot. How does this change the output?