In [1]:
%matplotlib inline
import scanpy as sc
import scvi
import matplotlib.pyplot as plt
import pandas as pd
from sctools import integrate, plot, io

Global seed set to 0


## Integration of T-cell subsets for identification of regulatory T-cells
Here we extract the identified T-cell subset from the original dataset and integrate it anew to avoid incorporating artifacts introduced into the embedding by other celltypes we are not interested in.

In [2]:
# load data
prefixes = ['tissue.scps', 'tissue.ad', 'tissue.uc', 'pbmc.scps']
adatas = {}
for key in prefixes:
    adata = io.initialize_from_raw(f'../data/{key}.integrated.clustered.h5ad')
    adatas[key] = adata[adata.obs.coarse_cell_types == 'Tcell'].copy()
    del adata
    
adatas



{'tissue.scps': AnnData object with n_obs × n_vars = 62462 × 20912
     obs: 'sample_id', 'patient_id', 'status', 'tissue', 'cell_fraction', 'doublet', 'doublet_score', 'nFeature_RNA', 'nCount_RNA', 'percent_mt', 'percent_ribo', 'qc_pass', '_scvi_batch', '_scvi_labels', 'leiden_scvi_0.1', 'coarse_cell_types',
 'tissue.ad': AnnData object with n_obs × n_vars = 61787 × 21750
     obs: 'sample_id', 'patient_id', 'status', 'tissue', 'cell_fraction', 'doublet', 'doublet_score', 'nFeature_RNA', 'nCount_RNA', 'percent_mt', 'percent_ribo', 'qc_pass', '_scvi_batch', '_scvi_labels', 'leiden_scvi_0.25', 'coarse_cell_types',
 'tissue.uc': AnnData object with n_obs × n_vars = 46867 × 18436
     obs: 'sample_id', 'patient_id', 'status', 'tissue', 'cell_fraction', 'doublet', 'doublet_score', 'nFeature_RNA', 'nCount_RNA', 'percent_mt', 'percent_ribo', 'qc_pass', '_scvi_batch', '_scvi_labels', 'leiden_scvi_0.4', 'coarse_cell_types',
 'pbmc.scps': AnnData object with n_obs × n_vars = 18557 × 15998
     

In [3]:
# visualize raw data
for k, adata in adatas.items():
    print(k)
    plot.integrate.raw_data_umap(
        adata,
        ['status', 'sample_id', 'FOXP3'],
        size = 10,
        vmax = 0.5,
        savefile = f'../plots/{k}.raw.umap.png'
    )

tissue.scps


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  cax = scatter(
  cax = scatter(


tissue.ad


  cax = scatter(
  cax = scatter(


tissue.uc


  cax = scatter(
  cax = scatter(


pbmc.scps


  cax = scatter(
  cax = scatter(


In [None]:
# integrate tcells
integration_params = {
    f'tcells.{k}': {'kwargs': dict()} for k in adata.keys()
}

integration_results = {}
for key, adata in adatas.items():
    key = f'tcells.{key}'
    params = integration_params[key]
        
    print(key)
    integration_results[key] = integrate.integrate_data_scvi(
        adata.copy(),
        'sample_id',
        train_size = 1,
        **params['kwargs']
    )

    integration_results[key]['data'].write(
        f'../data/{key}.integrated.h5ad'
    )

    integration_results[key]['model'].save(
        f'../data/{key}.integration.scvi.model',
        overwrite = True
    )

In [4]:
# restore results if kernel breaks or gets shut down
integration_results = {}
for key in ['tissue.scps', 'tissue.ad', 'tissue.uc', 'pbmc.scps']:
    print(key)
    data = sc.read_h5ad(
        f'../data/tcells.{key}.integrated.h5ad'
    )
    integration_results[key] = {
        'data': data,
        'model': scvi.model.SCVI.load(
            f'../data/tcells.{key}.integration.scvi.model', 
            adata = data
        )
    }
integration_results

tissue.scps
tissue.ad
tissue.uc
pbmc.scps


{'tissue.scps': {'data': AnnData object with n_obs × n_vars = 62432 × 20912
      obs: 'sample_id', 'patient_id', 'status', 'tissue', 'cell_fraction', 'doublet', 'doublet_score', 'nFeature_RNA', 'nCount_RNA', 'percent_mt', 'percent_ribo', 'qc_pass', '_scvi_batch', '_scvi_labels', 'leiden_scvi_0.1', 'coarse_cell_types'
      uns: '_scvi_manager_uuid', '_scvi_uuid', 'neighbors', 'umap'
      obsm: 'X_scvi', 'X_umap'
      layers: 'counts'
      obsp: 'connectivities', 'distances',
  'model': },
 'tissue.ad': {'data': AnnData object with n_obs × n_vars = 61764 × 21750
      obs: 'sample_id', 'patient_id', 'status', 'tissue', 'cell_fraction', 'doublet', 'doublet_score', 'nFeature_RNA', 'nCount_RNA', 'percent_mt', 'percent_ribo', 'qc_pass', '_scvi_batch', '_scvi_labels', 'leiden_scvi_0.25', 'coarse_cell_types'
      uns: '_scvi_manager_uuid', '_scvi_uuid', 'neighbors', 'umap'
      obsm: 'X_scvi', 'X_umap'
      layers: 'counts'
      obsp: 'connectivities', 'distances',
  'model': },
 'tis

In [3]:
fig, axs = plot.integrate.plot_integration_results(
    integration_results,
    ['sample_id', 'status', 'FOXP3', 'CD3D'],
    [
        dict(size = 20, vmax = None),
        dict(size = 20, vmax = None),
        dict(size = 20, vmax = 1),
        dict(size = 20, vmax = 5)
    ],
    data_key = 'data',
    legend_off = True
)
fig.savefig('../plots/tcells.integration.results.png')

  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_p

## Treg annotation

In [5]:
fig, axs = plot.integrate.plot_leiden_clusterings(
    integration_results,
    [0.3, 0.4, 0.5, 1],
    data_key = 'data',
    legend_loc = 'on data',
    size = 20
)
fig.savefig('../plots/tcells.cluster.inspect.png')

  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to_plot + '_colors'] = colors_list
  cax = scatter(
  adata.uns[value_to

In [6]:
resolutions = {
    'tcells.tissue.scps': 0.5,
    'tcells.tissue.uc': 1,
    'tcells.tissue.ad': 0.3,
    'tcells.pbmc.scps': 0.5
}
for k, d in integration_results.items():
    resolution = resolutions[k]
    sc.tl.leiden(
        d['data'], 
        key_added = f'leiden_scvi_{resolution}',
        resolution = resolution
    )

KeyError: 'tissue.scps'

In [None]:
cluster_keys = {k: f'leiden_scvi_{r}' for k, r in resolutions.items()}
fig, axs = plot.integrate.plot_clustering_and_expression(
    integration_results,
    cluster_keys,
    ['CD3D', 'FOXP3'],
    [
        dict(size = 20, vmax = None, legend_loc = 'on data'),
        dict(size = 20, vmax = 10),
        dict(size = 20, vmax = 1)
    ],
    data_key = 'data'
)
fig.savefig('../plots/tcells.cluster.results.png')

In [None]:
treg_clusters = {
    'tcells.tissue.scps': '2',
    'tcells.tissue.uc': '2',
    'tcells.tissue.ad': '2',
    'tcells.pbmc.scps': '4'
}

treg_clusters = {
    k: [c for c in cs.split(',')] for k, cs in tcell_clusters.items()
}

In [None]:
for k, d in integration_results.items():
    clustering = cluster_keys[k]
    cluster_to_treg = treg_clusters[k]
    d['data'].obs['t_cell_types'] = d['data'].obs[clustering].apply(
        lambda x: 'Treg' if x in cluster_to_treg[k] else 'other'
    )
    print(k, d['data'].obs.groupby('t_cell_types').count().iloc[:, 0])
    
fig, axs = plot.integrate.plot_clustering_and_expression(
    integration_results,
    {k: 't_cell_types' for k in integration_results.keys()},
    ['FOXP3', 'SAT1'],
    [
        dict(size = 20, vmax = None, legend_loc = 'on data'),
        dict(size = 20, vmax = 1),
        dict(size = 20, vmax = 1)
    ],
    data_key = 'data'
)
fig.savefig('../plots/tcells.cluster.treg.png')

In [12]:
for k, d in integration_results.items():
    d['data'].write(
        f'../data/{k}.integrated.clustered.h5ad'
    )

In [None]:
for k, d in integration_results.items():
    adata = d['data']
    prefix = key.replace('tcells', 'tregs')
    io.write_sc_data(
        adata[adata.obs.t_cell_types == 'Treg', :],
        '../diffexp/{prefix}',
        obs_columns = ['sample_id', 'status']
    )