In [42]:
import pandas as pd
import scanpy as sc
import scirpy as ir
import numpy as np
import pathlib
import muon as mu
from muon import prot as pt
import os
import numpy as np
savefig_args = {"dpi": 300, "bbox_inches": "tight", "pad_inches": 0, "transparent": True}
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-level switch: include the "*_Overload" samples in this run.
overload = False
# Figures from the two analysis modes go to separate directories.
if overload:
    output_dir = "figures/overload"
else:
    output_dir = "figures/no_overload"

pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
output_suffix = ""
output_formats = [".png", ".svg"]
sc.settings.figdir = output_dir
# Set both options in ONE call. set_figure_params re-applies its default for
# every argument that is not passed, so a second, format-only call would reset
# dpi_save to the default and undo the 300 dpi requested here.
sc.set_figure_params(dpi_save=300, format="png")
def save_figure(fig, name, output_dir=output_dir, output_suffix=output_suffix, output_formats=output_formats, savefig_args=savefig_args):
    """Save ``fig`` once for every extension in ``output_formats``.

    The target of each write is ``output_dir + "/" + name + output_suffix``
    followed by the extension, and ``savefig_args`` is forwarded to
    ``fig.savefig`` as keyword arguments. The defaults are the module-level
    settings as they were when this function was defined. Returns ``None``.
    """
    targets = (
        "".join((output_dir, "/", name, output_suffix, extension))
        for extension in output_formats
    )
    for target in targets:
        fig.savefig(target, **savefig_args)


# Donor identifier; it is part of every sample directory name.
donor = "TBD6"
# Directory the data were downloaded into.
data = "data"
# Sample names: the four tissues first, then their "_Overload" counterparts.
_tissues = ["BM", "PBMC", "LN", "SPL"]
samples = _tissues + [tissue + "_Overload" for tissue in _tissues]
# Module-level list intended to hold the loaded per-sample objects.
prots = []
# Which 10x matrix to read: 'raw' or 'filtered'.
tenX_output = "filtered"

def load_data(samplenames, data, donor, overload):
    """Load the 'prot' modality of each requested sample and concatenate them.

    Parameters
    ----------
    samplenames : list of str
        Sample names, e.g. ``'BM'`` or ``'BM_Overload'``. Together with
        ``data`` and ``donor`` they are used to build each file path.
    data : str
        Directory the data were downloaded into.
    donor : str
        Name of the donor.
    overload : bool
        If true, every entry of ``samplenames`` is loaded. If false, entries
        ending in ``'_Overload'`` are skipped (and never read from disk).

    Returns
    -------
    The result of calling ``.concatenate`` on the first sample's 'prot' object
    with the remaining ones, in ``samplenames`` order. Each per-sample object
    gets ``obs['tissue']`` set to its sample name before concatenation.

    Raises
    ------
    ValueError
        If the module-level ``tenX_output`` is neither ``'raw'`` nor
        ``'filtered'``, or if no sample is left to load.

    Notes
    -----
    Which matrix is read is controlled by the module-level ``tenX_output``.
    Loaded objects are kept in a local list, so calling this function again
    does not accumulate samples from earlier calls.
    """
    if tenX_output == 'raw':
        template = '{data}/{donor}_{sample}/outs/multi/count/raw_feature_bc_matrix/'
    elif tenX_output == 'filtered':
        # NOTE(review): the per_sample_outs sub-directory is filled with the
        # donor only. The sample name used to be passed as an extra, unused
        # format argument, so '{donor}_{sample}' may have been intended here.
        # The resolved path is deliberately left unchanged; confirm against
        # the on-disk layout.
        template = '{data}/{donor}_{sample}/outs/per_sample_outs/{donor}/count/sample_feature_bc_matrix/'
    else:
        raise ValueError(
            "tenX_output must be 'raw' or 'filtered', got {!r}".format(tenX_output)
        )

    if overload:
        selected = list(samplenames)
    else:
        selected = [name for name in samplenames if not name.endswith('_Overload')]
    if not selected:
        raise ValueError("no samples to load")

    adatas = []
    for sample in selected:
        datapath = template.format(data=data, donor=donor, sample=sample)
        print(datapath)
        prot = mu.read_10x_mtx(datapath)
        adata = prot['prot']
        # Record the sample of origin so it survives concatenation.
        adata.obs['tissue'] = sample
        adatas.append(adata)

    # Works for any number of samples instead of a hard-coded 4 or 8.
    return adatas[0].concatenate(*adatas[1:])
    
def perform_qc(adata, min_counts=1000, max_counts=15000):
    """Compute QC metrics, filter cells by total counts, and save QC plots.

    Parameters
    ----------
    adata
        Annotated data object. It is modified in place: QC metrics are added
        and cells outside the count thresholds are removed.
    min_counts : int, optional
        Passed to ``sc.pp.filter_cells`` as ``min_counts`` (default 1000).
    max_counts : int, optional
        Passed to ``sc.pp.filter_cells`` as ``max_counts`` (default 15000).

    Returns
    -------
    The same ``adata`` object that was passed in, after filtering.

    Notes
    -----
    Plots are saved through scanpy's ``save=`` mechanism with the suffixes
    'prefilter', 'postfilter' and 'postfilter_tissue'. The last plot groups by
    ``obs['tissue']``, so that column must already exist.
    """
    qc_keys = ['n_genes_by_counts', 'log1p_total_counts', 'total_counts']

    # calculate qc metrics
    sc.pp.calculate_qc_metrics(adata, percent_top=False, inplace=True)
    sc.pl.highest_expr_genes(adata, save='prefilter')
    # plot qc metrics before any cell is removed
    sc.pl.violin(adata, qc_keys, stripplot=False, multi_panel=True, save='prefilter')

    # cell filter: two calls, one for the lower and one for the upper bound
    sc.pp.filter_cells(adata, min_counts=min_counts)
    sc.pp.filter_cells(adata, max_counts=max_counts)

    # plot results of filtering, overall and split by tissue
    sc.pl.violin(adata, qc_keys, stripplot=False, multi_panel=True, save='postfilter')
    sc.pl.violin(adata, qc_keys, stripplot=False, multi_panel=True, rotation=90, groupby='tissue', save='postfilter_tissue')

    sc.pl.highest_expr_genes(adata, save='postfilter')
    return adata

# Label used in figure file names to tell the two analysis modes apart.
# It has to be defined at module level because it is used in a `save=` name below.
analysis = "with_overload" if overload else "without_overload"

# Load the per-sample protein data, honouring the notebook-level `overload` switch.
adata = load_data(samplenames=samples, data=data, donor=donor, overload=overload)

adata = perform_qc(adata)

# create mdata object
mdata = mu.MuData({'prot': adata})
prot = mdata['prot']
# Store a copy so the 'counts' layer cannot share memory with X, which
# pt.pp.clr is applied to below.
prot.layers['counts'] = prot.X.copy()

# a type of normalization reported to be good for cite-seq
pt.pp.clr(mdata['prot'])
sc.pl.scatter(prot, x="IgM_TotalSeqC", y="CD20_TotalSeqC", color='tissue')
sc.tl.pca(prot)
sc.pl.pca_overview(prot, color='tissue', save='pca_')

sc.pl.pca_scatter(prot, color='tissue', components=[2, 3], size=10, save='tissue')

# Both values are now part of the file name; previously the format string had
# a single placeholder and tenX_output was silently dropped.
sc.pl.pca_scatter(prot, color='IgM_TotalSeqC', components=[2, 3], size=10, save='Igm_{}_{}'.format(analysis, tenX_output))

sc.pl.pca_scatter(prot, color='CD38_TotalSeqC', components=[2, 3], size=10, save='CD38')

sc.pl.pca_scatter(prot, color='log1p_total_counts', components=[2, 3], size=10, save='log1p_total_counts')
sc.pp.neighbors(prot)


def kdeplot(prot, pair, ok, n_per_group=1000, random_state=None):
    """Scatter two features against each other with marginal KDEs per group.

    Parameters
    ----------
    prot
        Annotated data object handed to ``sc.get.obs_df``.
    pair : sequence
        ``pair[0]`` is plotted on the x axis and ``pair[1]`` on the y axis.
        Both must be keys accepted by ``sc.get.obs_df``.
    ok : str
        Key used to group the cells for resampling and to colour the plot.
    n_per_group : int, optional
        Number of cells drawn from each group (default 1000).
    random_state : optional
        Forwarded to the pandas group-wise ``sample`` call. The default
        ``None`` keeps the draw unseeded; pass an int for a reproducible figure.

    Returns
    -------
    seaborn.JointGrid
        The grid holding the joint scatter plot and both marginal KDE plots.

    Notes
    -----
    Colours come from the module-level ``palette``, which must be defined
    before this function is called.
    """
    x_key, y_key = pair[0], pair[1]
    values = sc.get.obs_df(prot, keys=[x_key, y_key, ok])
    # Sample with replacement so every group contributes n_per_group points
    # even when it has fewer cells; the same cell may be drawn more than once.
    sampled = values.groupby(ok).sample(n=n_per_group, replace=True, random_state=random_state)
    # Drawing with replacement repeats index labels; switch to a fresh index.
    sampled = sampled.reset_index()
    g = sns.JointGrid()
    x, y, hue = sampled[x_key], sampled[y_key], sampled[ok]
    sns.scatterplot(x=x, y=y, hue=hue, palette=palette, s=10, linewidth=0.1, edgecolor='k', ax=g.ax_joint)
    sns.kdeplot(x=x, hue=hue, palette=palette, fill=False, linewidth=2, ax=g.ax_marg_x)
    sns.kdeplot(y=y, hue=hue, palette=palette, linewidth=2, fill=False, ax=g.ax_marg_y)
    return g

import itertools

# obs key used to group and colour the cells in every plot.
ok = "tissue"
single_pair = ("CD20_TotalSeqC", True)
# One colour per tissue. Only these four tissue names have an entry.
palette = {
    "SPL": "#765760",
    "PBMC": "#C1717B",
    "BM": "#266967",
    "LN": "#E4B363",
}

# Walk every ordered pair of features, but only plot the pairs whose second
# element (the y axis) is CD20.
for pair in itertools.permutations(prot.var.index, 2):
    if pair[1] != "CD20_TotalSeqC":
        continue
    g = kdeplot(prot, pair, ok)
                 


outs/per_sample_outs/count/sample_feature_bc_matrix/


FileNotFoundError: Did not find file outs/per_sample_outs/count/sample_feature_bc_matrix/matrix.mtx.gz.

In [None]:

# The neighbour graph was computed on `prot`, so the embedding, the clustering
# and every plot in this cell use `prot` as well. The clustering and the last
# two plots used to go through `adata`, which only works while both names
# refer to the same object.
sc.tl.umap(prot, random_state=1)

sc.pl.umap(prot, color="IgM_TotalSeqC", ncols=1, vmax=5, size=10)

sc.pl.umap(prot, color="IgD_TotalSeqC", ncols=1, vmax=5, size=10)

sc.pl.umap(prot, color="tissue", ncols=1, vmax=5, size=10, save='tissue')
color = "log1p_total_counts"
sc.pl.umap(prot, color=color, ncols=1, size=10, save=color)
sc.tl.leiden(prot, resolution=0.3)
sc.pl.umap(prot, color=list(prot.var.index.values) + ['tissue', 'leiden'], ncols=3, vmax=5, save='all')

sc.pl.umap(prot, color='leiden', save='leiden')
# NOTE(review): this reuses save='tissue' from the tissue plot above, so both
# calls are given the same save suffix; confirm that overwriting is intended.
sc.pl.umap(prot, color='tissue', save='tissue')

In [7]:
# Number of cells per tissue.
# NOTE(review): `sub_adata` is not assigned in any cell visible in this
# notebook; confirm where it comes from before relying on this cell.
sub_adata.obs["tissue"].value_counts()

SPL_Overload     61859
LN_Overload      44456
PBMC_Overload    28107
LN               16800
BM_Overload      16022
SPL              13881
BM                7464
PBMC               676
Name: tissue, dtype: int64

In [16]:
# Rank features for each 'leiden' group, with use_raw disabled.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden",
    use_raw=False,
)



In [1]:
# Plot the ranking and save it with the 'rank_genes' suffix. The imports cell
# must have been run in the current kernel, otherwise `sc` is undefined.
sc.pl.rank_genes_groups(
    adata,
    save="rank_genes",
)

NameError: name 'sc' is not defined