# Lung metastatic tumors

In [None]:
#load packages
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import scanorama
import seaborn as sb
import scanpy.external as sce
import scrublet as scr
import doubletdetection

%matplotlib inline

In [None]:
# scanpy session setup: logging verbosity level, version report, figure defaults
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(
    dpi=80,
    facecolor='white',
)

## Preprocessing

In [None]:
#load data from GEO (change file path as appropriate)
#vehicle condition (V)
# The cellranger output root is written once, so relocating the data means
# editing a single line. Each sample is read from
# <root>/<sample>_out/filtered_feature_bc_matrix/.
cellranger_root = '/Volumes/passport-external-drive/cellranger_count_output'
V_1, V_2a, V_2b, V_2c, V_4a = [
    sc.read_10x_mtx(
        f'{cellranger_root}/{sample_id}_out/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
        var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
        cache=True,                # write a cache file for faster subsequent reading
    )
    for sample_id in ('V1', 'V2a', 'V2b', 'V2c', 'V4a')
]

In [None]:
#Entinostat (E)
# The cellranger output root is written once; each sample is read from
# <root>/<sample>_out/filtered_feature_bc_matrix/.
cellranger_root = '/Volumes/passport-external-drive/cellranger_count_output'
E_2, E_3a, E_3b, E_4a = [
    sc.read_10x_mtx(
        f'{cellranger_root}/{sample_id}_out/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
        var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
        cache=True,                # write a cache file for faster subsequent reading
    )
    for sample_id in ('E2', 'E3a', 'E3b', 'E4a')
]

In [None]:
#E + aPD1 (EP)
# The cellranger output root is written once; each sample is read from
# <root>/<sample>_out/filtered_feature_bc_matrix/.
cellranger_root = '/Volumes/passport-external-drive/cellranger_count_output'
EP_1, EP_2a, EP_2b, EP_3b = [
    sc.read_10x_mtx(
        f'{cellranger_root}/{sample_id}_out/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
        var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
        cache=True,                # write a cache file for faster subsequent reading
    )
    for sample_id in ('EP1', 'EP2a', 'EP2b', 'EP3b')
]

In [None]:
#E + aCTLA4 (EC)
# The cellranger output root is written once; each sample is read from
# <root>/<sample>_out/filtered_feature_bc_matrix/.
cellranger_root = '/Volumes/passport-external-drive/cellranger_count_output'
EC_1, EC_3a, EC_4b, EC_4c = [
    sc.read_10x_mtx(
        f'{cellranger_root}/{sample_id}_out/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
        var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
        cache=True,                # write a cache file for faster subsequent reading
    )
    for sample_id in ('EC1', 'EC3a', 'EC4b', 'EC4c')
]

In [None]:
#EPC
# The cellranger output root is written once; each sample is read from
# <root>/<sample>_out/filtered_feature_bc_matrix/.
cellranger_root = '/Volumes/passport-external-drive/cellranger_count_output'
EPC_1, EPC_3b, EPC_4a = [
    sc.read_10x_mtx(
        f'{cellranger_root}/{sample_id}_out/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
        var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
        cache=True,                # write a cache file for faster subsequent reading
    )
    for sample_id in ('EPC1', 'EPC3b', 'EPC4a')
]

In [None]:
#PC
# The cellranger output root is written once; each sample is read from
# <root>/<sample>_out/filtered_feature_bc_matrix/.
cellranger_root = '/Volumes/passport-external-drive/cellranger_count_output'
PC_1, PC_2a, PC_2b, PC_4a = [
    sc.read_10x_mtx(
        f'{cellranger_root}/{sample_id}_out/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
        var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
        cache=True,                # write a cache file for faster subsequent reading
    )
    for sample_id in ('PC1', 'PC2a', 'PC2b', 'PC4a')
]

In [None]:
# Make the gene-symbol var_names unique for every vehicle sample
# (this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`)
for vehicle_adata in (V_1, V_2a, V_2b, V_2c, V_4a):
    vehicle_adata.var_names_make_unique()

In [None]:
# Make the gene-symbol var_names unique for every E sample
# (this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`)
for e_adata in (E_2, E_3a, E_3b, E_4a):
    e_adata.var_names_make_unique()

In [None]:
# Make the gene-symbol var_names unique for every EP sample
# (this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`)
for ep_adata in (EP_1, EP_2a, EP_2b, EP_3b):
    ep_adata.var_names_make_unique()

In [None]:
# Make the gene-symbol var_names unique for every EC sample
# (this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`)
for ec_adata in (EC_1, EC_3a, EC_4b, EC_4c):
    ec_adata.var_names_make_unique()

In [None]:
# Make the gene-symbol var_names unique for every EPC sample
# (this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`)
for epc_adata in (EPC_1, EPC_3b, EPC_4a):
    epc_adata.var_names_make_unique()

In [None]:
# Make the gene-symbol var_names unique for every PC sample
# (this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`)
for pc_adata in (PC_1, PC_2a, PC_2b, PC_4a):
    pc_adata.var_names_make_unique()

In [None]:
#annotate samples
# every vehicle sample gets the treatment label "V" on all of its cells
for vehicle_adata in (V_1, V_2a, V_2b, V_2c, V_4a):
    vehicle_adata.obs["treatment"] = "V"

In [None]:
# every E sample gets the treatment label "E" on all of its cells
for e_adata in (E_2, E_3a, E_3b, E_4a):
    e_adata.obs["treatment"] = "E"

In [None]:
# every EP sample gets the treatment label "EP" on all of its cells
for ep_adata in (EP_1, EP_2a, EP_2b, EP_3b):
    ep_adata.obs["treatment"] = "EP"

In [None]:
# every EC sample gets the treatment label "EC" on all of its cells
for ec_adata in (EC_1, EC_3a, EC_4b, EC_4c):
    ec_adata.obs["treatment"] = "EC"

In [None]:
# every EPC sample gets the treatment label "EPC" on all of its cells
for epc_adata in (EPC_1, EPC_3b, EPC_4a):
    epc_adata.obs["treatment"] = "EPC"

In [None]:
# every PC sample gets the treatment label "PC" on all of its cells
for pc_adata in (PC_1, PC_2a, PC_2b, PC_4a):
    pc_adata.obs["treatment"] = "PC"

In [None]:
# store each vehicle sample's identifier in obs["sample_name"]
for vehicle_adata, sample_label in (
    (V_1, "V1"),
    (V_2a, "V2a"),
    (V_2b, "V2b"),
    (V_2c, "V2c"),
    (V_4a, "V4a"),
):
    vehicle_adata.obs["sample_name"] = sample_label

In [None]:
# store each E sample's identifier in obs["sample_name"]
for e_adata, sample_label in (
    (E_2, "E2"),
    (E_3a, "E3a"),
    (E_3b, "E3b"),
    (E_4a, "E4a"),
):
    e_adata.obs["sample_name"] = sample_label

In [None]:
# store each EP sample's identifier in obs["sample_name"]
for ep_adata, sample_label in (
    (EP_1, "EP1"),
    (EP_2a, "EP2a"),
    (EP_2b, "EP2b"),
    (EP_3b, "EP3b"),
):
    ep_adata.obs["sample_name"] = sample_label

In [None]:
# store each EC sample's identifier in obs["sample_name"]
for ec_adata, sample_label in (
    (EC_1, "EC1"),
    (EC_3a, "EC3a"),
    (EC_4b, "EC4b"),
    (EC_4c, "EC4c"),
):
    ec_adata.obs["sample_name"] = sample_label

In [None]:
# store each EPC sample's identifier in obs["sample_name"]
for epc_adata, sample_label in (
    (EPC_1, "EPC1"),
    (EPC_3b, "EPC3b"),
    (EPC_4a, "EPC4a"),
):
    epc_adata.obs["sample_name"] = sample_label

In [None]:
# store each PC sample's identifier in obs["sample_name"]
for pc_adata, sample_label in (
    (PC_1, "PC1"),
    (PC_2a, "PC2a"),
    (PC_2b, "PC2b"),
    (PC_4a, "PC4a"),
):
    pc_adata.obs["sample_name"] = sample_label

In [None]:
# One concatenated AnnData per experiment (the number in each sample name).
# NOTE(review): the 'batch' column written here indexes samples within one
# experiment; the following cell concatenates again with the same batch_key,
# which presumably replaces it with the experiment index -- confirm intended.
Exp_1 = V_1.concatenate(
    EP_1, EC_1, EPC_1, PC_1,
    batch_key='batch',
)
Exp_2 = V_2a.concatenate(
    V_2b, V_2c, E_2, EP_2a, EP_2b, PC_2a, PC_2b,
    batch_key='batch',
)
Exp_3 = E_3a.concatenate(
    E_3b, EP_3b, EC_3a, EPC_3b,
    batch_key='batch',
)
Exp_4 = V_4a.concatenate(
    E_4a, EC_4b, EC_4c, EPC_4a, PC_4a,
    batch_key='batch',
)

In [None]:
# merge the four per-experiment objects into one AnnData; 'batch' is written
# again here, so it presumably ends up identifying the experiment (TODO confirm)
adata = Exp_1.concatenate(Exp_2, Exp_3, Exp_4, batch_key='batch')

In [None]:
# one display label per experiment batch: '1' through '4'
new_batch_names = [str(experiment_number) for experiment_number in range(1, 5)]
# replace the category labels of obs['batch'] with new_batch_names
adata.rename_categories('batch', new_batch_names)

In [None]:
# display the AnnData that holds all samples concatenated together
adata

In [None]:
# plot the 20 most highly expressed genes (n_top=20)
sc.pl.highest_expr_genes(adata, n_top=20)

In [None]:
# doublet detection on the concatenated matrix (run before any filtering or normalization)
clf = doubletdetection.BoostClassifier(clustering_algorithm="louvain")
clf.fit(adata.X)
doublets = clf.predict()
doublet_score = clf.doublet_score()

# keep both the per-cell call and its score in obs for later plotting
adata.obs["doublet"] = doublets
adata.obs["doublet_score"] = doublet_score

# convergence plot of the classifier across its iterations
f = doubletdetection.plot.convergence(clf, show=True)

In [None]:
#basic filtering
# per-cell bounds on the number of detected genes
min_genes_threshold = 200
max_genes_threshold = 8000
# minimum number of cells a gene must be detected in
min_cells_threshold = 3

In [None]:
# remove cells with too few or too many detected genes, then genes detected in too few cells
sc.pp.filter_cells(adata, min_genes=min_genes_threshold)
sc.pp.filter_cells(adata, max_genes=max_genes_threshold)
sc.pp.filter_genes(adata, min_cells=min_cells_threshold)

In [None]:
# annotate the group of mitochondrial genes as 'mt' (gene symbols starting with 'mt-')
is_mitochondrial = adata.var_names.str.startswith('mt-')
adata.var['mt'] = is_mitochondrial

# compute QC metrics, including the 'mt' gene group, and store them on adata
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# violin plots of the per-cell QC columns, one panel each
qc_columns = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_columns, jitter=0.4, multi_panel=True)

In [None]:
# total counts against mitochondrial percentage (left) and detected genes (right)
fig, axs = plt.subplots(1, 2, figsize=(8, 4), constrained_layout=True)
for panel, y_column in zip(axs, ('pct_counts_mt', 'n_genes_by_counts')):
    sc.pl.scatter(adata, x='total_counts', y=y_column, ax=panel, show=False)

In [None]:
# Histograms of the per-cell QC metrics, one panel each.
# sb.histplot is used because sb.distplot(..., kde=False) is deprecated in
# seaborn; it draws the same count histogram with the same bin counts.
fig, axs = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True)
for panel, (qc_column, n_bins) in zip(
    axs,
    (('total_counts', 60), ('n_genes_by_counts', 60), ('pct_counts_mt', 100)),
):
    sb.histplot(adata.obs[qc_column], kde=False, bins=n_bins, ax=panel)

In [None]:
# upper bounds used by the QC filter that follows (cells must be strictly below)
genes_by_counts_threshold = 8000  # cap on n_genes_by_counts
mt_count_threshold = 15  # cap on pct_counts_mt

In [None]:
#filter
# Keep cells strictly below both QC caps. The two conditions are combined into
# one mask and the result is copied, so `adata` is an independent AnnData
# rather than a view of a view when it is normalized in place afterwards.
cells_passing_qc = (
    (adata.obs.n_genes_by_counts < genes_by_counts_threshold)
    & (adata.obs.pct_counts_mt < mt_count_threshold)
)
adata = adata[cells_passing_qc, :].copy()

In [None]:
# value passed as target_sum to sc.pp.normalize_total below
target_sum_threshold = 1e4

In [None]:
#normalize
# scale each cell's counts so they sum to target_sum_threshold
sc.pp.normalize_total(adata, target_sum=target_sum_threshold)

In [None]:
# log(1 + x) transform of the normalized counts
sc.pp.log1p(adata)

In [None]:
# cutoffs passed to sc.pp.highly_variable_genes
threshold_1 = 0.0125  # min_mean
threshold_2 = 3       # max_mean
threshold_3 = 0.5     # min_disp

In [None]:
#highly variable genes
# flag genes whose mean expression and dispersion fall inside the cutoffs
sc.pp.highly_variable_genes(
    adata,
    min_mean=threshold_1,
    max_mean=threshold_2,
    min_disp=threshold_3,
)

In [None]:
# plot the highly-variable-gene selection computed in the previous cell
sc.pl.highly_variable_genes(adata)

In [None]:
# keep the normalized, log-transformed data for all genes in .raw before
# the object is subset to highly variable genes
adata.raw = adata

In [None]:
# Restrict to the highly variable genes. `.copy()` materializes the subset so
# the in-place steps that follow (regress_out, scale) operate on a real
# AnnData rather than on a view of the full object.
adata = adata[:, adata.var.highly_variable].copy()

In [None]:
#regress out total counts and mt percentage
# (both are obs columns produced by sc.pp.calculate_qc_metrics earlier)
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

Scale each gene to unit variance and clip values that exceed 10 standard deviations.

In [None]:
# scale each gene and clip the scaled values at 10 (max_value=10)
sc.pp.scale(adata, max_value=10)

In [None]:
# display the preprocessed AnnData
adata

## Batch correction

In [None]:
# Build a fresh AnnData from the all-gene, log-normalized matrix kept in
# adata.raw, reusing the current per-cell annotations.
adata_combat = sc.AnnData(
    X=adata.raw.X,
    var=adata.raw.var,
    obs=adata.obs,
)

# keep the uncorrected values available in .raw
adata_combat.raw = adata_combat

# ComBat batch correction using the 'batch' column as the batch covariate
sc.pp.combat(adata_combat, key='batch')

In [None]:
# select highly variable genes on the ComBat-corrected matrix and report how many
sc.pp.highly_variable_genes(adata_combat)
n_highly_variable = sum(adata_combat.var.highly_variable)
print("Highly variable genes: %d" % n_highly_variable)
sc.pl.highly_variable_genes(adata_combat)

# 30-component PCA restricted to the highly variable genes
sc.pp.pca(
    adata_combat,
    n_comps=30,
    use_highly_variable=True,
    svd_solver='arpack',
)

# neighbourhood graph on the 30 PCs, then UMAP and t-SNE embeddings
sc.pp.neighbors(adata_combat, n_pcs=30)
sc.tl.umap(adata_combat)
sc.tl.tsne(adata_combat, n_pcs=30)

In [None]:
# Two stacked panels showing the t-SNE and UMAP embeddings coloured by batch.
# plt.subplots(2, 1) returns a 1-D array of two Axes (squeeze=True is the
# default), so each panel is addressed with a single index; the previous
# two-index form axs[0,0] / axs[1,0] raises IndexError.
fig, axs = plt.subplots(2, 1, figsize=(8, 8), constrained_layout=True)
sc.pl.tsne(adata_combat, color="batch", title="Combat tsne", ax=axs[0], show=False)
sc.pl.umap(adata_combat, color="batch", title="Combat umap", ax=axs[1], show=False)

In [None]:
# UMAP of the ComBat-corrected data, one panel per listed obs column
sc.pl.umap(adata_combat, color=["batch", "treatment", "doublet", "doublet_score", "sample_name"])

## Cluster data

In [None]:
# display the batch-corrected AnnData
adata_combat

In [None]:
# same UMAP panels as at the end of the batch-correction section
sc.pl.umap(adata_combat, color=["batch", "treatment", "doublet", "doublet_score", "sample_name"])

In [None]:
# cutoffs passed to sc.pp.highly_variable_genes for the batch-corrected data
threshold_1 = 0.0125  # min_mean
threshold_2 = 3       # max_mean
threshold_3 = 0.5     # min_disp

In [None]:
# re-select highly variable genes on the batch-corrected data with explicit cutoffs
sc.pp.highly_variable_genes(adata_combat, min_mean=threshold_1, max_mean=threshold_2, min_disp=threshold_3)

In [None]:
# Restrict to the highly variable genes. `.copy()` materializes the subset,
# because later cells write clustering results and new obs columns into
# adata_combat, which should not happen on a view of the full object.
adata_combat = adata_combat[:, adata_combat.var.highly_variable].copy()

In [None]:
# display the batch-corrected AnnData after subsetting to highly variable genes
adata_combat

In [None]:
#check expression of marker genes
# one UMAP figure per gene panel, drawn in the order listed
marker_panels = [
    ['Lcn2', 'Wfdc2'],
    ['Mmp2', 'Col12a1'],
    ['Adgre1', 'Itgam', 'Plac8', 'Cd14', 'Cd84', 'S100a8', 'S100a9', 'Ly6g', 'Ly6c1'],
    ['Adgre1', 'Itgam', 'Plac8', 'Cd14', 'Cd84', 'Tnf', 'Ly6g', 'Ly6c1'],
    ['Cd3e', 'Cd4', 'Foxp3', 'Icos', 'Il2ra', 'Ctla4', 'Pdcd1'],
    ['Cd3e', 'Cd4'],
    ['Cd3e', 'Cd8a'],
    ['Cd3e', 'Cd19'],
    ['Adgre1', 'Itgam', 'Cx3cr1', 'Ccr2'],
    ['Adgre1', 'Itgam', 'Ccr5', 'Il6', 'Il1b', 'Cd86', 'H2-Ab1'],
    ['Adgre1', 'Itgam', 'Ccr5', 'Mrc1', 'Il10', 'Cd163', 'Arg1'],
    ['Itgax', 'Flt3', 'Itgae', 'Btla', 'H2-Ab1', 'Ccr7'],
]
for panel_genes in marker_panels:
    sc.pl.umap(adata_combat, color=panel_genes)
In [None]:
#cluster
# Louvain clustering at the chosen resolution; labels are stored in obs['louvain']
resolution_number = 0.1
sc.tl.louvain(
    adata_combat,
    resolution=resolution_number,
    key_added="louvain",
)
sc.pl.umap(adata_combat, color=['louvain'])

In [None]:
#find differentially expressed genes
# rank genes for each louvain cluster with the Wilcoxon method
sc.tl.rank_genes_groups(adata_combat, 'louvain', method='wilcoxon')
# show the top 25 ranked genes per cluster, without a shared y-axis
sc.pl.rank_genes_groups(adata_combat, n_genes=25, sharey=False)

In [None]:
# first 10 rows of the ranked gene names (presumably one column per cluster)
pd.DataFrame(adata_combat.uns['rank_genes_groups']['names']).head(10)

In [None]:
#create a dictionary to map cluster to annotation label
# labels are listed in cluster order: position i is the label of louvain cluster 'i'
cluster2annotation = {
    str(cluster_index): cell_type
    for cluster_index, cell_type in enumerate([
        'MDSCs',
        'T cells',
        'Mature myeloid',
        'Endothelial',
        'Lipofibroblasts',
        'Cancer',
        'NK cells',
        'B cells',
    ])
}

# add a new `.obs` column called `clusters` by mapping louvain cluster ids to annotation labels using pandas `map` function
# (cluster ids that have no entry in cluster2annotation map to NaN)
adata_combat.obs['clusters'] = adata_combat.obs['louvain'].map(cluster2annotation).astype('category')

In [None]:
# UMAP coloured by louvain cluster id and by the annotated cluster label
sc.pl.umap(adata_combat, color=['louvain', 'clusters'])

In [None]:
# marker genes per annotated cell type; insertion order is the display order
marker_genes_dict_ordered = {
    "Cancer": ["Wfdc2", "Erbb2", "Col9a1"],
    "Endothelial": ["Ly6c1", "Pecam1", "Bsg"],
    "Lipofibroblasts": ["Chil3", "Ear1", "Mrc1"],
    "Mature myeloid": ["Csf1r", "Ccr2", "Cx3cr1"],
    "MDSCs": ["Wfdc17", "S100a9", "Arg2"],
    "T cells": ["Cd3e", "Icos", "Cd5"],
    "NK cells": ["Ncr1", "Gzmb", "Klrb1c"],
    "B cells": ["Cd79a", "Ighm", "Iglc3"],
}

In [None]:
# Dot plot of the marker panel for each annotated cluster.
# `categories_order` has to name exactly the labels present in obs['clusters'],
# i.e. the values of cluster2annotation. The previous list used "Lung" and
# "Monocytes/macrophages", which are not among those labels; they are replaced
# by "Endothelial" and "Mature myeloid" so the order matches the groups and
# follows the key order of marker_genes_dict_ordered.
sc.pl.dotplot(
    adata_combat,
    marker_genes_dict_ordered,
    'clusters',
    dendrogram=False,
    swap_axes=True,
    categories_order=[
        "Cancer", "Endothelial", "Lipofibroblasts", "Mature myeloid",
        "MDSCs", "T cells", "NK cells", "B cells",
    ],
)