## scRNAseq: Epithelial-Mesenchymal Transition in Cancer
Dataset from: [Cook and Vanderhyden 2020](https://www.nature.com/articles/s41467-020-16066-2)

This notebook removes contaminant cells from the OVCA420 samples, as noted in the Cook and Vanderhyden (2020) GitHub repository.

In [None]:
import os
# Remember the directory the notebook was started from; origDir keeps that
# value so it can be returned to after later os.chdir calls.
origDir = currDir = os.getcwd()

In [None]:
# Display the starting working directory captured above
currDir

In [None]:
# Runs available in this notebook (cell line - treatment)
all_runs = [
    "OVCA420-EGF",
    "OVCA420-TGFB1",
    "OVCA420-TNF",
]

# Index into all_runs selecting the run processed by this notebook
run_int = 0
dataset_name = "3-Cook"
run_name = all_runs[run_int]

In [None]:
# file_name is an alias of the selected run name
file_name = run_name
# Display the selected run
run_name

## Run Settings

Import Cook functions

In [None]:
# The shared helper notebook is located at <two levels above currDir>/3-Cook/Code/
cookFunctionDir = os.path.dirname(os.path.dirname(currDir))+"/3-Cook/Code/"
# NOTE: this changes the process working directory; later paths are built from
# currDir, which was captured before this chdir.
os.chdir(cookFunctionDir)
# IPython magic: executes Cook_functions.ipynb. Presumably this is what brings
# sc, plt, setDirectories, filterData and cell_cycle_genes into scope, since none
# of them are imported or defined in this notebook - TODO confirm.
%run Cook_functions.ipynb
setDirectories(dataset_name, run_name)

# Default figure appearance: 150 dpi, 5x5 figures (set for both scanpy and matplotlib)
sc.settings.set_figure_params(dpi=150, figsize=[5,5])
plt.rcParams['figure.figsize']=(5,5)

mainDir = currDir
# Data files are read from / written to <three levels above mainDir>/RNAvelocity-datafiles/
dataDir = os.path.dirname(os.path.dirname(os.path.dirname(mainDir)))+"/RNAvelocity-datafiles/"

Settings for current run

In [None]:
# Per-run parameters, laid out as [mito_cutoff, leiden_resolution, EMT_clusters_in_order].
# All three OVCA420 runs share the same parameter list.
settings_OVCA420 = [0.004, 0.48, [0,1,2]]
settings = [settings_OVCA420] * 3

# Pick the entry for the active run and unpack it into named values
currSetting = settings[run_int]
mito_cutoff, leiden_resolution, EMT_clusters_in_order = currSetting

## Part 1: IMPORT DATA, FILTERING, AND NORMALIZATION

In [None]:
# Figure directories
# Base: <two levels above currDir>/3-Cook-realigned/Figures/
figDir_base = os.path.dirname(os.path.dirname(currDir)) + "/3-Cook-realigned/Figures/"
# Figures for this stage are grouped per run
figDir = "".join([figDir_base, "1-Filtering and Normalization/", run_name, "/"])
sc.settings.figdir = figDir

Import raw data; only keep forward timepoints

In [None]:
# Load the demultiplexed data for the selected run
adata = sc.read_h5ad(dataDir+"_h5ad/"+run_name+"/0-Demultiplexed.h5ad")

# Raw timepoint code -> display label, for the forward timepoints only
labelDict = {'#0_0d': '0) 0d',
             '#1_8h': '1) 8h',
             '#2_1d': '2) 1d',
             '#3_3d': '3) 3d',
             '#4_7d': '4) 7d'}

# Only keep forward timepoints (cells whose Timepoint is one of the codes above)
is_forward = adata.obs["Timepoint"].isin(['#0_0d', '#1_8h', '#2_1d', '#3_3d', '#4_7d'])
adata = adata[is_forward].copy()
# Relabel the retained timepoints with their display labels
adata.obs["Timepoint"] = adata.obs["Timepoint"].replace(to_replace=labelDict)

In [None]:
# Display the AnnData summary after restricting to forward timepoints
adata

### Filtering

Note: much of the basic filtering has already been done \
during the demultiplexing and categorizing step (MULTI-seq)

In [None]:
# ERCC, filter by cell number and gene count
# filterData is not defined in this notebook; presumably it comes from
# Cook_functions.ipynb and also computes the QC columns plotted below
# (n_genes, total_counts, pct_counts_mito_gene, pct_counts_ribo_gene) - TODO confirm.
# NOTE(review): mito_cutoff is not passed explicitly; verify whether filterData
# reads it as a global.
adata = filterData(adata)

In [None]:
# Gene metric plots
sc.pl.violin(
    adata,
    ['n_genes', 'total_counts', 'pct_counts_mito_gene'],
    jitter=0.4,
    multi_panel=True,
    save=" - n_genes, n_counts, perc_mito.png",
)

# QC scatter plots as (x metric, y metric, output file suffix), drawn in this order
qc_scatter_plots = [
    ('total_counts', 'n_genes', " - n_genes vs total_counts.png"),
    ('total_counts', 'pct_counts_mito_gene', " - pct_counts_mito_gene vs total_counts.png"),
    ('total_counts', 'pct_counts_ribo_gene', " - pct_counts_ribo_gene vs n_counts.png"),
    ('pct_counts_ribo_gene', 'pct_counts_mito_gene', " - pct_counts_mito_gene vs pct_counts_ribo_gene.png"),
]
for x_metric, y_metric, out_suffix in qc_scatter_plots:
    sc.pl.scatter(adata, x=x_metric, y=y_metric, save=out_suffix)

In [None]:
# Number of cells remaining per timepoint after filtering
adata.obs["Timepoint"].value_counts()

### Normalization and HiVar

In [None]:
# Total-count normalize to 10,000 counts per cell (the log transform is applied separately below)
sc.pp.normalize_total(adata, target_sum=1e4)
# Set raw to the normalized, not-yet-logged values and save that state;
# Part 2 reloads this file as adata_normalized.
adata.raw = adata
adata.write(dataDir+"_h5ad/"+run_name+"/1-Filtered_and_Normalized_allGenes.h5ad")

# Log-transform the data (scaling is done later, on the highly variable subset only)
sc.pp.log1p(adata)
# Re-assign raw so that, from here on, it holds the log-transformed values
adata.raw = adata
# Add cell cycle score
# cell_cycle_genes is not defined in this notebook; presumably it comes from
# Cook_functions.ipynb and returns the S-phase and G2/M gene lists - TODO confirm.
cellCycle_g1S_genes, cellCycle_g2M_genes = cell_cycle_genes(adata)
# X is cast to float64 only for the scoring call and back to float32 afterwards
# (presumably a workaround for a dtype issue in the scoring step - TODO confirm)
adata.X = adata.X.astype('<f8')
sc.tl.score_genes_cell_cycle(adata, s_genes=cellCycle_g1S_genes, g2m_genes=cellCycle_g2M_genes, use_raw=False)
adata.X = adata.X.astype('<f4')

# Regress out batch effects with Combat, using the "Mix" obs column as the batch label
sc.pp.combat(adata, key="Mix")

# Highly variable genes (note: expects log data); uses scanpy's default thresholds
sc.pp.highly_variable_genes(adata)
# Work on an independent copy restricted to the highly variable genes
adataHiVar = adata[:, adata.var.highly_variable].copy()

# Regress out effects of total counts per cell and percent mito AND cell cycle
sc.pp.regress_out(adataHiVar, ['total_counts','pct_counts_mito_gene', 'S_score', 'G2M_score'])
# Scale each gene, clipping values above 10
sc.pp.scale(adataHiVar, max_value=10)
adataHiVar.write(dataDir+"_h5ad/"+run_name+"/1-Filtered_and_Normalized_hiVarGenes.h5ad")

## Part 2: CLUSTERING

In [None]:
# Figure directories
# figDir_base already ends with "/", so no leading slash is added here; the
# trailing "/" matches how the Part 1 figure directory is built.
sc.settings.figdir = figDir_base + "2-Clustering/"+run_name+"/"
# Data summaries live beside the Figures folder. The path is derived from
# currDir in the same way as figDir_base instead of hard-coding one user's
# absolute path, so the notebook runs from any checkout location.
dataSumDir = os.path.dirname(os.path.dirname(currDir))+"/3-Cook-realigned/DataSummaries/2-Clustering/"

In [None]:
# Normalized genes only (for downstream processing)
normalized_path = dataDir+"_h5ad/"+run_name+"/1-Filtered_and_Normalized_allGenes.h5ad"
adata_normalized = sc.read_h5ad(normalized_path)
# pandas df (cells x genes) with gene names upper-cased
df_normalized = adata_normalized.to_df()
df_normalized.columns = [str.upper(gene) for gene in df_normalized.columns]

In [None]:
# PCA calculation
sc.tl.pca(adataHiVar, svd_solver='arpack')

# PCA coloured by QC metrics, then by timepoint
sc.pl.pca(
    adataHiVar,
    color=["total_counts","n_genes","pct_counts_mito_gene","pct_counts_ribo_gene"],
    save=" - total_counts, n_genes, perc_mito, perc_ribo.png",
)
sc.pl.pca(
    adataHiVar,
    color=["Timepoint"],
    save=" - Timepoints.png",
)

# UMAP calculation: neighbour graph first (scanpy defaults), then the embedding
sc.pp.neighbors(adataHiVar)
sc.tl.umap(adataHiVar)

In [None]:
# UMAP panels as (colour keys, extra plotting options, output file suffix), drawn in this order
umap_panels = [
    (["total_counts","n_genes"], {}, " - total_counts, n_genes.png"),
    (["pct_counts_mito_gene","pct_counts_ribo_gene","phase"], {}, " - perc_mito, perc_ribo, cell cycle.png"),
    (["Mix"], {}, " - batch.png"),
    (["Timepoint"], {"palette": "coolwarm_r"}, " - Timepoint.png"),
]
for color_keys, extra_options, out_suffix in umap_panels:
    sc.pl.umap(adataHiVar, color=color_keys, save=out_suffix, **extra_options)

In [None]:
# Cluster the two cell lines
# A fixed low resolution is used here (not the per-run leiden_resolution setting),
# since the aim is only to separate the cell lines from each other.
cell_line_resolution = 0.1
sc.tl.leiden(adataHiVar, resolution=cell_line_resolution)
# Show the resulting clusters on the UMAP (displayed only, not saved)
sc.pl.umap(adataHiVar, color="leiden")

In [None]:
# Only keep OVCA420 cells
# Cluster "0" of the Leiden run above is taken to be the OVCA420 line -
# check this against the UMAP before trusting the export.
in_cluster_zero = adataHiVar.obs.leiden == "0"
OVCA420_cells = adataHiVar.obs.loc[in_cluster_zero].index.to_series()

import csv
# Return to the starting directory so the relative output path resolves there
os.chdir(origDir)
output_csv = "_withoutContaminantCellLine_"+run_name+".csv"
OVCA420_cells.to_csv(output_csv, index=False)