In [1]:
import glob
import os
from os.path import join

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData

In [2]:
# Root directory of the raw input files; the "SCP1/..." paths read in this
# notebook are joined onto it with os.path.join.
raw_dir = "raw/habib_2016/"

In [3]:
# --- Expression matrix -------------------------------------------------------
# The file is indexed by its first column and transposed below so that its
# columns become AnnData observations.
df_mat = pd.read_csv(join(raw_dir, "SCP1/expression/DATA_MATRIX_LOG_TPM.txt"), sep='\t', index_col=0)
# Undo the log transform. expm1 is exp(x) - 1 computed without the precision
# loss that the explicit subtraction suffers for values close to zero.
# NOTE(review): assumes the file stores natural-log log(TPM + 1) — confirm.
df_mat = np.expm1(df_mat)

# --- Per-cell metadata -------------------------------------------------------
# The first row after the header is dropped (presumably the Single Cell Portal
# "TYPE" row that follows the column names — verify against the file).
df_meta = (
    pd.read_csv(join(raw_dir, "SCP1/metadata/CLUSTER_AND_SUBCLUSTER_INDEX.txt"), sep='\t')
    .iloc[1:, :]
    .set_index("NAME")
)
# Index.equals returns False (instead of raising) when the lengths differ, so a
# cell-count mismatch is reported by this assertion as well.
assert df_meta.index.equals(df_mat.columns), \
    "metadata rows and expression-matrix columns must list the same cells in the same order"
adata = AnnData(df_mat.T, obs=df_meta)

# --- Pre-computed embedding coordinates --------------------------------------
# One file per embedding; its coordinates are stored in obs as <file stem>_X and
# <file stem>_Y. The paths are sorted so the obs column order is reproducible.
for f in sorted(glob.glob(join(raw_dir, "SCP1/cluster/*"))):
    df = pd.read_csv(f, sep='\t').iloc[1:, :].set_index("NAME")
    # basename() copes with whichever path separator glob returned.
    col_name = os.path.basename(f).split('.')[0]
    # Cells absent from a cluster file get NaN coordinates through reindex().
    adata.obs[[col_name + "_X", col_name + "_Y"]] = df.reindex(adata.obs.index).astype(float)

# Make sure the output directory exists before writing.
os.makedirs("processed", exist_ok=True)
adata.write_h5ad("processed/habib_2016.raw.h5ad")

... storing 'CLUSTER' as categorical
... storing 'SUB-CLUSTER' as categorical


In [None]:
# Reload the exported data so this cell can be run on its own.
adata = sc.read_h5ad("processed/habib_2016.raw.h5ad")

# Thresholds of 0 are not meant to remove anything; the calls are kept for the
# per-cell / per-gene count annotations scanpy adds as a side effect.
sc.pp.filter_cells(adata, min_genes=0)
sc.pp.filter_genes(adata, min_cells=0)
# Snapshot the un-normalised values; this is what gets exported at the end.
adata.raw = adata

# Normalise, log-transform and flag highly variable genes.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highest_expr_genes(adata, n_top=20)
# Subsetting returns a view; take an explicit copy so that the in-place
# scaling below operates on a real AnnData object instead of triggering an
# implicit copy-on-write (and its warning).
adata = adata[:, adata.var.highly_variable].copy()

# Scale, reduce dimensionality, build the neighbour graph, embed and cluster.
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sc.set_figure_params(dpi=150)
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
sc.tl.leiden(adata)

# Compare the computed UMAP against the annotations shipped with the data.
sc.pl.umap(adata, color=["CLUSTER", "SUB-CLUSTER"], ncols=1)

# Export the object rebuilt from the `.raw` snapshot taken before
# normalisation and gene subsetting.
raw_adata = adata.raw.to_adata()
raw_adata.write_h5ad("processed/habib_2016.processed.h5ad")