In [1]:
import scanpy as sc
import pandas as pd

In [None]:
# Load the 10x Genomics count matrix (MEX format) into an AnnData object.
# The directory below is where the archive was extracted; change it if the
# files live somewhere else.
adata = sc.read_10x_mtx(
    "../data/raw/pbmc68k/filtered_matrices_mex/hg19",
    cache=True,                # reuse a cached copy on later runs
    var_names="gene_symbols",  # index genes by symbol
)

# Gene symbols can repeat; make the variable names unique.
adata.var_names_make_unique()

# AnnData layout: obs (rows) are cells, vars (columns) are genes.
adata

AnnData object with n_obs × n_vars = 68579 × 32738
    var: 'gene_ids'

In [3]:
# Load the per-barcode annotation table (tab-separated text).
# Later cells use its "barcodes" and "celltype" columns.
annotations = pd.read_csv("../data/raw/pbmc68k/68k_pbmc_barcodes_annotation.tsv", sep="\t")

In [4]:
# Show the cell barcodes that index the observations.
adata.obs_names

Index(['AAACATACACCCAA-1', 'AAACATACCCCTCA-1', 'AAACATACCGGAGA-1',
       'AAACATACTAACCG-1', 'AAACATACTCTTCA-1', 'AAACATACTGGATC-1',
       'AAACATACTGTCTT-1', 'AAACATACTTCTAC-1', 'AAACATTGCTGCTC-1',
       'AAACATTGCTTCGC-1',
       ...
       'TTTGACTGCTTTAC-8', 'TTTGACTGTATCGG-8', 'TTTGACTGTCGTTT-8',
       'TTTGACTGTGCTAG-8', 'TTTGCATGACACCA-8', 'TTTGCATGAGCCTA-8',
       'TTTGCATGCTAGCA-8', 'TTTGCATGCTGCAA-8', 'TTTGCATGGCTCCT-8',
       'TTTGCATGTGGTAC-8'],
      dtype='object', length=68579)

In [5]:
# Attach the cell-type label to every cell: the obs index (cell barcode) is
# matched against the "barcodes" column of the annotation table. A left join
# keeps every cell in adata, including any without an annotation.
# The "barcodes" column stays in the result; the next cell uses it to
# restore the barcode index.
adata.obs = pd.merge(
    adata.obs,
    annotations.loc[:, ["barcodes", "celltype"]],
    how="left",
    left_index=True,
    right_on="barcodes",
)


In [6]:
# Rename the label column to "cell_type" for consistency, then put the
# barcodes back as the row index. set_index() names the index "barcodes"
# and removes the column in the same step.
adata.obs = (
    adata.obs
    .rename(columns={"celltype": "cell_type"})
    .set_index("barcodes")
)


In [8]:
# Number of distinct cell-type labels; dropna=False counts a missing label
# as its own value, as unique() does.
adata.obs["cell_type"].nunique(dropna=False)

11

In [72]:
# Basic filtering and processing

# Quality filters: keep cells with at least 200 detected genes and genes
# detected in at least 3 cells.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Normalise every cell to a total of 10,000 counts, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Flag highly variable genes using mean / dispersion thresholds.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

# Subsetting an AnnData returns a *view*. Take an explicit copy so the
# in-place scaling below works on a real object instead of triggering the
# implicit view-to-actual conversion (and its warning).
adata = adata[:, adata.var.highly_variable].copy()

# Scale each gene to zero mean / unit variance, clipping values above 10.
sc.pp.scale(adata, max_value=10)

  view_to_actual(adata)


In [73]:
# Write the processed AnnData to disk in .h5ad format and report its final
# (cells, genes) shape.
adata.write_h5ad("../data/processed/pbmc68k.h5ad")
print("✅ PBMC68K dataset processed and saved:", adata.shape)


✅ PBMC68K dataset processed and saved: (68551, 1499)
