In [1]:
import scanpy as sc
import pandas as pd
from scipy import io

In [3]:
def _read_labels(path):
    """Return the whitespace-stripped lines of a text file, one label per line."""
    with open(path) as handle:
        return [row.strip() for row in handle]


# Step 1: load the Matrix Market file. The matrix is transposed before being
# wrapped in AnnData (so the file's columns become observations) and stored
# as CSR.
tpm = io.mmread("../data/raw/yan/E-GEOD-36552.expression_tpm.mtx")
adata = sc.AnnData(X=tpm.T.tocsr())

# Step 2: label cells (obs) from the *_cols file and genes (var) from the
# *_rows file.
# NOTE(review): each line is used verbatim after strip(); if the rows file has
# more than one tab-separated column the whole line becomes the gene name —
# confirm the file format.
adata.obs_names = _read_labels("../data/raw/yan/E-GEOD-36552.expression_tpm.mtx_cols")
adata.var_names = _read_labels("../data/raw/yan/E-GEOD-36552.expression_tpm.mtx_rows")

# Duplicate gene names are de-duplicated in place.
adata.var_names_make_unique()

adata


AnnData object with n_obs × n_vars = 115 × 28549

In [9]:
# Experiment-design table used for cell_type labelling: keep only the assay ID
# and the annotated cell type, under short column names.
metadata = (
    pd.read_csv("../data/raw/yan/ExpDesign-E-GEOD-36552.tsv", sep="\t")
    .loc[:, ["Assay", "Sample Characteristic[cell type]"]]
    .rename(
        columns={
            "Assay": "cell_id",
            "Sample Characteristic[cell type]": "cell_type",
        }
    )
)

# Number of distinct cell-type labels (a missing value counts as one label).
metadata["cell_type"].nunique(dropna=False)

6

In [None]:
# Sanity check: show the first five cell IDs from the expression matrix and
# from the metadata table so they can be compared by eye.
for id_source in (adata.obs_names, metadata["cell_id"]):
    print(set(id_source[:5]))

{'SRR445721', 'SRR445722', 'SRR445720', 'SRR445718', 'SRR445719'}
{'SRR445721', 'SRR445722', 'SRR445720', 'SRR445718', 'SRR445719'}


In [7]:
# Attach a cell_type label to every cell by looking its ID up in the metadata.
# Both sides are cast to str so the lookup compares like with like.
adata.obs_names = adata.obs_names.astype(str)
# assign() returns a new frame instead of writing into a column-subset frame.
metadata = metadata.assign(cell_id=metadata["cell_id"].astype(str))

# Index.map needs a uniquely indexed lookup; check up front for a clear error.
assert metadata["cell_id"].is_unique, "duplicate cell_id values in metadata"

# Map the cell types
cell_type_by_id = metadata.set_index("cell_id")["cell_type"]
adata.obs["cell_type"] = adata.obs_names.map(cell_type_by_id)

# Cells absent from the metadata (or without an annotation) would otherwise be
# silently labelled NaN and carried into the saved file.
unlabelled = adata.obs["cell_type"].isna().to_numpy()
assert not unlabelled.any(), (
    f"{int(unlabelled.sum())} cells have no cell_type, "
    f"e.g. {list(adata.obs_names[unlabelled][:5])}"
)

adata

AnnData object with n_obs × n_vars = 115 × 28549
    obs: 'cell_type'

In [8]:
# Preprocessing
# The sc.pp calls below modify `adata` in place, so this cell is meant to be
# run once after loading; running it again would normalise and log-transform
# the already processed values.
sc.pp.filter_cells(adata, min_genes=200)      # keep cells expressing >= 200 genes
sc.pp.filter_genes(adata, min_cells=3)        # keep genes expressed in >= 3 cells
sc.pp.normalize_total(adata, target_sum=1e4)  # rescale each cell to a total of 10,000
sc.pp.log1p(adata)                            # log(1 + x) transform
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

# Subsetting yields a view of the original object; take an explicit copy so
# that scaling does not write into a view (which forces an implicit copy and
# a warning).
adata = adata[:, adata.var.highly_variable].copy()

# Scale each gene, clipping scaled values at 10.
sc.pp.scale(adata, max_value=10)

  view_to_actual(adata)


In [9]:

# Persist the processed AnnData object as .h5ad, then report its final
# (n_obs, n_vars) shape.
processed_shape = adata.shape
adata.write("../data/processed/yan.h5ad")
print("✅ Yan dataset processed and saved:", processed_shape)


✅ Yan dataset processed and saved: (115, 8632)
