In [2]:
import pandas as pd
import scanpy as sc
import bz2


In [5]:

# Helper function to load bz2 csv
def load_bz2_csv(file_path):
    with bz2.open(file_path, 'rt') as f:
        df = pd.read_csv(f, index_col=0)
    return df

# Example
# df_d0 = load_bz2_csv("/pscratch/lji226_uksr/DMNN/data/raw/klein/GSM1599495_ES_d0_biorep_techrep1.csv.bz2")
# print(df_d0.shape)

In [9]:
dfs = []

# Load multiple samples
dataset_mouse = {
    "../data/raw/klein/GSM1599494_ES_d0_main.csv": "d0",
    # "../data/raw/klein/GSM1599495_ES_d0_biorep_techrep1.csv.bz2": "d0_rep1",
    # "../data/raw/klein/GSM1599496_ES_d0_biorep_techrep2.csv.bz2": "d0_rep2",
    "../data/raw/klein/GSM1599497_ES_d2_LIFminus.csv": "d2",
    "../data/raw/klein/GSM1599498_ES_d4_LIFminus.csv": "d4",
    "../data/raw/klein/GSM1599499_ES_d7_LIFminus.csv": "d7",
}

dataset_human = {
    "../data/raw/klein/GSM1599500_K562_cells.csv": "K562"
}

klein_files = {
    "d0": ["GSM1599494_ES_d0_main.csv"],
    "d2": ["GSM1599497_ES_d2_LIFminus.csv"],
    "d4": ["GSM1599498_ES_d4_LIFminus.csv"],
    "d7": ["GSM1599499_ES_d7_LIFminus.csv"]
}


In [84]:
adata_list = []

# Merge
for fname, label in dataset_mouse.items():
    # with bz2.open(fname, "rt") as f:
    #     df = pd.read_csv(f, index_col=0)
    #df = load_bz2_csv(fname)
    
    df = pd.read_csv(fname, index_col=0)
    # Transpose to cells x genes
    df = df.T
    adata = sc.AnnData(df)
    #adata.obs['stage'] = label
    
    # # Set unique cell names
    adata.obs_names = [f"{label}_{i}" for i in range(adata.shape[0])]
    adata_list.append(adata)

adata_list


[AnnData object with n_obs × n_vars = 933 × 24174,
 AnnData object with n_obs × n_vars = 303 × 24174,
 AnnData object with n_obs × n_vars = 683 × 24174,
 AnnData object with n_obs × n_vars = 798 × 24174]

In [100]:
# Merge all samples
adata = adata_list[0].concatenate(adata_list[1:], batch_key="batch", batch_categories=list(dataset_mouse.values()))


# Step 4: Rename cells
#adata.columns = [f"cell{i+1}" for i in range(adata.shape[1])]


  adata = adata_list[0].concatenate(adata_list[1:], batch_key="batch", batch_categories=list(dataset_mouse.values()))


In [101]:

adata.obs.index

Index(['d0_0-d0', 'd0_1-d0', 'd0_2-d0', 'd0_3-d0', 'd0_4-d0', 'd0_5-d0',
       'd0_6-d0', 'd0_7-d0', 'd0_8-d0', 'd0_9-d0',
       ...
       'd7_788-d7', 'd7_789-d7', 'd7_790-d7', 'd7_791-d7', 'd7_792-d7',
       'd7_793-d7', 'd7_794-d7', 'd7_795-d7', 'd7_796-d7', 'd7_797-d7'],
      dtype='object', length=2717)

In [104]:
adata.obs.drop(columns=["batch"],inplace=True)

In [105]:
adata.obs

d0_0-d0
d0_1-d0
d0_2-d0
d0_3-d0
d0_4-d0
...
d7_793-d7
d7_794-d7
d7_795-d7
d7_796-d7
d7_797-d7


In [None]:
# alternate

import numpy as np
from pathlib import Path

# Step 1: Read the CSVs
klein_path = "../data/raw/klein"  # update if needed
d0 = pd.read_csv(f"{klein_path}/GSM1599494_ES_d0_main.csv", header=None)
d2 = pd.read_csv(f"{klein_path}/GSM1599497_ES_d2_LIFminus.csv", header=None)
d4 = pd.read_csv(f"{klein_path}/GSM1599498_ES_d4_LIFminus.csv", header=None)
d7 = pd.read_csv(f"{klein_path}/GSM1599499_ES_d7_LIFminus.csv", header=None)

# Step 2: Merge like in R: keep genes from d0, add only expression columns from d2-d7
d_all = pd.concat([
    d0,
    d2.iloc[:, 1:],
    d4.iloc[:, 1:],
    d7.iloc[:, 1:]
], axis=1)

# Step 3: Set gene names as index
d_all.index = d_all.iloc[:, 0]
d_all = d_all.iloc[:, 1:]

# Step 4: Rename cells
d_all.columns = [f"cell{i+1}" for i in range(d_all.shape[1])]

# Step 5: Create obs (annotation for each cell)
ann = pd.DataFrame({
    "stage": (
        ["d0"] * (d0.shape[1] - 1) +
        ["d2"] * (d2.shape[1] - 1) +
        ["d4"] * (d4.shape[1] - 1) +
        ["d7"] * (d7.shape[1] - 1)
    )
}, index=d_all.columns)

# Step 6: Create AnnData object
adata = sc.AnnData(X=d_all.transpose(), obs=ann)

# Step 7: Make gene names unique and save
adata.obs_names_make_unique()



In [78]:
adata

AnnData object with n_obs × n_vars = 2717 × 24175
    obs: 'stage'

In [58]:
# Preprocessing
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata = adata[:, adata.var.highly_variable]
sc.pp.scale(adata, max_value=10)

  view_to_actual(adata)


In [106]:
# Save
adata.write("../data/processed/klein_mouse.h5ad")
print("✅ Klein dataset processed:", adata.shape)


✅ Klein dataset processed: (2717, 24174)


In [36]:
# Human

df = load_bz2_csv("/pscratch/lji226_uksr/DMNN/data/raw/klein/GSM1599500_K562_cells.csv.bz2")

df = df.T
df['stage'] = "K562"

adata = []
adata = sc.AnnData(df.drop(columns='stage'))
adata.obs['stage'] = "K562"
adata

AnnData object with n_obs × n_vars = 239 × 25435
    obs: 'stage'

In [None]:
# Basic filtering and processing
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata = adata[:, adata.var.highly_variable]
sc.pp.scale(adata, max_value=10)


  view_to_actual(adata)


In [38]:

# Save processed file
adata.write("../data/processed/klein_human.h5ad")
print("✅ Klein Human dataset processed and saved:", adata.shape)

✅ Klein Human dataset processed and saved: (239, 6101)
