In [5]:
import os
import glob
from pathlib import Path
from typing import Optional, List

import torch
import pandas as pd
# from .sae.inference import load_sae
import sys
# if '/maiziezhou_lab2/yunfei/Projects/interpTFM/sae' not in sys.path:
sys.path.insert(0, '/maiziezhou_lab2/yunfei/Projects/interpTFM')
from sae.inference import load_sae

# ------------------------------
# Config
# ------------------------------
GENE_IDS_ROOT = "/maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/gene_ids"
ACTS_ROOT     = "/maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4"
OUT_FILENAME  = "sae_latents.parquet"     # per-shard output
ACTS_FILENAME = "activations.pt"          # per-shard input
META_FILENAME = "cell_gene_pairs.txt"     # per-shard metadata (cell_id, gene_id)

# SAE checkpoint
CKPT_PATH     = "/maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer4/ae_normalized.pt"

# Encoding options
BATCH_SIZE    = 4096
CLS_ONLY      = False      # True => encode only <cls> token rows per shard
SKIP_EXISTING = True       # True => skip shard if OUT_FILENAME exists


# ------------------------------
# Helpers
# ------------------------------
def find_shards(root: str) -> List[Path]:
    return sorted([Path(p) for p in glob.glob(os.path.join(root, "shard_*")) if os.path.isdir(p)])

def read_meta(path: Path) -> pd.DataFrame:
    # Assumes 2 columns: cell_id, gene_id (TSV or whitespace)
    try:
        df = pd.read_csv(path, sep="\t", header=None, names=["cell_id", "gene_id"])
    except Exception:
        df = pd.read_csv(path, sep=r"\s+", header=None, names=["cell_id", "gene_id"])
    return df

def pick_cls_indices(meta_df: pd.DataFrame) -> torch.Tensor:
    # accept multiple possible <cls> spellings
    cls_mask = meta_df["gene_id"].astype(str).isin({"<cls>", "<CLS>", "CLS", "cls"})
    return torch.tensor(cls_mask.values.nonzero()[0], dtype=torch.long)

@torch.no_grad()
def sae_encode_batched(X: torch.Tensor, model: torch.nn.Module, batch_size: int, device: torch.device) -> torch.Tensor:
    outs = []
    for i in range(0, X.shape[0], batch_size):
        xb = X[i:i+batch_size].to(device, non_blocking=True)
        # Try common SAE interfaces
        if hasattr(model, "encode"):
            z = model.encode(xb)
        elif hasattr(model, "encoder"):
            z = model.encoder(xb)
        else:
            out = model(xb)
            if isinstance(out, (tuple, list)) and len(out) >= 2:
                # heuristically pick the non-input-dim output as latents
                z = out[0] if out[0].ndim == 2 and out[0].shape[1] != xb.shape[1] else out[1]
            else:
                raise RuntimeError("SAE object lacks encode/encoder; customize sae_encode_batched for your model.")
        outs.append(z.detach().cpu())
    return torch.cat(outs, dim=0)


# ------------------------------
# Main
# ------------------------------
def encode_all_shards(
    gene_ids_root: str = GENE_IDS_ROOT,
    acts_root: str = ACTS_ROOT,
    ckpt_path: str = CKPT_PATH,
    acts_filename: str = ACTS_FILENAME,
    meta_filename: str = META_FILENAME,
    out_filename: str = OUT_FILENAME,
    batch_size: int = BATCH_SIZE,
    cls_only: bool = CLS_ONLY,
    skip_existing: bool = SKIP_EXISTING,
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] loading SAE from {ckpt_path} on {device} ...")
    sae = load_sae(ckpt_path).to(device).eval()

    gid_shards = find_shards(gene_ids_root)
    act_shards = {p.name: p for p in find_shards(acts_root)}
    print(f"[INFO] found {len(gid_shards)} gene_id shards; {len(act_shards)} activation shards.")

    for gid_shard in gid_shards:
        shard_id = gid_shard.name
        acts_dir = act_shards.get(shard_id)
        if acts_dir is None:
            print(f"[WARN] no activations dir for {shard_id}; skipping.")
            continue

        out_path = acts_dir / out_filename
        if skip_existing and out_path.exists():
            print(f"[SKIP] {shard_id}: {out_filename} already exists.")
            continue

        meta_path = gid_shard / meta_filename
        acts_path = acts_dir / acts_filename
        if not meta_path.exists():
            print(f"[WARN] {shard_id}: missing {meta_filename}; skipping.")
            continue
        if not acts_path.exists():
            print(f"[WARN] {shard_id}: missing {acts_filename}; skipping.")
            continue

        print(f"[INFO] {shard_id}: loading meta + activations ...")
        meta_df = read_meta(meta_path)
        X = torch.load(str(acts_path))  # [N_tokens, H]
        if not isinstance(X, torch.Tensor):
            raise TypeError(f"{acts_path} did not load as a torch.Tensor.")
        if meta_df.shape[0] != X.shape[0]:
            raise AssertionError(f"{shard_id}: meta rows ({meta_df.shape[0]}) != activations rows ({X.shape[0]})")

        if cls_only:
            cls_idx = pick_cls_indices(meta_df)
            if cls_idx.numel() == 0:
                print(f"[WARN] {shard_id}: no <cls> tokens; skipping.")
                continue
            # Subselect both meta and activations to <cls> rows
            meta_sub = meta_df.iloc[cls_idx.numpy()].reset_index(drop=True)
            X_sub = X.index_select(0, cls_idx)
        else:
            meta_sub = meta_df.reset_index(drop=True)
            X_sub = X

        print(f"[INFO] {shard_id}: encoding {X_sub.shape[0]} tokens to SAE latents ...")
        Z = sae_encode_batched(X_sub, sae, batch_size=batch_size, device=device)  # [N_rows, K_latents]
        n_latents = Z.shape[1]

        # Build a DataFrame with integer latent columns
        df_out = pd.DataFrame(Z.numpy(), index=meta_sub.index, columns=list(range(n_latents)))
        # Carry along identifiers (nice for debugging / downstream joins)
        df_out.insert(0, "cell_id", meta_sub["cell_id"].values)
        df_out.insert(1, "gene_id", meta_sub["gene_id"].values)

        out_path.parent.mkdir(parents=True, exist_ok=True)
        df_out.to_parquet(out_path)
        print(f"[OK]  {shard_id}: wrote {out_path}  (shape={df_out.shape})")

        # Free tensors early
        del X, X_sub, Z, df_out, meta_df

    print("[DONE] all shards processed.")


if __name__ == "__main__":
    encode_all_shards()


[INFO] loading SAE from /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer4/ae_normalized.pt on cuda ...




[INFO] found 60 gene_id shards; 60 activation shards.
[INFO] shard_0: loading meta + activations ...
[INFO] shard_0: encoding 516880 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_0: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_0/sae_latents.parquet  (shape=(516880, 4098))
[INFO] shard_1: loading meta + activations ...
[INFO] shard_1: encoding 509497 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_1: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_1/sae_latents.parquet  (shape=(509497, 4098))
[INFO] shard_10: loading meta + activations ...
[INFO] shard_10: encoding 510474 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_10: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_10/sae_latents.parquet  (shape=(510474, 4098))
[INFO] shard_11: loading meta + activations ...
[INFO] shard_11: encoding 540830 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_11: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_11/sae_latents.parquet  (shape=(540830, 4098))
[INFO] shard_12: loading meta + activations ...
[INFO] shard_12: encoding 493300 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_12: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_12/sae_latents.parquet  (shape=(493300, 4098))
[INFO] shard_13: loading meta + activations ...
[INFO] shard_13: encoding 511388 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_13: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_13/sae_latents.parquet  (shape=(511388, 4098))
[INFO] shard_14: loading meta + activations ...
[INFO] shard_14: encoding 485480 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_14: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_14/sae_latents.parquet  (shape=(485480, 4098))
[INFO] shard_15: loading meta + activations ...
[INFO] shard_15: encoding 507675 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_15: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_15/sae_latents.parquet  (shape=(507675, 4098))
[INFO] shard_16: loading meta + activations ...
[INFO] shard_16: encoding 523928 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_16: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_16/sae_latents.parquet  (shape=(523928, 4098))
[INFO] shard_17: loading meta + activations ...
[INFO] shard_17: encoding 524528 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_17: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_17/sae_latents.parquet  (shape=(524528, 4098))
[INFO] shard_18: loading meta + activations ...
[INFO] shard_18: encoding 526512 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_18: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_18/sae_latents.parquet  (shape=(526512, 4098))
[INFO] shard_19: loading meta + activations ...
[INFO] shard_19: encoding 471760 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_19: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_19/sae_latents.parquet  (shape=(471760, 4098))
[INFO] shard_2: loading meta + activations ...
[INFO] shard_2: encoding 503503 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_2: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_2/sae_latents.parquet  (shape=(503503, 4098))
[INFO] shard_20: loading meta + activations ...
[INFO] shard_20: encoding 486972 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_20: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_20/sae_latents.parquet  (shape=(486972, 4098))
[INFO] shard_21: loading meta + activations ...
[INFO] shard_21: encoding 536713 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_21: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_21/sae_latents.parquet  (shape=(536713, 4098))
[INFO] shard_22: loading meta + activations ...
[INFO] shard_22: encoding 475694 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_22: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_22/sae_latents.parquet  (shape=(475694, 4098))
[INFO] shard_23: loading meta + activations ...
[INFO] shard_23: encoding 493536 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_23: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_23/sae_latents.parquet  (shape=(493536, 4098))
[INFO] shard_24: loading meta + activations ...
[INFO] shard_24: encoding 503118 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_24: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_24/sae_latents.parquet  (shape=(503118, 4098))
[INFO] shard_25: loading meta + activations ...
[INFO] shard_25: encoding 484104 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_25: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_25/sae_latents.parquet  (shape=(484104, 4098))
[INFO] shard_26: loading meta + activations ...
[INFO] shard_26: encoding 509184 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_26: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_26/sae_latents.parquet  (shape=(509184, 4098))
[INFO] shard_27: loading meta + activations ...
[INFO] shard_27: encoding 522765 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_27: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_27/sae_latents.parquet  (shape=(522765, 4098))
[INFO] shard_28: loading meta + activations ...
[INFO] shard_28: encoding 485037 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_28: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_28/sae_latents.parquet  (shape=(485037, 4098))
[INFO] shard_29: loading meta + activations ...
[INFO] shard_29: encoding 534717 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_29: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_29/sae_latents.parquet  (shape=(534717, 4098))
[INFO] shard_3: loading meta + activations ...
[INFO] shard_3: encoding 505240 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_3: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_3/sae_latents.parquet  (shape=(505240, 4098))
[INFO] shard_30: loading meta + activations ...
[INFO] shard_30: encoding 506260 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_30: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_30/sae_latents.parquet  (shape=(506260, 4098))
[INFO] shard_31: loading meta + activations ...
[INFO] shard_31: encoding 493335 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_31: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_31/sae_latents.parquet  (shape=(493335, 4098))
[INFO] shard_32: loading meta + activations ...
[INFO] shard_32: encoding 527424 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_32: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_32/sae_latents.parquet  (shape=(527424, 4098))
[INFO] shard_33: loading meta + activations ...
[INFO] shard_33: encoding 504031 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_33: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_33/sae_latents.parquet  (shape=(504031, 4098))
[INFO] shard_34: loading meta + activations ...
[INFO] shard_34: encoding 541061 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_34: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_34/sae_latents.parquet  (shape=(541061, 4098))
[INFO] shard_35: loading meta + activations ...
[INFO] shard_35: encoding 531429 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_35: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_35/sae_latents.parquet  (shape=(531429, 4098))
[INFO] shard_36: loading meta + activations ...
[INFO] shard_36: encoding 521926 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_36: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_36/sae_latents.parquet  (shape=(521926, 4098))
[INFO] shard_37: loading meta + activations ...
[INFO] shard_37: encoding 481478 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_37: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_37/sae_latents.parquet  (shape=(481478, 4098))
[INFO] shard_38: loading meta + activations ...
[INFO] shard_38: encoding 516339 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_38: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_38/sae_latents.parquet  (shape=(516339, 4098))
[INFO] shard_39: loading meta + activations ...
[INFO] shard_39: encoding 540709 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_39: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_39/sae_latents.parquet  (shape=(540709, 4098))
[INFO] shard_4: loading meta + activations ...
[INFO] shard_4: encoding 497098 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_4: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_4/sae_latents.parquet  (shape=(497098, 4098))
[INFO] shard_40: loading meta + activations ...
[INFO] shard_40: encoding 536996 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_40: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_40/sae_latents.parquet  (shape=(536996, 4098))
[INFO] shard_41: loading meta + activations ...
[INFO] shard_41: encoding 506980 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_41: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_41/sae_latents.parquet  (shape=(506980, 4098))
[INFO] shard_42: loading meta + activations ...
[INFO] shard_42: encoding 523950 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_42: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_42/sae_latents.parquet  (shape=(523950, 4098))
[INFO] shard_43: loading meta + activations ...
[INFO] shard_43: encoding 512620 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_43: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_43/sae_latents.parquet  (shape=(512620, 4098))
[INFO] shard_44: loading meta + activations ...
[INFO] shard_44: encoding 513318 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_44: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_44/sae_latents.parquet  (shape=(513318, 4098))
[INFO] shard_45: loading meta + activations ...
[INFO] shard_45: encoding 514939 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_45: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_45/sae_latents.parquet  (shape=(514939, 4098))
[INFO] shard_46: loading meta + activations ...
[INFO] shard_46: encoding 516778 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_46: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_46/sae_latents.parquet  (shape=(516778, 4098))
[INFO] shard_47: loading meta + activations ...
[INFO] shard_47: encoding 484124 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_47: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_47/sae_latents.parquet  (shape=(484124, 4098))
[INFO] shard_48: loading meta + activations ...
[INFO] shard_48: encoding 515702 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_48: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_48/sae_latents.parquet  (shape=(515702, 4098))
[INFO] shard_49: loading meta + activations ...
[INFO] shard_49: encoding 534308 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_49: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_49/sae_latents.parquet  (shape=(534308, 4098))
[INFO] shard_5: loading meta + activations ...
[INFO] shard_5: encoding 511852 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_5: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_5/sae_latents.parquet  (shape=(511852, 4098))
[INFO] shard_50: loading meta + activations ...
[INFO] shard_50: encoding 482061 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_50: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_50/sae_latents.parquet  (shape=(482061, 4098))
[INFO] shard_51: loading meta + activations ...
[INFO] shard_51: encoding 484708 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_51: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_51/sae_latents.parquet  (shape=(484708, 4098))
[INFO] shard_52: loading meta + activations ...
[INFO] shard_52: encoding 511112 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_52: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_52/sae_latents.parquet  (shape=(511112, 4098))
[INFO] shard_53: loading meta + activations ...
[INFO] shard_53: encoding 508146 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_53: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_53/sae_latents.parquet  (shape=(508146, 4098))
[INFO] shard_54: loading meta + activations ...
[INFO] shard_54: encoding 516542 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_54: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_54/sae_latents.parquet  (shape=(516542, 4098))
[INFO] shard_55: loading meta + activations ...
[INFO] shard_55: encoding 511688 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_55: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_55/sae_latents.parquet  (shape=(511688, 4098))
[INFO] shard_56: loading meta + activations ...
[INFO] shard_56: encoding 485231 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_56: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_56/sae_latents.parquet  (shape=(485231, 4098))
[INFO] shard_57: loading meta + activations ...
[INFO] shard_57: encoding 530938 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_57: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_57/sae_latents.parquet  (shape=(530938, 4098))
[INFO] shard_58: loading meta + activations ...
[INFO] shard_58: encoding 521344 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_58: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_58/sae_latents.parquet  (shape=(521344, 4098))
[INFO] shard_59: loading meta + activations ...
[INFO] shard_59: encoding 521464 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_59: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_59/sae_latents.parquet  (shape=(521464, 4098))
[INFO] shard_6: loading meta + activations ...
[INFO] shard_6: encoding 482194 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_6: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_6/sae_latents.parquet  (shape=(482194, 4098))
[INFO] shard_7: loading meta + activations ...
[INFO] shard_7: encoding 553151 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_7: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_7/sae_latents.parquet  (shape=(553151, 4098))
[INFO] shard_8: loading meta + activations ...
[INFO] shard_8: encoding 491853 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_8: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_8/sae_latents.parquet  (shape=(491853, 4098))
[INFO] shard_9: loading meta + activations ...
[INFO] shard_9: encoding 492140 tokens to SAE latents ...


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


[OK]  shard_9: wrote /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_4/shard_9/sae_latents.parquet  (shape=(492140, 4098))
[DONE] all shards processed.


In [4]:
sys.path

['/maiziezhou_lab2/yunfei/Projects/interpTFM/sae',
 '/home/huy21/anaconda3/envs/scgpt/lib/python38.zip',
 '/home/huy21/anaconda3/envs/scgpt/lib/python3.8',
 '/home/huy21/anaconda3/envs/scgpt/lib/python3.8/lib-dynload',
 '',
 '/home/huy21/.local/lib/python3.8/site-packages',
 '/home/huy21/anaconda3/envs/scgpt/lib/python3.8/site-packages']