# 02 — Build modeling dataset (DepMap × PRISM)

## Objective

Construct a clean, reproducible modeling dataset by integrating:

- DepMap gene expression profiles (features)  
- PRISM dose–response measurements (AUC as response variable)

This step produces the final dataset that will be used for downstream exploratory analysis and machine learning.

This notebook assumes that identifier mapping and join feasibility were validated in **Notebook 01**.

---

## Inputs

- DepMap RNA-seq expression matrix (log2(TPM + 1))  
- PRISM dose–response secondary screen  

---

## Outputs

- Filtered PRISM response table  
- Matched DepMap expression matrix  
- Drug eligibility index  
- Processed datasets saved under `data/processed/`  

---

## Final gate

This notebook concludes with a formal **GO / NO-GO decision** for downstream modeling.

---

## Imports & global configuration

In [1]:
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd

### Reproducibility

In [2]:
# Single seed for the whole notebook; seeding NumPy's global generator here
# makes any later np.random call repeatable across runs.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

### Project paths

In [3]:
# Every location is expressed relative to the directory one level above the
# notebook's working directory.
PROJECT_ROOT = Path("..")

DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
DATA_INTERIM = PROJECT_ROOT.joinpath("data", "interim")
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")

REPORTS_DIR = PROJECT_ROOT.joinpath("reports")

# Output folders are created up front (no error if they already exist).
for output_dir in (DATA_PROCESSED, REPORTS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

### Modeling parameters

In [4]:
# A drug is kept only if it was screened on at least this many distinct
# DepMap-mapped cell lines.
MIN_CELL_LINES_PER_DRUG: int = 200

# Name of the PRISM column used as the response variable.
TARGET_COL: str = "auc"

## File fingerprints (reproducibility)

In [5]:
def compute_sha256(filepath, chunk_size=8192):
    """Return the hexadecimal SHA-256 digest of the file at ``filepath``.

    The file is opened in binary mode and consumed ``chunk_size`` bytes at a
    time, so the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()

In [6]:
# Raw inputs expected under DATA_RAW.
depmap_expr_path = DATA_RAW.joinpath("OmicsExpressionTPMLogp1HumanProteinCodingGenes.csv")
prism_resp_path = DATA_RAW.joinpath("prism-repurposing-20q2-secondary-screen-dose-response-curve-parameters.csv")

# Stop immediately if either raw file is missing.
for required_path in (depmap_expr_path, prism_resp_path):
    assert required_path.exists(), f"File not found: {required_path}"

In [7]:
# Provenance record for the two raw inputs: file name plus SHA-256 digest,
# stamped with the time the fingerprints were computed.
fingerprints = {
    # Timezone-aware UTC timestamp: datetime.utcnow() is deprecated and
    # returns a naive value, which would serialise without any UTC marker.
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "depmap_expression": {
        "file": depmap_expr_path.name,
        "sha256": compute_sha256(depmap_expr_path),
    },
    "prism_response": {
        "file": prism_resp_path.name,
        "sha256": compute_sha256(prism_resp_path),
    },
}

fingerprints

{'timestamp': '2026-01-20T12:58:01.072373',
 'depmap_expression': {'file': 'OmicsExpressionTPMLogp1HumanProteinCodingGenes.csv',
  'sha256': '9d7e64ebbcb2811fa5e0ecf56d952ee96ca9f1f2b6ead00bb17480c1c01848d1'},
 'prism_response': {'file': 'prism-repurposing-20q2-secondary-screen-dose-response-curve-parameters.csv',
  'sha256': '2ac69a21f1d681fe7447689262b82ca6e3dc90bfef0bd96eb5479b96f424e43d'}}

In [8]:
fingerprint_path = REPORTS_DIR / "02_file_fingerprints.json"

# Serialise to a string first, then write it out in a single call.
fingerprint_path.write_text(json.dumps(fingerprints, indent=2))

print(f"Fingerprints saved to: {fingerprint_path}")


Fingerprints saved to: ..\reports\02_file_fingerprints.json


## Load PRISM response and DepMap identifiers

In [9]:
# Only the identifier columns and the response column are read from the
# PRISM file; everything else is skipped at parse time.
prism_cols = ["depmap_id", "broad_id", "name", TARGET_COL]
prism = pd.read_csv(prism_resp_path, usecols=prism_cols)

prism.head()

Unnamed: 0,broad_id,depmap_id,auc,name
0,BRD-K36949735-001-01-1,ACH-000948,0.989373,anlotinib
1,BRD-K36949735-001-01-1,ACH-000011,0.988011,anlotinib
2,BRD-K36949735-001-01-1,ACH-000026,0.958743,anlotinib
3,BRD-K36949735-001-01-1,ACH-000323,0.814224,anlotinib
4,BRD-K36949735-001-01-1,ACH-000905,0.830589,anlotinib


In [None]:
# Read just the ModelID column of the expression file: at this stage only
# the identifiers are needed, not the expression values.
depmap_id_cols = ["ModelID"]
depmap_ids = pd.read_csv(depmap_expr_path, usecols=depmap_id_cols)

depmap_ids.head()

Unnamed: 0,ModelID
0,ACH-001113
1,ACH-001289
2,ACH-001339
3,ACH-001619
4,ACH-001979


## Normalize identifiers and create `join_id`

In [11]:
def normalize_id(x):
    """Return ``x`` as a string with surrounding whitespace removed.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    as ``np.nan`` instead of being stringified.
    """
    return np.nan if pd.isna(x) else str(x).strip()

In [12]:
# Derive the shared `join_id` key in both tables from their respective
# cell-line identifier columns.
for frame, id_column in ((prism, "depmap_id"), (depmap_ids, "ModelID")):
    frame["join_id"] = frame[id_column].apply(normalize_id)

In [13]:
# Distinct (non-missing) identifiers on each side of the join.
print("PRISM join_id unique:", prism["join_id"].nunique())
print("DepMap join_id unique:", depmap_ids["join_id"].nunique())

# Identifiers present in both tables.
shared = set(prism["join_id"].dropna()) & set(depmap_ids["join_id"].dropna())
print("Shared join_id:", len(shared))

PRISM join_id unique: 737
DepMap join_id unique: 1699
Shared join_id: 727


## Filter PRISM to DepMap-mapped cell lines and select eligible drugs

In [14]:
# Filter PRISM to only include cell lines present in DepMap
depmap_join_set = set(depmap_ids["join_id"].dropna())

prism_mapped = prism[prism["join_id"].isin(depmap_join_set)].copy()

print("PRISM rows (original):", len(prism))
print("PRISM rows (mapped to DepMap):", len(prism_mapped))
print("PRISM unique cell lines (mapped):", prism_mapped["join_id"].nunique())

PRISM rows (original): 753778
PRISM rows (mapped to DepMap): 738001
PRISM unique cell lines (mapped): 727


In [18]:
# Count number of cell lines per drug
drug_counts = (
    prism_mapped.groupby(["broad_id", "name"])["join_id"]
    .nunique()
    .reset_index(name="n_cell_lines")
    .sort_values("n_cell_lines", ascending=False)
)

print(drug_counts.head(10))

                    broad_id          name  n_cell_lines
1518  BRD-K99113996-001-02-0       AZD2014           719
1204  BRD-K73838513-003-05-5    cinacalcet           716
783   BRD-K42805893-001-04-9   osimertinib           714
1471  BRD-K95142244-001-01-5   talazoparib           711
1110  BRD-K67844266-003-01-9  pevonedistat           711
808   BRD-K44227013-001-06-4     ponatinib           711
666   BRD-K33622447-066-01-9   abemaciclib           708
1484  BRD-K96123349-236-02-8     brequinar           708
321   BRD-K08799216-001-05-3     pelitinib           708
980   BRD-K58529924-001-01-5        ONC201           708


In [16]:
# Filter drugs based on minimum number of cell lines
eligible_drugs = drug_counts[drug_counts["n_cell_lines"] >= MIN_CELL_LINES_PER_DRUG].copy()

print("Eligible drugs:", len(eligible_drugs))
print("Min n_cell_lines among eligible:", eligible_drugs["n_cell_lines"].min() if len(eligible_drugs) else None)
print("Max n_cell_lines among eligible:", eligible_drugs["n_cell_lines"].max() if len(eligible_drugs) else None)

Eligible drugs: 1528
Min n_cell_lines among eligible: 224
Max n_cell_lines among eligible: 719


In [17]:
# Filter PRISM data to only include eligible drugs
eligible_broad_ids = set(eligible_drugs["broad_id"])

prism_filtered = prism_mapped[prism_mapped["broad_id"].isin(eligible_broad_ids)].copy()

print("PRISM rows after drug filtering:", len(prism_filtered))
print("Unique cell lines after drug filtering:", prism_filtered["join_id"].nunique())
print("Unique drugs after filtering:", prism_filtered["broad_id"].nunique())


PRISM rows after drug filtering: 732066
Unique cell lines after drug filtering: 727
Unique drugs after filtering: 1528


## Build drug eligibility index and response statistics

In [20]:
# Build drug index with statistics
drug_index = (
    prism_filtered
    .groupby(["broad_id", "name"])
    .agg(
        n_cell_lines=("join_id", "nunique"),
        auc_mean=(TARGET_COL, "mean"),
        auc_std=(TARGET_COL, "std"),
        auc_min=(TARGET_COL, "min"),
        auc_max=(TARGET_COL, "max"),
    )
    .reset_index()
    .sort_values("n_cell_lines", ascending=False)
)

print(drug_index.head(10))

                    broad_id          name  n_cell_lines  auc_mean   auc_std  \
1507  BRD-K99113996-001-02-0       AZD2014           719  0.678668  0.087412   
1195  BRD-K73838513-003-05-5    cinacalcet           716  0.945610  0.114871   
777   BRD-K42805893-001-04-9   osimertinib           714  0.876731  0.116937   
1460  BRD-K95142244-001-01-5   talazoparib           711  0.684250  0.195109   
1103  BRD-K67844266-003-01-9  pevonedistat           711  0.672226  0.138740   
802   BRD-K44227013-001-06-4     ponatinib           711  0.780973  0.096771   
1473  BRD-K96123349-236-02-8     brequinar           708  0.806018  0.114410   
662   BRD-K33622447-066-01-9   abemaciclib           708  0.827241  0.127096   
974   BRD-K58529924-001-01-5        ONC201           708  0.894621  0.079943   
317   BRD-K08799216-001-05-3     pelitinib           708  0.733743  0.112063   

       auc_min   auc_max  
1507  0.419984  1.293848  
1195  0.624586  1.902010  
777   0.411908  1.609396  
1460  0.117

In [21]:
# Summary statistics of drug index
print("Number of drugs in index:", len(drug_index))
print("Min n_cell_lines:", drug_index["n_cell_lines"].min())
print("Max n_cell_lines:", drug_index["n_cell_lines"].max())
print("AUC mean range:", drug_index["auc_mean"].min(), "-", drug_index["auc_mean"].max())

Number of drugs in index: 1528
Min n_cell_lines: 224
Max n_cell_lines: 719
AUC mean range: 0.03283881436972296 - 1.547523572577436


## Save processed PRISM datasets

In [25]:
# Save filtered PRISM dataset
prism_filtered_path = DATA_PROCESSED / "prism_auc_filtered.parquet"

prism_filtered.to_parquet(prism_filtered_path, index=False)

print(f"Saved PRISM filtered dataset to: {prism_filtered_path}")

Saved PRISM filtered dataset to: ..\data\processed\prism_auc_filtered.parquet


In [26]:
# Save drug index dataset
drug_index_path = DATA_PROCESSED / "drug_index.parquet"

drug_index.to_parquet(drug_index_path, index=False)

print(f"Saved drug index to: {drug_index_path}")

Saved drug index to: ..\data\processed\drug_index.parquet


In [27]:
# Save summary statistics
summary = {
    "timestamp": datetime.utcnow().isoformat(),
    "n_prism_rows_original": int(len(prism)),
    "n_prism_rows_mapped": int(len(prism_mapped)),
    "n_prism_rows_filtered": int(len(prism_filtered)),
    "n_unique_cell_lines": int(prism_filtered["join_id"].nunique()),
    "n_unique_drugs": int(prism_filtered["broad_id"].nunique()),
    "min_cell_lines_per_drug": int(drug_index["n_cell_lines"].min()),
    "max_cell_lines_per_drug": int(drug_index["n_cell_lines"].max()),
    "target_column": TARGET_COL,
    "min_threshold_per_drug": MIN_CELL_LINES_PER_DRUG,
}

summary_path = REPORTS_DIR / "02_build_dataset_summary.json"

with open(summary_path, "w") as f:
    json.dump(summary, f, indent=2)

print(f"Summary saved to: {summary_path}")

Summary saved to: ..\reports\02_build_dataset_summary.json


## Load DepMap expression data (matched cell lines only)

In [28]:
# Full expression table: every row and column of the DepMap file is read
# into memory here, unlike the earlier identifier-only read.
depmap_expr = pd.read_csv(filepath_or_buffer=depmap_expr_path)

depmap_expr.shape

(1754, 19221)

In [None]:
# Tag every expression row with the normalised identifier used to join
# against PRISM.
depmap_expr["join_id"] = depmap_expr["ModelID"].apply(normalize_id)

# Cell lines that survived the PRISM drug filter.
eligible_join_ids = set(prism_filtered["join_id"].unique())

depmap_expr_matched = depmap_expr[depmap_expr["join_id"].isin(eligible_join_ids)].copy()
n_matched_rows = len(depmap_expr_matched)

# The expression file has more rows than distinct ModelIDs (1754 rows vs
# 1699 unique IDs in the recorded outputs), so one cell line can carry
# several expression profiles. Keep exactly one row per join_id so the
# feature matrix has one sample per cell line.
# NOTE(review): assumes DepMap flags the canonical profile with
# IsDefaultEntryForModel == "Yes"; when that column is absent (or no row is
# flagged for a model) the first profile in file order is kept. Confirm this
# choice against the DepMap release notes.
if "IsDefaultEntryForModel" in depmap_expr_matched.columns:
    is_preferred = (
        depmap_expr_matched["IsDefaultEntryForModel"]
        .astype(str)
        .str.strip()
        .str.lower()
        .isin({"yes", "true", "1"})
        .to_numpy()
    )
else:
    is_preferred = np.ones(n_matched_rows, dtype=bool)

# Rank rows on a small helper frame (preferred profiles first, then file
# order) and keep the first row per join_id; positions are re-sorted so the
# surviving rows stay in their original order.
profile_rank = pd.DataFrame(
    {
        "join_id": depmap_expr_matched["join_id"].to_numpy(),
        "not_preferred": ~is_preferred,
        "pos": np.arange(n_matched_rows),
    }
)
kept_positions = (
    profile_rank
    .sort_values(["not_preferred", "pos"])
    .drop_duplicates(subset="join_id", keep="first")["pos"]
    .sort_values()
    .to_numpy()
)
depmap_expr_matched = depmap_expr_matched.iloc[kept_positions].copy()

assert depmap_expr_matched["join_id"].is_unique, "Duplicate join_id remains after de-duplication"

print("DepMap rows (original):", len(depmap_expr))
print("DepMap rows (matched):", n_matched_rows)
print("Unique matched cell lines:", depmap_expr_matched["join_id"].nunique())
print("Duplicate expression profiles dropped:", n_matched_rows - len(depmap_expr_matched))

In [None]:
# Columns that identify a row rather than measure expression.
id_cols = ["ModelID", "join_id"]

# Candidate feature columns: everything except the identifiers and the
# "Unnamed: N" columns pandas creates for headerless CSV columns (a saved
# row index, never a gene).
candidate_cols = [
    c for c in depmap_expr_matched.columns
    if c not in id_cols and not str(c).startswith("Unnamed:")
]

# Only numeric columns can be expression features; any remaining text
# column is metadata and is reported rather than silently kept.
# NOTE(review): numeric metadata columns, if the file has any, would still
# pass this filter — confirm against the file header.
numeric_cols = set(
    depmap_expr_matched[candidate_cols].select_dtypes(include="number").columns
)
gene_cols = [c for c in candidate_cols if c in numeric_cols]

non_numeric_cols = [c for c in candidate_cols if c not in numeric_cols]
if non_numeric_cols:
    print("Non-numeric columns excluded from features:", non_numeric_cols)

X_expr = (
    depmap_expr_matched
    .set_index("join_id")[gene_cols]
)

print("Expression matrix shape:", X_expr.shape)
# A non-unique index means at least one cell line contributes several rows.
print("Expression matrix index unique:", X_expr.index.is_unique)