# Baseline modeling â€” DepMap expression & PRISM response

This notebook trains baseline regression models to predict PRISM drug response (AUC)
from DepMap gene expression profiles.

The modeling universe is defined by:

- 727 aligned cell lines
- 100 selected drugs (from Notebook 03)
- ~19,000 gene expression features
- Continuous AUC regression target

This notebook performs:

- Per-drug dataset assembly
- Train/validation/test split definition
- Baseline regression modeling
- Performance evaluation
- Preliminary feature attribution

## Imports and configurations 

In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
# Define project root and important directories
PROJECT_ROOT = Path("..").resolve()

DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
REPORTS_DIR = PROJECT_ROOT / "reports"
RESULTS_DIR = PROJECT_ROOT / "results"

RESULTS_DIR.mkdir(exist_ok=True, parents=True)

In [3]:
# Define target column and constants
TARGET_COL = "auc"

RANDOM_STATE = 42
TEST_SIZE = 0.2
VAL_SIZE = 0.2

## Load 

In [4]:
prism = pd.read_parquet(DATA_PROCESSED / "prism_auc_filtered.parquet")
drug_index = pd.read_parquet(DATA_PROCESSED / "drug_index.parquet")
expr = pd.read_parquet(DATA_PROCESSED / "depmap_expression_matched.parquet")
selected_drugs = pd.read_parquet(DATA_PROCESSED / "selected_drugs.parquet")

In [5]:
print("PRISM:", prism.shape)
print("Expression:", expr.shape)
print("Selected drugs:", selected_drugs.shape)

PRISM: (732066, 5)
Expression: (751, 19220)
Selected drugs: (100, 7)


In [6]:
# Function to build dataset for a specific drug
def build_drug_dataset(prism_df, expr_df, broad_id, target_col):
    y_df = prism_df[prism_df["broad_id"] == broad_id][["join_id", target_col]]

    expr_num = expr_df.select_dtypes(include=[np.number]).copy()
    df = y_df.merge(expr_num, on="join_id", how="inner")

    return df

In [7]:
def split_dataset(df, target_col, test_size, val_size, random_state):
    drop_cols = [target_col]
    if "join_id" in df.columns:
        drop_cols.append("join_id")

    X = df.drop(columns=drop_cols)
    y = df[target_col]

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    val_fraction = val_size / (1 - test_size)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=val_fraction,
        random_state=random_state
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

In [8]:
# Define models to evaluate
models = {
    "ridge": Ridge(alpha=1.0, random_state=RANDOM_STATE),
    "elasticnet": ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=RANDOM_STATE),
    "random_forest": RandomForestRegressor(
        n_estimators=200,
        random_state=RANDOM_STATE,
        n_jobs=-1
    ),
}

In [10]:
# Evaluate models on a subset of drugs
test_drugs = selected_drugs["broad_id"].head(5).tolist()

results = []

for broad_id in test_drugs:
    df_drug = build_drug_dataset(prism, expr, broad_id, TARGET_COL)

    X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(
        df_drug, TARGET_COL, TEST_SIZE, VAL_SIZE, RANDOM_STATE
    )

    for name, model in models.items():
        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model),
        ])

        pipe.fit(X_train, y_train)

        y_val_pred = pipe.predict(X_val)
        y_test_pred = pipe.predict(X_test)

        results.append({
            "broad_id": broad_id,
            "model": name,
            "n_samples": len(df_drug),
            "r2_val": r2_score(y_val, y_val_pred),
            "r2_test": r2_score(y_test, y_test_pred),
            "mae_test": mean_absolute_error(y_test, y_test_pred),
            "rmse_test": np.sqrt(mean_squared_error(y_test, y_test_pred)),
        })

results_df = pd.DataFrame(results)
results_df

KeyboardInterrupt: 

In [12]:
from sklearn.linear_model import Ridge, ElasticNet

models = {
    "ridge": Ridge(alpha=1.0),
    "elasticnet": ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000, random_state=RANDOM_STATE),
}


In [13]:
from sklearn.feature_selection import VarianceThreshold


In [14]:
pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True)),
    ("vt", VarianceThreshold(threshold=0.0)),
    ("model", model),
])


In [15]:
from sklearn.ensemble import RandomForestRegressor

models["rf_fast"] = RandomForestRegressor(
    n_estimators=50,
    max_features="sqrt",
    n_jobs=-1,
    random_state=RANDOM_STATE,
)


In [16]:
test_drugs = selected_drugs["broad_id"].head(5).tolist()
results = []

for i, broad_id in enumerate(test_drugs, start=1):
    print(f"[{i}/{len(test_drugs)}] Drug: {broad_id}")
    df_drug = build_drug_dataset(prism, expr, broad_id, TARGET_COL)

    X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(
        df_drug, TARGET_COL, TEST_SIZE, VAL_SIZE, RANDOM_STATE
    )

    for name, model in models.items():
        print(f"  - model: {name}")
        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model),
        ])

        pipe.fit(X_train, y_train)

        y_val_pred = pipe.predict(X_val)
        y_test_pred = pipe.predict(X_test)

        results.append({
            "broad_id": broad_id,
            "model": name,
            "n_samples": len(df_drug),
            "r2_val": r2_score(y_val, y_val_pred),
            "r2_test": r2_score(y_test, y_test_pred),
            "mae_test": mean_absolute_error(y_test, y_test_pred),
            "rmse_test": np.sqrt(mean_squared_error(y_test, y_test_pred)),
        })

results_df = pd.DataFrame(results)
results_df


[1/5] Drug: BRD-K95142244-001-01-5
  - model: ridge
  - model: elasticnet
  - model: rf_fast
[2/5] Drug: BRD-K50168500-001-07-9
  - model: ridge
  - model: elasticnet
  - model: rf_fast
[3/5] Drug: BRD-K33610132-001-02-9
  - model: ridge
  - model: elasticnet
  - model: rf_fast
[4/5] Drug: BRD-A70858459-001-01-7
  - model: ridge
  - model: elasticnet
  - model: rf_fast
[5/5] Drug: BRD-K77625799-001-07-7
  - model: ridge
  - model: elasticnet
  - model: rf_fast


Unnamed: 0,broad_id,model,n_samples,r2_val,r2_test,mae_test,rmse_test
0,BRD-K95142244-001-01-5,ridge,1655,0.228672,0.246818,0.123407,0.171468
1,BRD-K95142244-001-01-5,elasticnet,1655,-0.340551,0.172339,0.125295,0.179746
2,BRD-K95142244-001-01-5,rf_fast,1655,0.356381,0.301397,0.119756,0.165139
3,BRD-K50168500-001-07-9,ridge,1167,-0.530288,-0.329421,0.125735,0.168933
4,BRD-K50168500-001-07-9,elasticnet,1167,-0.047576,0.062922,0.100634,0.141831
5,BRD-K50168500-001-07-9,rf_fast,1167,-0.091798,0.037269,0.098647,0.143759
6,BRD-K33610132-001-02-9,ridge,1153,-1.041094,-0.2301,0.142637,0.233405
7,BRD-K33610132-001-02-9,elasticnet,1153,-0.404254,-0.067278,0.12546,0.21741
8,BRD-K33610132-001-02-9,rf_fast,1153,-0.472095,-0.088177,0.12598,0.219528
9,BRD-A70858459-001-01-7,ridge,1140,-1.284864,-0.908261,0.17967,0.248703
