<a href="https://colab.research.google.com/github/navidh86/perturbseq-10701/blob/master/baseline_classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# # ONLY FOR COLAB
# !git clone https://github.com/navidh86/perturbseq-10701.git
# %cd ./perturbseq-10701
# !pip install fastparquet tqdm

In [3]:
# Imports and device
import os
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from tqdm import tqdm

from data.reference_data_classification import get_dataloader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

Device: cuda


In [4]:
# # Create dataloaders (point to data/ paths explicitly)
# train_loader = get_dataloader(
#     parquet_path='data/tf_gene_expression_labeled.parquet',
#     tf_sequences_path='data/tf_sequences.pkl',
#     gene_sequences_path='data/gene_sequences_4000bp.pkl',
#     batch_size=128,
#     type='train',
#     majority_fraction=0.01
# )
# test_loader = get_dataloader(
#     parquet_path='data/tf_gene_expression_labeled.parquet',
#     tf_sequences_path='data/tf_sequences.pkl',
#     gene_sequences_path='data/gene_sequences_4000bp.pkl',
#     batch_size=256,
#     type='test',
#     majority_fraction=0.01
# )

# print('Train size:', len(train_loader.dataset))
# print('Test size :', len(test_loader.dataset))

# Create dataloaders (point to data/ paths explicitly)
train_loader = get_dataloader(
    parquet_path='data/tf_gene_expression_labeled_v2.parquet',
    tf_sequences_path='data/tf_sequences.pkl',
    gene_sequences_path='data/gene_sequences_4000bp.pkl',
    batch_size=128,
    type='train',
    majority_fraction=0.005
)

validation_loader = get_dataloader(
    parquet_path='data/tf_gene_expression_labeled_v2.parquet',
    tf_sequences_path='data/tf_sequences.pkl',
    gene_sequences_path='data/gene_sequences_4000bp.pkl',
    batch_size=256,
    type='val',
    majority_fraction=0.005
)

test_loader = get_dataloader(
    parquet_path='data/tf_gene_expression_labeled_v2.parquet',
    tf_sequences_path='data/tf_sequences.pkl',
    gene_sequences_path='data/gene_sequences_4000bp.pkl',
    batch_size=256,
    type='test',
    majority_fraction=0.005
)

print('Train size:', len(train_loader.dataset))
print('Validation size :', len(validation_loader.dataset))
print('Test size :', len(test_loader.dataset))

Train size: 10845
Validation size : 2324
Test size : 2325


In [5]:
train_ds = train_loader.dataset
validation_ds = validation_loader.dataset
test_ds = test_loader.dataset

combined_df = pd.concat([train_ds.df, validation_ds.df, test_ds.df]).reset_index(drop=True)

# unique names from combined set
tf_names = combined_df['tf_name'].unique().tolist()
gene_names = combined_df['gene_name'].unique().tolist()

# create mappings
tf_to_id = {n: i for i, n in enumerate(tf_names)}
gene_to_id = {n: i for i, n in enumerate(gene_names)}

num_tfs = len(tf_to_id)
num_genes = len(gene_to_id)
# Use classes from training split
num_classes = len(train_ds.df['expression_label'].unique())

print('Unique TFs (combined):', num_tfs)
print('Unique Genes (combined):', num_genes)
print('Num classes:', num_classes)

Unique TFs (combined): 223
Unique Genes (combined): 4539
Num classes: 3


In [6]:
import numpy as np

def prepare_data_for_xgboost(loader, tf_to_id, gene_to_id):
    """Extract raw TF and Gene IDs for XGBoost."""
    X = []
    y = []
    
    for batch_x, batch_y in loader:
        for item in batch_x:
            tf_id = tf_to_id[item['tf_name']]
            gene_id = gene_to_id[item['gene_name']]
            X.append([tf_id, gene_id])
        
        y.extend(batch_y.numpy())
    
    return np.array(X), np.array(y)

print("Preparing training data (raw IDs)...")
X_train, y_train = prepare_data_for_xgboost(train_loader, tf_to_id, gene_to_id)
print(f"Train shape: X={X_train.shape}, y={y_train.shape}")

print("Preparing validation data...")
X_val, y_val = prepare_data_for_xgboost(validation_loader, tf_to_id, gene_to_id)
print(f"Val shape: X={X_val.shape}, y={y_val.shape}")

print("Preparing test data...")
X_test, y_test = prepare_data_for_xgboost(test_loader, tf_to_id, gene_to_id)
print(f"Test shape: X={X_test.shape}, y={y_test.shape}")

Preparing training data (raw IDs)...
Train shape: X=(10845, 2), y=(10845,)
Preparing validation data...
Val shape: X=(2324, 2), y=(2324,)
Preparing test data...
Test shape: X=(2325, 2), y=(2325,)


In [7]:
# from sklearn.preprocessing import OneHotEncoder
# import numpy as np

# def prepare_data_onehot_separate(loader, tf_to_id, gene_to_id):
#     """One-hot encode TF and Gene IDs separately."""
#     tf_ids = []
#     gene_ids = []
#     y = []
    
#     for batch_x, batch_y in loader:
#         for item in batch_x:
#             tf_id = tf_to_id[item['tf_name']]
#             gene_id = gene_to_id[item['gene_name']]
#             tf_ids.append([tf_id])
#             gene_ids.append([gene_id])
        
#         y.extend(batch_y.numpy())
    
#     tf_ids = np.array(tf_ids)
#     gene_ids = np.array(gene_ids)
    
#     # Create separate encoders
#     tf_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
#     gene_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    
#     # Encode separately
#     tf_onehot = tf_encoder.fit_transform(tf_ids)
#     gene_onehot = gene_encoder.fit_transform(gene_ids)
    
#     # Concatenate
#     X_onehot = np.hstack([tf_onehot, gene_onehot])
    
#     return X_onehot, np.array(y), tf_encoder, gene_encoder

# # Prepare training data with separate one-hot encoding
# print("Preparing training data with separate one-hot encoding...")
# X_train, y_train, tf_encoder, gene_encoder = prepare_data_onehot_separate(train_loader, tf_to_id, gene_to_id)
# print(f"Train shape after one-hot: X={X_train.shape}, y={y_train.shape}")
# print(f"  TF features: {tf_encoder.n_features_in_} -> {len(tf_encoder.get_feature_names_out())}")
# print(f"  Gene features: {gene_encoder.n_features_in_} -> {len(gene_encoder.get_feature_names_out())}")

# # For validation and test, use the same encoders
# def prepare_data_with_separate_encoders(loader, tf_to_id, gene_to_id, tf_encoder, gene_encoder):
#     tf_ids = []
#     gene_ids = []
#     y = []
    
#     for batch_x, batch_y in loader:
#         for item in batch_x:
#             tf_id = tf_to_id[item['tf_name']]
#             gene_id = gene_to_id[item['gene_name']]
#             tf_ids.append([tf_id])
#             gene_ids.append([gene_id])
        
#         y.extend(batch_y.numpy())
    
#     tf_ids = np.array(tf_ids)
#     gene_ids = np.array(gene_ids)
    
#     # Transform using fitted encoders
#     tf_onehot = tf_encoder.transform(tf_ids)
#     gene_onehot = gene_encoder.transform(gene_ids)
    
#     # Concatenate
#     X_onehot = np.hstack([tf_onehot, gene_onehot])
    
#     return X_onehot, np.array(y)

# X_val, y_val = prepare_data_with_separate_encoders(validation_loader, tf_to_id, gene_to_id, tf_encoder, gene_encoder)
# X_test, y_test = prepare_data_with_separate_encoders(test_loader, tf_to_id, gene_to_id, tf_encoder, gene_encoder)

# print(f"Val shape: X={X_val.shape}, y={y_val.shape}")
# print(f"Test shape: X={X_test.shape}, y={y_test.shape}")

In [8]:
print(X_train[0])

[   9 1308]


In [9]:
#!pip install xgboost

In [10]:
# XGBoost and metrics
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score
import json

In [11]:
# Cell - Train XGBoost with STRONGER regularization
print("\nTraining XGBoost with stronger regularization...")
xgb_model = XGBClassifier(
    n_estimators=100,          # Reduce from 200
    max_depth=4,               # Reduce from 8 (shallower trees)
    learning_rate=0.05,        # Reduce from 0.1 (slower learning)
    subsample=0.6,             # Reduce from 0.8 (more aggressive sampling)
    colsample_bytree=0.6,      # Reduce from 0.8
    colsample_bylevel=0.6,     # Sample features per level
    colsample_bynode=0.6,      # Sample features per node
    reg_alpha=1.0,             # Increase L1 (was 0.1)
    reg_lambda=5.0,            # Increase L2 (was 1.0)
    min_child_weight=5,        # Minimum samples in leaf (NEW)
    gamma=0.1,                 # Minimum loss reduction (NEW)
    random_state=10701,
    n_jobs=-1,
    tree_method='hist',
    enable_categorical=True,
    max_cat_to_onehot=1,
    eval_metric='mlogloss',
    early_stopping_rounds=20,  # Increase from 15
    verbosity=1
)

# Fit with validation set for early stopping
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=10
)
print("Training complete!")
print(f"Best iteration: {xgb_model.best_iteration}")
print(f"Best score: {xgb_model.best_score:.4f}")


Training XGBoost with stronger regularization...
[0]	validation_0-mlogloss:1.09250
[10]	validation_0-mlogloss:1.05954
[20]	validation_0-mlogloss:1.06331
[30]	validation_0-mlogloss:1.09487
Training complete!
Best iteration: 10
Best score: 1.0595


In [12]:
# Evaluate on all sets
for name, X, y in [('Train', X_train, y_train), 
                    ('Val', X_val, y_val), 
                    ('Test', X_test, y_test)]:
    y_pred = xgb_model.predict(X)
    acc = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='macro')
    print(f"\n=== {name} Set ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"Macro F1: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y, y_pred, digits=4))


=== Train Set ===
Accuracy: 0.6620
Macro F1: 0.6488
Classification Report:
              precision    recall  f1-score   support

           0     0.7524    0.4568    0.5684      3573
           1     0.6313    0.9724    0.7656      3409
           2     0.6517    0.5778    0.6125      3863

    accuracy                         0.6620     10845
   macro avg     0.6785    0.6690    0.6488     10845
weighted avg     0.6785    0.6620    0.6461     10845


=== Val Set ===
Accuracy: 0.4423
Macro F1: 0.4424
Classification Report:
              precision    recall  f1-score   support

           0     0.4155    0.4269    0.4211       766
           1     0.4447    0.4630    0.4537       730
           2     0.4672    0.4384    0.4523       828

    accuracy                         0.4423      2324
   macro avg     0.4425    0.4428    0.4424      2324
weighted avg     0.4431    0.4423    0.4425      2324


=== Test Set ===
Accuracy: 0.4520
Macro F1: 0.4517
Classification Report:
             