<a href="https://colab.research.google.com/github/navidh86/perturbseq-10701/blob/master/baseline_classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# # ONLY FOR COLAB
# !git clone https://github.com/navidh86/perturbseq-10701.git
# %cd ./perturbseq-10701
# !pip install fastparquet tqdm

In [2]:
# Imports and device
import os
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from tqdm import tqdm

from data.reference_data_classification import get_dataloader

# Run on the GPU when PyTorch reports one as available, otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)

Device: cuda


In [3]:
# # Create dataloaders (point to data/ paths explicitly)
# train_loader = get_dataloader(
#     parquet_path='data/tf_gene_expression_labeled.parquet',
#     tf_sequences_path='data/tf_sequences.pkl',
#     gene_sequences_path='data/gene_sequences_4000bp.pkl',
#     batch_size=128,
#     type='train',
#     majority_fraction=0.01
# )
# test_loader = get_dataloader(
#     parquet_path='data/tf_gene_expression_labeled.parquet',
#     tf_sequences_path='data/tf_sequences.pkl',
#     gene_sequences_path='data/gene_sequences_4000bp.pkl',
#     batch_size=256,
#     type='test',
#     majority_fraction=0.01
# )

# print('Train size:', len(train_loader.dataset))
# print('Test size :', len(test_loader.dataset))

# Build the train / validation / test loaders from the same v2 parquet file.
# Only the split name and the batch size differ between the three calls, so the
# remaining arguments are declared once and reused.
_shared_loader_kwargs = dict(
    parquet_path='data/tf_gene_expression_labeled_v2.parquet',
    tf_sequences_path='data/tf_sequences.pkl',
    gene_sequences_path='data/gene_sequences_4000bp.pkl',
    majority_fraction=0.005,
)

train_loader = get_dataloader(batch_size=128, type='train', **_shared_loader_kwargs)
validation_loader = get_dataloader(batch_size=256, type='val', **_shared_loader_kwargs)
test_loader = get_dataloader(batch_size=256, type='test', **_shared_loader_kwargs)

# Number of samples in each split
print('Train size:', len(train_loader.dataset))
print('Validation size :', len(validation_loader.dataset))
print('Test size :', len(test_loader.dataset))

Train size: 10845
Validation size : 2324
Test size : 2325


In [4]:
# Grab the dataset behind each loader; the row-level table is read from its `.df`.
train_ds, validation_ds, test_ds = (
    split_loader.dataset
    for split_loader in (train_loader, validation_loader, test_loader)
)

# Stack all three splits so the name -> id vocabularies cover every split
split_frames = [train_ds.df, validation_ds.df, test_ds.df]
combined_df = pd.concat(split_frames).reset_index(drop=True)

# Distinct names, in order of first appearance in the combined table
tf_names = combined_df['tf_name'].unique().tolist()
gene_names = combined_df['gene_name'].unique().tolist()

# Name -> integer id, numbered by position in the lists above
tf_to_id = dict(zip(tf_names, range(len(tf_names))))
gene_to_id = dict(zip(gene_names, range(len(gene_names))))

num_tfs = len(tf_to_id)
num_genes = len(gene_to_id)
# The class count is taken from the training split only
train_labels = train_ds.df['expression_label']
num_classes = len(train_labels.unique())

print('Unique TFs (combined):', num_tfs)
print('Unique Genes (combined):', num_genes)
print('Num classes:', num_classes)

Unique TFs (combined): 223
Unique Genes (combined): 4539
Num classes: 3


In [5]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

def prepare_data_onehot_separate(loader, tf_to_id, gene_to_id):
    """Fit one one-hot encoder per ID column on `loader` and encode its samples.

    Every sample's TF and gene name is mapped to an integer ID through
    `tf_to_id` / `gene_to_id`. A separate `OneHotEncoder` is fitted on each ID
    column and the two encodings are stacked side by side (TF columns first).

    Args:
        loader: iterable of `(batch_x, batch_y)` pairs; each item of `batch_x`
            is indexed with `'tf_name'` and `'gene_name'`, and `batch_y` must
            provide `.numpy()`.
        tf_to_id: mapping from TF name to integer ID.
        gene_to_id: mapping from gene name to integer ID.

    Returns:
        Tuple `(X_onehot, y, tf_encoder, gene_encoder)`: the stacked one-hot
        matrix, the labels as a NumPy array, and the two fitted encoders.

    Raises:
        KeyError: if a sample's name is missing from the corresponding mapping.
    """
    id_pairs = []
    labels = []
    for features, targets in loader:
        id_pairs.extend(
            (tf_to_id[sample['tf_name']], gene_to_id[sample['gene_name']])
            for sample in features
        )
        labels.extend(targets.numpy())

    # One ID per row: each array is a single-column matrix for its encoder.
    tf_column = np.array([[tf_id] for tf_id, _ in id_pairs])
    gene_column = np.array([[gene_id] for _, gene_id in id_pairs])

    # Dense output; with handle_unknown='ignore' scikit-learn encodes categories
    # not seen during fit as an all-zero row at transform time instead of raising.
    tf_encoder, gene_encoder = (
        OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        for _ in range(2)
    )

    encoded_parts = [
        tf_encoder.fit_transform(tf_column),
        gene_encoder.fit_transform(gene_column),
    ]

    return np.hstack(encoded_parts), np.array(labels), tf_encoder, gene_encoder

# Fit the TF / gene one-hot encoders on the training split and encode it in the same pass
print("Preparing training data with separate one-hot encoding...")
train_encoded = prepare_data_onehot_separate(train_loader, tf_to_id, gene_to_id)
X_train, y_train, tf_encoder, gene_encoder = train_encoded
print(f"Train shape after one-hot: X={X_train.shape}, y={y_train.shape}")
# Per encoder: number of input columns -> number of one-hot columns it produces
print(f"  TF features: {tf_encoder.n_features_in_} -> {len(tf_encoder.get_feature_names_out())}")
print(f"  Gene features: {gene_encoder.n_features_in_} -> {len(gene_encoder.get_feature_names_out())}")

# For validation and test, use the same encoders
def prepare_data_with_separate_encoders(loader, tf_to_id, gene_to_id, tf_encoder, gene_encoder):
    """Encode the samples of `loader` with encoders that are already fitted.

    Every sample's TF and gene name is mapped to an integer ID through
    `tf_to_id` / `gene_to_id`; each ID column is passed to the matching
    encoder's `transform` (no fitting happens here) and the two results are
    stacked side by side, TF columns first.

    Args:
        loader: iterable of `(batch_x, batch_y)` pairs; each item of `batch_x`
            is indexed with `'tf_name'` and `'gene_name'`, and `batch_y` must
            provide `.numpy()`.
        tf_to_id: mapping from TF name to integer ID.
        gene_to_id: mapping from gene name to integer ID.
        tf_encoder: object whose `transform` encodes the TF ID column.
        gene_encoder: object whose `transform` encodes the gene ID column.

    Returns:
        Tuple `(X_onehot, y)`: the stacked encoded matrix and the labels as a
        NumPy array.

    Raises:
        KeyError: if a sample's name is missing from the corresponding mapping.
    """
    id_pairs = []
    labels = []
    for features, targets in loader:
        id_pairs.extend(
            (tf_to_id[sample['tf_name']], gene_to_id[sample['gene_name']])
            for sample in features
        )
        labels.extend(targets.numpy())

    # One ID per row: each array is a single-column matrix for its encoder.
    tf_column = np.array([[tf_id] for tf_id, _ in id_pairs])
    gene_column = np.array([[gene_id] for _, gene_id in id_pairs])

    # Reuse the supplied encoders as-is; TF block first, gene block second.
    encoded_parts = [
        tf_encoder.transform(tf_column),
        gene_encoder.transform(gene_column),
    ]

    return np.hstack(encoded_parts), np.array(labels)

# Encode the validation and test splits with the ID maps and encoders from the training step
_encoding_args = (tf_to_id, gene_to_id, tf_encoder, gene_encoder)
X_val, y_val = prepare_data_with_separate_encoders(validation_loader, *_encoding_args)
X_test, y_test = prepare_data_with_separate_encoders(test_loader, *_encoding_args)

print(f"Val shape: X={X_val.shape}, y={y_val.shape}")
print(f"Test shape: X={X_test.shape}, y={y_test.shape}")

Preparing training data with separate one-hot encoding...
Train shape after one-hot: X=(10845, 4239), y=(10845,)
  TF features: 1 -> 223
  Gene features: 1 -> 4016
Val shape: X=(2324, 4239), y=(2324,)
Test shape: X=(2325, 4239), y=(2325,)


In [6]:
print(X_train[0])

[0. 0. 0. ... 0. 0. 0.]


In [7]:
#!pip install xgboost

In [8]:
# XGBoost and metrics
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score
import json

In [12]:
# Train XGBoost on the one-hot features, early-stopping on the validation split
print("\nTraining XGBoost with one-hot encoding...")

# Tuned values: the best combination reported by the grid search in this notebook.
# (An earlier hand-picked setup -- 200 trees, depth 10, learning rate 0.1,
# subsample / colsample_bytree 0.8, reg_alpha 0.1, min_child_weight 3, gamma 0.01 --
# was replaced by these.)
xgb_tuned_params = {
    'n_estimators': 250,
    'max_depth': 12,
    'learning_rate': 0.15,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'reg_lambda': 1.0,
}

# Settings that are not part of the search: seed, parallelism, tree builder,
# evaluation metric, early-stopping patience and library verbosity.
xgb_runtime_params = {
    'random_state': 10701,
    'n_jobs': -1,
    'tree_method': 'hist',
    'eval_metric': 'mlogloss',
    'early_stopping_rounds': 15,
    'verbosity': 0,
}

xgb_model = XGBClassifier(**xgb_tuned_params, **xgb_runtime_params)

# The validation split is the eval set that early stopping monitors;
# verbose=10 logs the metric every 10 boosting rounds.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=10)

print("Training complete!")
print(f"Best iteration: {xgb_model.best_iteration}")
print(f"Best score: {xgb_model.best_score:.4f}")


Training XGBoost with one-hot encoding...
[0]	validation_0-mlogloss:1.07344
[10]	validation_0-mlogloss:0.94696
[20]	validation_0-mlogloss:0.89530
[30]	validation_0-mlogloss:0.86320
[40]	validation_0-mlogloss:0.84048
[50]	validation_0-mlogloss:0.82059
[60]	validation_0-mlogloss:0.80526
[70]	validation_0-mlogloss:0.79284
[80]	validation_0-mlogloss:0.78122
[90]	validation_0-mlogloss:0.77053
[100]	validation_0-mlogloss:0.76100
[110]	validation_0-mlogloss:0.75289
[120]	validation_0-mlogloss:0.74535
[130]	validation_0-mlogloss:0.73780
[140]	validation_0-mlogloss:0.73144
[150]	validation_0-mlogloss:0.72534
[160]	validation_0-mlogloss:0.71954
[170]	validation_0-mlogloss:0.71416
[180]	validation_0-mlogloss:0.70919
[190]	validation_0-mlogloss:0.70436
[200]	validation_0-mlogloss:0.70028
[210]	validation_0-mlogloss:0.69569
[220]	validation_0-mlogloss:0.69204
[230]	validation_0-mlogloss:0.68875
[240]	validation_0-mlogloss:0.68461
[249]	validation_0-mlogloss:0.68214
Training complete!
Best iteratio

In [13]:
# Report accuracy, macro-F1 and the per-class breakdown for every split
eval_splits = {
    'Train': (X_train, y_train),
    'Val': (X_val, y_val),
    'Test': (X_test, y_test),
}

for name, (X, y) in eval_splits.items():
    y_pred = xgb_model.predict(X)
    acc = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='macro')
    # Header and summary lines, one per output line
    print(
        f"\n=== {name} Set ===",
        f"Accuracy: {acc:.4f}",
        f"Macro F1: {f1:.4f}",
        "Classification Report:",
        sep="\n",
    )
    print(classification_report(y, y_pred, digits=4))


=== Train Set ===
Accuracy: 0.7517
Macro F1: 0.7516
Classification Report:
              precision    recall  f1-score   support

           0     0.6977    0.7876    0.7399      3573
           1     0.8268    0.8909    0.8577      3409
           2     0.7330    0.5957    0.6572      3863

    accuracy                         0.7517     10845
   macro avg     0.7525    0.7580    0.7516     10845
weighted avg     0.7509    0.7517    0.7475     10845


=== Val Set ===
Accuracy: 0.6936
Macro F1: 0.6904
Classification Report:
              precision    recall  f1-score   support

           0     0.6307    0.7467    0.6838       766
           1     0.7926    0.8849    0.8362       730
           2     0.6545    0.4758    0.5510       828

    accuracy                         0.6936      2324
   macro avg     0.6926    0.7025    0.6904      2324
weighted avg     0.6900    0.6936    0.6844      2324


=== Test Set ===
Accuracy: 0.7191
Macro F1: 0.7161
Classification Report:
             

# Grid search

In [11]:
# Manual grid search over XGBoost hyper-parameters with a tqdm progress bar.
# Every combination is scored by 3-fold cross-validated macro-F1 on the training
# split; the best one is refit on the full training split and evaluated.
from sklearn.model_selection import cross_val_score
from itertools import product
import os
import numpy as np

param_grid = {
    'n_estimators': [150, 250],
    'max_depth': [8, 12],
    'learning_rate': [0.05, 0.15],
    'subsample': [0.7, 0.85],
    'colsample_bytree': [0.7, 0.85],
    'reg_lambda': [1.0, 2.0]
    # Removed min_child_weight and reg_alpha to reduce combinations
}

# Settings shared by every candidate AND by the final refit, declared once so the
# two cannot drift apart.
fixed_params = dict(
    random_state=10701,
    n_jobs=-1,
    tree_method='hist',
    eval_metric='mlogloss',
    verbosity=0,
)

# Generate all combinations (Cartesian product of the value lists)
keys = list(param_grid.keys())
values = list(param_grid.values())
param_combinations = [dict(zip(keys, v)) for v in product(*values)]

print(f"Total combinations to test: {len(param_combinations)}")

# Manual grid search with progress bar
best_score = -np.inf
best_params = None
results = []

for params in tqdm(param_combinations, desc="Grid Search"):
    model = XGBClassifier(**params, **fixed_params)

    # Cross-validation
    scores = cross_val_score(
        model, X_train, y_train,
        cv=3,
        scoring='f1_macro',
        n_jobs=1  # Important: don't parallelize CV (the model already uses n_jobs=-1)
    )

    mean_score = scores.mean()
    std_score = scores.std()

    results.append({
        'params': params,
        'mean_f1': mean_score,
        'std_f1': std_score
    })

    if mean_score > best_score:
        best_score = mean_score
        best_params = params
        # tqdm.write prints without breaking the progress bar, unlike a bare print()
        tqdm.write(f"  New best! F1={mean_score:.4f} (±{std_score:.4f})")

# A NaN mean never compares greater than best_score, so best_params stays None
# when no combination produced a usable score. Fail with a clear message
# instead of a TypeError from `**None` below.
if best_params is None:
    raise RuntimeError(
        "Grid search produced no valid CV score for any parameter combination; "
        "check the warnings emitted by cross_val_score."
    )

print("\n✓ Grid Search Complete!")
print(f"Best CV F1: {best_score:.4f}")
print(f"Best parameters: {best_params}")

# Train final model with best params
print("\nTraining final model with best parameters...")
xgb_model = XGBClassifier(**best_params, **fixed_params)

xgb_model.fit(X_train, y_train)

# Evaluate
for name, X, y in [('Train', X_train, y_train),
                    ('Val', X_val, y_val),
                    ('Test', X_test, y_test)]:
    y_pred = xgb_model.predict(X)
    acc = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='macro')
    print(f"\n=== {name} Set ===")
    print(f"Accuracy: {acc:.4f}, Macro F1: {f1:.4f}")

# Save results
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('mean_f1', ascending=False)
# Create the output directory first so a long search is not lost to a missing
# folder. The CSV holds every combination; only the first 10 rows are printed.
os.makedirs('results', exist_ok=True)
results_df.to_csv('results/grid_search_results.csv', index=False)
print("\nTop 10 parameter combinations saved to results/grid_search_results.csv")
print(results_df.head(10))

Total combinations to test: 64


Grid Search:   2%|▏         | 1/64 [00:16<17:23, 16.56s/it]

  New best! F1=0.5999 (±0.0139)


Grid Search:   5%|▍         | 3/64 [00:57<19:47, 19.47s/it]

  New best! F1=0.6020 (±0.0134)


Grid Search:  14%|█▍        | 9/64 [02:36<14:44, 16.09s/it]

  New best! F1=0.6551 (±0.0063)


Grid Search:  17%|█▋        | 11/64 [03:04<13:19, 15.09s/it]

  New best! F1=0.6576 (±0.0076)


Grid Search:  39%|███▉      | 25/64 [07:07<11:46, 18.12s/it]

  New best! F1=0.6730 (±0.0080)


Grid Search:  42%|████▏     | 27/64 [07:41<10:47, 17.50s/it]

  New best! F1=0.6742 (±0.0055)


Grid Search:  64%|██████▍   | 41/64 [12:52<09:23, 24.49s/it]

  New best! F1=0.6763 (±0.0053)


Grid Search:  73%|███████▎  | 47/64 [15:10<06:29, 22.89s/it]

  New best! F1=0.6765 (±0.0052)


Grid Search:  89%|████████▉ | 57/64 [19:46<03:08, 26.86s/it]

  New best! F1=0.6857 (±0.0053)


Grid Search:  92%|█████████▏| 59/64 [20:36<02:08, 25.77s/it]

  New best! F1=0.6866 (±0.0062)


Grid Search:  95%|█████████▌| 61/64 [21:25<01:15, 25.13s/it]

  New best! F1=0.6874 (±0.0066)


Grid Search:  98%|█████████▊| 63/64 [22:14<00:24, 24.87s/it]

  New best! F1=0.6882 (±0.0061)


Grid Search: 100%|██████████| 64/64 [22:39<00:00, 21.24s/it]



✓ Grid Search Complete!
Best CV F1: 0.6882
Best parameters: {'n_estimators': 250, 'max_depth': 12, 'learning_rate': 0.15, 'subsample': 0.85, 'colsample_bytree': 0.85, 'reg_lambda': 1.0}

Training final model with best parameters...

=== Train Set ===
Accuracy: 0.7517, Macro F1: 0.7516

=== Val Set ===
Accuracy: 0.6936, Macro F1: 0.6904

=== Test Set ===
Accuracy: 0.7191, Macro F1: 0.7161

Top 10 parameter combinations saved to results/grid_search_results.csv
                                               params   mean_f1    std_f1
62  {'n_estimators': 250, 'max_depth': 12, 'learni...  0.688155  0.006108
60  {'n_estimators': 250, 'max_depth': 12, 'learni...  0.687379  0.006591
58  {'n_estimators': 250, 'max_depth': 12, 'learni...  0.686594  0.006224
56  {'n_estimators': 250, 'max_depth': 12, 'learni...  0.685724  0.005347
63  {'n_estimators': 250, 'max_depth': 12, 'learni...  0.681179  0.007222
61  {'n_estimators': 250, 'max_depth': 12, 'learni...  0.680978  0.007099
57  {'n_estimators