<a href="https://colab.research.google.com/github/navidh86/perturbseq-10701/blob/master/baseline_classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# # ONLY FOR COLAB
# !git clone https://github.com/navidh86/perturbseq-10701.git
# %cd ./perturbseq-10701
# !pip install fastparquet tqdm

In [1]:
# Imports and device
import os
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from tqdm import tqdm

from data.reference_data_classification import get_dataloader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

Device: cuda


In [2]:
# # Create dataloaders (point to data/ paths explicitly)
# train_loader = get_dataloader(
#     parquet_path='data/tf_gene_expression_labeled.parquet',
#     tf_sequences_path='data/tf_sequences.pkl',
#     gene_sequences_path='data/gene_sequences_4000bp.pkl',
#     batch_size=128,
#     type='train',
#     majority_fraction=0.01
# )
# test_loader = get_dataloader(
#     parquet_path='data/tf_gene_expression_labeled.parquet',
#     tf_sequences_path='data/tf_sequences.pkl',
#     gene_sequences_path='data/gene_sequences_4000bp.pkl',
#     batch_size=256,
#     type='test',
#     majority_fraction=0.01
# )

# print('Train size:', len(train_loader.dataset))
# print('Test size :', len(test_loader.dataset))

# Create dataloaders (point to data/ paths explicitly)
train_loader = get_dataloader(
    parquet_path='data/tf_gene_expression_labeled_v2.parquet',
    tf_sequences_path='data/tf_sequences.pkl',
    gene_sequences_path='data/gene_sequences_4000bp.pkl',
    batch_size=128,
    type='train',
    majority_fraction=0.005
)

validation_loader = get_dataloader(
    parquet_path='data/tf_gene_expression_labeled_v2.parquet',
    tf_sequences_path='data/tf_sequences.pkl',
    gene_sequences_path='data/gene_sequences_4000bp.pkl',
    batch_size=256,
    type='val',
    majority_fraction=0.005
)

test_loader = get_dataloader(
    parquet_path='data/tf_gene_expression_labeled_v2.parquet',
    tf_sequences_path='data/tf_sequences.pkl',
    gene_sequences_path='data/gene_sequences_4000bp.pkl',
    batch_size=256,
    type='test',
    majority_fraction=0.005
)

print('Train size:', len(train_loader.dataset))
print('Validation size :', len(validation_loader.dataset))
print('Test size :', len(test_loader.dataset))

Train size: 10845
Validation size : 2324
Test size : 2325


In [3]:
train_ds = train_loader.dataset
validation_ds = validation_loader.dataset
test_ds = test_loader.dataset

combined_df = pd.concat([train_ds.df, validation_ds.df, test_ds.df]).reset_index(drop=True)

# unique names from combined set
tf_names = combined_df['tf_name'].unique().tolist()
gene_names = combined_df['gene_name'].unique().tolist()

# create mappings
tf_to_id = {n: i for i, n in enumerate(tf_names)}
gene_to_id = {n: i for i, n in enumerate(gene_names)}

num_tfs = len(tf_to_id)
num_genes = len(gene_to_id)
# Use classes from training split
num_classes = len(train_ds.df['expression_label'].unique())

print('Unique TFs (combined):', num_tfs)
print('Unique Genes (combined):', num_genes)
print('Num classes:', num_classes)

Unique TFs (combined): 223
Unique Genes (combined): 4539
Num classes: 3


In [None]:
# # Random Forest Classification
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import f1_score, classification_report, accuracy_score
# import numpy as np
# import json
# import pickle
# import os

# # Prepare data for sklearn (convert DataLoader to numpy arrays)
# def prepare_data_for_sklearn(loader, tf_to_id, gene_to_id):
#     """Extract features and labels from DataLoader for sklearn."""
#     X = []
#     y = []
    
#     for batch_x, batch_y in loader:
#         for item in batch_x:
#             tf_id = tf_to_id[item['tf_name']]
#             gene_id = gene_to_id[item['gene_name']]
#             X.append([tf_id, gene_id])
        
#         y.extend(batch_y.numpy())
    
#     return np.array(X), np.array(y)

# print("Preparing training data...")
# X_train, y_train = prepare_data_for_sklearn(train_loader, tf_to_id, gene_to_id)
# print(f"Train shape: X={X_train.shape}, y={y_train.shape}")

# print("Preparing validation data...")
# X_val, y_val = prepare_data_for_sklearn(validation_loader, tf_to_id, gene_to_id)
# print(f"Validation shape: X={X_val.shape}, y={y_val.shape}")

# print("Preparing test data...")
# X_test, y_test = prepare_data_for_sklearn(test_loader, tf_to_id, gene_to_id)
# print(f"Test shape: X={X_test.shape}, y={y_test.shape}")

In [4]:
# load NT sequence pca embeddings
tf_embed_cache_nt = pickle.load(open("./embeds/tf_cls_pca.pkl", "rb"))
gene_embed_cache_nt = pickle.load(open("./embeds/gn_cls_pca.pkl", "rb"))

# Load enformer pca embeddings
tf_embed_cache = pickle.load(open("./embeds/tf_enformer_alternate_pca.pkl", "rb"))
gene_embed_cache = pickle.load(open("./embeds/gn_enformer_alternate_pca.pkl", "rb"))

# # ensure everything is torch tensors
# for k in tf_embed_cache:
#     if not isinstance(tf_embed_cache[k], torch.Tensor):
#         tf_embed_cache[k] = torch.tensor(tf_embed_cache[k], dtype=torch.float32)

# for k in gene_embed_cache:
#     if not isinstance(gene_embed_cache[k], torch.Tensor):
#         gene_embed_cache[k] = torch.tensor(gene_embed_cache[k], dtype=torch.float32)
first_tf_nt = next(iter(tf_embed_cache_nt.values()))
first_gene_nt = next(iter(gene_embed_cache_nt.values()))
print("TF NT emb dim:", first_tf_nt.shape)
print("Gene NT emb dim:", first_gene_nt.shape)

first_tf = next(iter(tf_embed_cache.values()))
first_gene = next(iter(gene_embed_cache.values()))
print("TF enformer emb dim:", first_tf.shape)
print("Gene enformer emb dim:", first_gene.shape)

TF NT emb dim: torch.Size([110])
Gene NT emb dim: torch.Size([528])
TF enformer emb dim: torch.Size([18])
Gene enformer emb dim: torch.Size([26])


In [5]:
import numpy as np

def prepare_combined(tf_name, gene_name):
    tf_id = tf_to_id[tf_name]
    gene_id = gene_to_id[gene_name]
    
    # tf_one_hot = one_hot(tf_id, num_tfs)
    # gene_one_hot = one_hot(gene_id, num_genes)

    tf_embed_nt = tf_embed_cache_nt[tf_name]
    gene_embed_nt = gene_embed_cache_nt[gene_name]
    
    tf_embed = tf_embed_cache[tf_name]
    gene_embed = gene_embed_cache[gene_name]

    # interaction = torch.nn.functional.pad(tf_embed, (0, len(gene_embed)-len(tf_embed))) * gene_embed
    
    # combined_vector = torch.cat([tf_one_hot, gene_one_hot, tf_embed, gene_embed, interaction], dim=0)
    # combined_vector = torch.cat([tf_one_hot, gene_one_hot, tf_embed, gene_embed], dim=0)
    # combined_vector = torch.cat([tf_embed, gene_embed, interaction], dim=0)
    combined_vector = torch.cat([tf_embed_nt, gene_embed_nt, tf_embed, gene_embed], dim=0)
    # combined_vector = torch.cat([torch.tensor([tf_id], dtype=torch.float32), torch.tensor([gene_id], dtype=torch.float32), tf_embed_nt, gene_embed_nt, tf_embed, gene_embed], dim=0)

    return combined_vector

def prepare_combined_dataset(loader):
    X_list = []
    y_list = []
    
    for batch_x, batch_y in loader:
        for i in range(len(batch_x)):
            item = batch_x[i]
            combined_vector = prepare_combined(item['tf_name'], item['gene_name'])
            X_list.append(combined_vector.numpy())
        
        y_list.extend(batch_y.numpy())
    
    X = np.array(X_list)
    y = np.array(y_list)
    return X, y

In [7]:
# Prepare training data
print("Preparing training data with combined one-hot and embeddings...")
X_train, y_train = prepare_combined_dataset(train_loader)
print(f"Train shape after combined: X={X_train.shape}, y={y_train.shape}")

# Prepare validation dataq
X_val, y_val = prepare_combined_dataset(validation_loader)
print(f"Validation shape after combined: X={X_val.shape}, y={y_val.shape}")

# Prepare test data
X_test, y_test = prepare_combined_dataset(test_loader)
print(f"Test shape after combined: X={X_test.shape}, y={y_test.shape}")

Preparing training data with combined one-hot and embeddings...
Train shape after combined: X=(10845, 682), y=(10845,)
Validation shape after combined: X=(2324, 682), y=(2324,)
Test shape after combined: X=(2325, 682), y=(2325,)


In [8]:
print(X_train[0])

[-1.00998580e+00 -3.12078029e-01  4.16774422e-01  1.77322403e-01
 -4.66763705e-01  3.98317352e-04  5.36288917e-02  2.15189517e-01
 -2.44093779e-02 -2.44461715e-01 -3.88215452e-01 -3.09721470e-01
 -1.17274024e-01  1.77829161e-01  5.15839495e-02  7.42390901e-02
  5.23093194e-02 -1.33010164e-01  1.88904449e-01 -1.03961878e-01
 -4.90405336e-02 -9.53843594e-02  6.24123216e-03  1.75259709e-01
  1.09703727e-01  7.15254154e-03  2.42361277e-01 -1.96593463e-01
 -9.60245505e-02  9.26277116e-02 -2.41084442e-01 -2.29819760e-01
 -8.97619575e-02  8.54402110e-02  5.27024418e-02 -6.79664165e-02
 -1.32730290e-01 -2.99905203e-02 -1.55082792e-01 -2.42106542e-01
 -1.48421749e-01 -9.46519449e-02  2.87010781e-02  1.94064267e-02
  6.11098818e-02 -8.67577046e-02 -2.46059727e-02 -5.96841760e-02
  1.01870988e-02  1.43336535e-01  5.58727980e-02 -9.91994590e-02
  1.54240176e-01 -8.06934163e-02 -2.59341151e-02 -1.05015919e-01
  1.98922567e-02 -9.22791213e-02  1.40779540e-01 -5.06159998e-02
 -8.51960480e-02 -7.38173

In [9]:
# Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score

import json
import pickle
import os

In [24]:
# Train Random Forest
print("\nTraining Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=300,        # number of trees
    max_depth=10,            # maximum depth of trees
    min_samples_split=18,     # minimum samples to split a node
    min_samples_leaf=5,      # minimum samples in leaf
    max_features='sqrt',
    random_state=10701,
    n_jobs=-1,               # use all CPU cores
    verbose=0,
    class_weight='balanced'  # handle class imbalance
)

rf_model.fit(X_train, y_train)
print("Training complete!")


Training Random Forest...
Training complete!


In [25]:
# Evaluate
for name, X, y in [('Train', X_train, y_train), 
                    ('Val', X_val, y_val), 
                    ('Test', X_test, y_test)]:
    y_pred = rf_model.predict(X)
    acc = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='macro')
    print(f"\n=== {name} Set ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"Macro F1: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y, y_pred))


=== Train Set ===
Accuracy: 0.7152
Macro F1: 0.7111
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.77      0.71      3573
           1       0.75      0.88      0.81      3409
           2       0.75      0.52      0.61      3863

    accuracy                           0.72     10845
   macro avg       0.72      0.72      0.71     10845
weighted avg       0.72      0.72      0.71     10845


=== Val Set ===
Accuracy: 0.6080
Macro F1: 0.6004
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.65      0.61       766
           1       0.69      0.83      0.75       730
           2       0.55      0.37      0.44       828

    accuracy                           0.61      2324
   macro avg       0.60      0.62      0.60      2324
weighted avg       0.60      0.61      0.59      2324


=== Test Set ===
Accuracy: 0.6215
Macro F1: 0.6139
Classification Report:
             