In [132]:
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# df = pd.read_csv('../data/dataset0_all_seq.csv')

df = pd.read_csv('../data/dataset0_mca.csv')

if 'Unnamed: 0' in df.columns:
  df = df.drop(columns='Unnamed: 0')

In [134]:
df.columns

Index(['transcript_id', 'transcript_position', '7mer', 'label', 'set_type',
       'PreTime_mean', 'PreTime_median', 'PreTime_mode', 'PreTime_std',
       'PreTime_min', 'PreTime_max', 'PreTime_p25', 'PreTime_p75',
       'PreSD_mean', 'PreSD_median', 'PreSD_mode', 'PreSD_std', 'PreSD_min',
       'PreSD_max', 'PreSD_p25', 'PreSD_p75', 'PreMean_mean', 'PreMean_median',
       'PreMean_mode', 'PreMean_std', 'PreMean_min', 'PreMean_max',
       'PreMean_p25', 'PreMean_p75', 'InTime_mean', 'InTime_median',
       'InTime_mode', 'InTime_std', 'InTime_min', 'InTime_max', 'InTime_p25',
       'InTime_p75', 'InSD_mean', 'InSD_median', 'InSD_mode', 'InSD_std',
       'InSD_min', 'InSD_max', 'InSD_p25', 'InSD_p75', 'InMean_mean',
       'InMean_median', 'InMean_mode', 'InMean_std', 'InMean_min',
       'InMean_max', 'InMean_p25', 'InMean_p75', 'PostTime_mean',
       'PostTime_median', 'PostTime_mode', 'PostTime_std', 'PostTime_min',
       'PostTime_max', 'PostTime_p25', 'PostTime_p75', 'PostS

In [135]:
# Remove 7mer from df

if "7mer" in df.columns:
  df = df.drop(columns='7mer')

## Split df into trainval and test datasets

In [137]:
train_val_df = df[df["set_type"].isin(["Train", "Val"])].copy()
test_df      = df[df["set_type"] == "Test"].copy()

# X_trainval = train_val_df.drop(columns=["gene_id", "transcript_id", "transcript_position", "label", "set_type"])
X_trainval = train_val_df.drop(columns=["transcript_id", "transcript_position", "label", "set_type"])
y_trainval = train_val_df["label"]

# X_test = test_df.drop(columns=["gene_id", "transcript_id", "transcript_position", "label", "set_type"])
X_test = test_df.drop(columns=["transcript_id", "transcript_position", "label", "set_type"])
y_test = test_df["label"]

In [138]:
train_val_df.head()

Unnamed: 0,transcript_id,transcript_position,label,set_type,PreTime_mean,PreTime_median,PreTime_mode,PreTime_std,PreTime_min,PreTime_max,...,MCA_1,MCA_2,MCA_3,MCA_4,MCA_5,MCA_6,MCA_7,MCA_8,MCA_9,MCA_10
0,ENST00000000233,244,0,Train,0.008264,0.00697,0.00398,0.005399,0.00199,0.0339,...,-0.320798,-0.879412,0.093348,-0.141512,-0.056389,-0.551046,0.498671,-0.290484,-0.279216,-0.416974
1,ENST00000000233,261,0,Train,0.006609,0.00564,0.00498,0.003599,0.00199,0.0222,...,1.243669,-0.081412,-0.276204,-0.120959,-0.176572,0.474024,0.402844,-0.486998,0.364221,0.149256
2,ENST00000000233,316,0,Train,0.00757,0.00631,0.00498,0.004456,0.00232,0.0299,...,0.752708,-0.364812,0.160608,0.644947,0.438556,0.156574,-0.39263,0.012817,0.189489,-0.374444
3,ENST00000000233,332,0,Train,0.01062,0.00902,0.0102,0.006136,0.00232,0.037,...,-0.337457,-0.174406,0.103896,-0.548998,-0.154801,-0.370468,-0.687121,0.133164,0.323889,0.417547
4,ENST00000000233,368,0,Train,0.010701,0.00896,0.00398,0.007169,0.00199,0.0478,...,-0.47673,-0.321733,0.678899,0.125176,-0.368318,-0.753457,0.059573,-0.101502,-0.308624,0.47912


### Build model function

For dataset0_all_seq.csv (dataset0 with one-hot-encoded columns):
  - Dense layers of 128 (dropout 0.4, batchnorm), 64 (dropout 0.3, batchnorm), 16, 1
  - All dense layers are relu, final activation layer is sigmoid

For dataset0_mca1.csv (dataset0 with MCA columns):
  - Dense layers of 64 (dropout 0.2, batchnorm), 16 (dropout 0.2, batchnorm), 4 (dropout 0.2, batchnorm), 2, 1
  - All dense layers are relu, final activation layer is sigmoid


--------------------------------------------
- Loss function: binary crossentropy
- Optimizer function: Adam with learning rate of 0.001
- Metrics to evaluate on: roc_auc, pr_auc

In [None]:
# Add seed

def build_model(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        BatchNormalization(),
        Dropout(0.2),

        Dense(16, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(4, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(2, activation='relu'),

        Dense(1, activation='sigmoid')

        # # ==========================================================

        # Dense(128, activation='relu', input_shape=(input_dim,)),
        # BatchNormalization(),
        # Dropout(0.4),

        # Dense(64, activation='relu'),
        # BatchNormalization(),
        # Dropout(0.3),

        # Dense(16, activation='relu'),

        # Dense(1, activation='sigmoid')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=[
        tf.keras.metrics.AUC(name='roc_auc'),
        tf.keras.metrics.AUC(curve='PR', name='pr_auc')
    ]
    )
    return model

In [140]:
X_trainval.head()

Unnamed: 0,PreTime_mean,PreTime_median,PreTime_mode,PreTime_std,PreTime_min,PreTime_max,PreTime_p25,PreTime_p75,PreSD_mean,PreSD_median,...,MCA_1,MCA_2,MCA_3,MCA_4,MCA_5,MCA_6,MCA_7,MCA_8,MCA_9,MCA_10
0,0.008264,0.00697,0.00398,0.005399,0.00199,0.0339,0.00432,0.011,4.223784,3.73,...,-0.320798,-0.879412,0.093348,-0.141512,-0.056389,-0.551046,0.498671,-0.290484,-0.279216,-0.416974
1,0.006609,0.00564,0.00498,0.003599,0.00199,0.0222,0.00432,0.00797,3.216424,2.88,...,1.243669,-0.081412,-0.276204,-0.120959,-0.176572,0.474024,0.402844,-0.486998,0.364221,0.149256
2,0.00757,0.00631,0.00498,0.004456,0.00232,0.0299,0.0044,0.00963,2.940541,2.65,...,0.752708,-0.364812,0.160608,0.644947,0.438556,0.156574,-0.39263,0.012817,0.189489,-0.374444
3,0.01062,0.00902,0.0102,0.006136,0.00232,0.037,0.006475,0.0134,6.47635,5.73,...,-0.337457,-0.174406,0.103896,-0.548998,-0.154801,-0.370468,-0.687121,0.133164,0.323889,0.417547
4,0.010701,0.00896,0.00398,0.007169,0.00199,0.0478,0.00577,0.0139,6.415051,6.52,...,-0.47673,-0.321733,0.678899,0.125176,-0.368318,-0.753457,0.059573,-0.101502,-0.308624,0.47912


### K-fold cross validation

- Perform K-fold cross validation with n_splits=5
- Train and validation set is split using StratifiedKFold
- Standard scaler fitted on X_train, then transformed on X_train and X_val at each fold
- Compute class weights using X_train
- Early stopping is monitored val_pr_auc, if it does not improve within 8 epochs then perform early stopping
- Evaluate roc_auc and pr_auc of each fold, and at the end the average roc_auc and pr_auc of the 5 folds are taken alongside the standard deviation

In [141]:
K = 5
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

cv_auc = []
cv_pr_auc = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_trainval, y_trainval), 1):
    print(f"\n===== Fold {fold}/{K} =====")

    X_train, X_val = X_trainval.iloc[train_idx], X_trainval.iloc[val_idx]
    y_train, y_val = y_trainval.iloc[train_idx], y_trainval.iloc[val_idx]

    # Perform scaling at each fold fitting only on training set
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Compute class weights for imbalance
    classes = np.unique(y_train)
    cw = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, cw))


    # Adjust ratio to be *less extreme* than fully balanced (e.g., 70–80% of computed weight)
    # class_weights[1] = class_weights[1] * 0.5  # downscale minority class weight slightly
    # class_weights[0] = class_weights[0] * 1.5  # upweight majority class mildly

    model = build_model(X_train.shape[1])
    early_stop = EarlyStopping(monitor='val_pr_auc', mode='max', patience=8, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=32,
        class_weight=class_weights,
        callbacks=[early_stop],
        verbose=0
    )

    y_val_prob = model.predict(X_val)
    roc_auc = roc_auc_score(y_val, y_val_prob)
    precision, recall, _ = precision_recall_curve(y_val, y_val_prob)
    pr_auc = auc(recall, precision)

    print(f"Fold {fold}: ROC-AUC={roc_auc:.4f}, PR-AUC={pr_auc:.4f}")
    cv_auc.append(roc_auc)
    cv_pr_auc.append(pr_auc)

print("\n===== Cross-Validation Summary =====")
print(f"Mean ROC-AUC: {np.mean(cv_auc):.4f} ± {np.std(cv_auc):.4f}")
print(f"Mean PR-AUC : {np.mean(cv_pr_auc):.4f} ± {np.std(cv_pr_auc):.4f}")


===== Fold 1/5 =====
Fold 1: ROC-AUC=0.9077, PR-AUC=0.3709

===== Fold 2/5 =====
Fold 2: ROC-AUC=0.8871, PR-AUC=0.3614

===== Fold 3/5 =====
Fold 3: ROC-AUC=0.8977, PR-AUC=0.4072

===== Fold 4/5 =====
Fold 4: ROC-AUC=0.8297, PR-AUC=0.4118

===== Fold 5/5 =====
Fold 5: ROC-AUC=0.8437, PR-AUC=0.4062

===== Cross-Validation Summary =====
Mean ROC-AUC: 0.8732 ± 0.0308
Mean PR-AUC : 0.3915 ± 0.0210


## Train on entire trainval dataset, then evaluate on test set

- Fit standard scaler on X_trainval, then transform X_test
- Computes class weights from y_trainval
- Build and fit the model using X_trainval
  - Validation: 20%
- Evaluate fitted model on X_test_scaled
- Calculate roc_auc_score and pr_auc_score

In [142]:
scaler = StandardScaler()
X_trainval_scaled = scaler.fit_transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

In [143]:
classes = np.unique(y_trainval)
cw = compute_class_weight('balanced', classes=classes, y=y_trainval)
class_weights = dict(zip(classes, cw))

# Adjust ratio to be *less extreme* than fully balanced (e.g., 70–80% of computed weight)
# class_weights[1] = class_weights[1] * 0.5  # downscale minority class weight slightly
# class_weights[0] = class_weights[0] * 1.5  # upweight majority class mildly

early_stop = EarlyStopping(monitor='val_pr_auc', mode='max', patience=10, restore_best_weights=True)

final_model = build_model(X_trainval_scaled.shape[1])
final_model.fit(
    X_trainval_scaled, y_trainval,
    epochs=100,
    batch_size=32,
    class_weight=class_weights,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

y_pred_prob = final_model.predict(X_test_scaled)
y_pred = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_test, y_pred_prob)
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
pr_auc = auc(recall, precision)

print(classification_report(y_test, y_pred, digits=4))
print(f"Test ROC-AUC: {roc_auc:.4f}")
print(f"Test PR-AUC : {pr_auc:.4f}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
              precision    recall  f1-score   support

           0     0.9901    0.8678    0.9249     12568
           1     0.2278    0.8180    0.3564       599

    accuracy                         0.8656     13167
   macro avg     0.6090    0.8429    0.6407     13167
weighted avg     0.9554    0.8656    0.8991     13167

Test ROC-AUC: 0.9050
Test PR-AUC : 0.5214


In [144]:
print(class_weights)

{0: 0.5234886073510284, 1: 11.143457752255948}


## Train on full dataset for leaderboard output

In [146]:
# Full model training

df_new = df.rename(columns={
    'ID': 'transcript_id',
    'POS': 'transcript_position',
    'SEQ': '7mer'
})

X_0 = df_new.drop(columns=["transcript_id", "transcript_position", "label", "set_type"])
X_settype = df_new["set_type"]
y_0 = df_new['label']

scaler_final = StandardScaler()
X_0_scaled = scaler_final.fit_transform(X_0)

classes = np.unique(y_0)
cw = compute_class_weight('balanced', classes=classes, y=y_0)
class_weights = dict(zip(classes, cw))

early_stop = EarlyStopping(monitor='val_pr_auc', mode='max', patience=10, restore_best_weights=True)

final_model = build_model(X_0_scaled.shape[1])
final_model.fit(
    X_0_scaled, y_0,
    epochs=100,
    batch_size=32,
    class_weight=class_weights,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

y_0_prob = final_model.predict(X_0_scaled)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


### Save the final model

In [None]:
# final_model.save("../models/site_level_model.keras")
# final_model.save("../models/site_level_model_mca1.keras")

### Checking predictions of final model on dataset0

In [147]:
df_transcript = df_new[["transcript_id", "transcript_position"]]

df_transcript['score'] = y_0_prob

df_transcript.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transcript['score'] = y_0_prob


Unnamed: 0,transcript_id,transcript_position,score
0,ENST00000000233,244,0.048203
1,ENST00000000233,261,0.31844
2,ENST00000000233,316,0.032588
3,ENST00000000233,332,0.462767
4,ENST00000000233,368,0.023408


### Save the dataset0 predictions to csv

In [None]:
# df_transcript.to_csv(f"../intermediate_submissions/genetherapy_dataset0_2.csv", index=False)

### Evaluate dataset0 results

In [149]:
roc_auc = roc_auc_score(y_0, y_0_prob)
precision, recall, _ = precision_recall_curve(y_0, y_0_prob)
pr_auc = auc(recall, precision)

print("Dataset 0 evaluation")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"PR-AUC : {pr_auc:.4f}")

Dataset 0 evaluation
ROC-AUC: 0.9426
PR-AUC : 0.5169
