In [101]:
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report
from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
import numpy as np
import matplotlib.pyplot as plt

In [102]:
df = pd.read_csv('../data/output_site_level_2.csv')
df = df.drop(columns='Unnamed: 0')

In [103]:
reads_df = df.rename(columns={
    'ID': 'transcript_id',
    'POS': 'transcript_position',
    'SEQ': '7mer'
})

In [104]:
def assign_set_type_by_gene(reads_df, split_ratios={'Train': 0.8, 'Val': 0.1, 'Test': 0.1}, random_state=42):
    """
    Assigns each row in reads_df a 'set_type' of Train, Val, or Test,
    ensuring all rows with the same gene_id are in the same set,
    and total number of rows (not just genes) in each set matches desired ratios.
    Label distribution is approximately balanced using a greedy strategy.
    """

    # Step 1: Get stats per gene
    gene_stats = (
        reads_df
        .groupby('gene_id')['label']
        .value_counts()
        .unstack(fill_value=0)
        .rename(columns={0: 'label_0', 1: 'label_1'})
        .reset_index()
    )
    gene_stats['total'] = gene_stats['label_0'] + gene_stats['label_1']

    # Shuffle genes for randomness
    gene_stats = gene_stats.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Step 2: Overall label distribution and target row counts
    total_rows = gene_stats['total'].sum()
    total_label_1 = gene_stats['label_1'].sum()
    overall_pos_rate = total_label_1 / total_rows

    target_rows = {k: total_rows * split_ratios[k] for k in split_ratios}

    # Step 3: Initialize bins
    bins = {
        'Train': {'genes': [], 'label_0': 0, 'label_1': 0, 'total': 0},
        'Val': {'genes': [], 'label_0': 0, 'label_1': 0, 'total': 0},
        'Test': {'genes': [], 'label_0': 0, 'label_1': 0, 'total': 0},
    }

    def pick_bin():
        # Find the bin with the biggest gap between current and target row count
        diffs = {k: target_rows[k] - bins[k]['total'] for k in bins}
        # Choose the bin that needs rows the most
        return max(diffs, key=diffs.get)

    # Step 4: Assign genes to bins to match row targets and label balance
    for _, row in gene_stats.iterrows():
        chosen_bin = pick_bin()
        bins[chosen_bin]['genes'].append(row['gene_id'])
        bins[chosen_bin]['label_0'] += row['label_0']
        bins[chosen_bin]['label_1'] += row['label_1']
        bins[chosen_bin]['total'] += row['total']

    # Step 5: Map gene_id â†’ set_type
    gene_to_set = {}
    for set_name, bin_data in bins.items():
        for gene_id in bin_data['genes']:
            gene_to_set[gene_id] = set_name

    reads_df['set_type'] = reads_df['gene_id'].map(gene_to_set)

    return reads_df

In [105]:
reads_df = assign_set_type_by_gene(reads_df)

set_counts = reads_df['set_type'].value_counts()
print("ðŸ“Š Number of rows in each set:")
for set_name, count in set_counts.items():
    print(f"  - {set_name}: {count} rows")

# Print label distribution per set (normalized)
label_distributions = reads_df.groupby('set_type')['label'].value_counts(normalize=True).unstack()

print("\nðŸ“ˆ Label distribution (percentage of label 0 and 1) in each set:")
for set_name in label_distributions.index:
    label_0_pct = label_distributions.loc[set_name].get(0, 0) * 100
    label_1_pct = label_distributions.loc[set_name].get(1, 0) * 100
    print(f"  - {set_name}:")
    print(f"      â€¢ Label 0: {label_0_pct:.2f}%")
    print(f"      â€¢ Label 1: {label_1_pct:.2f}%")

print("Ending: 3. Assign split bins")

ðŸ“Š Number of rows in each set:
  - Train: 97468 rows
  - Test: 12192 rows
  - Val: 12178 rows

ðŸ“ˆ Label distribution (percentage of label 0 and 1) in each set:
  - Test:
      â€¢ Label 0: 95.57%
      â€¢ Label 1: 4.43%
  - Train:
      â€¢ Label 0: 95.52%
      â€¢ Label 1: 4.48%
  - Val:
      â€¢ Label 0: 95.29%
      â€¢ Label 1: 4.71%
Ending: 3. Assign split bins


In [106]:
reads_df.head()

Unnamed: 0,gene_id,transcript_id,transcript_position,label,PreTime_mean,PreTime_median,PreTime_mode,PreTime_std,PreTime_min,PreTime_max,...,PostSD_p75,PostMean_mean,PostMean_median,PostMean_mode,PostMean_std,PostMean_min,PostMean_max,PostMean_p25,PostMean_p75,set_type
0,ENSG00000000003,ENST00000373020,512,0,0.007247,0.00599,0.00266,0.004404,0.00266,0.0169,...,1.875,92.33,92.55,92.3,2.258574,83.5,94.5,92.1,93.325,Train
1,ENSG00000000003,ENST00000373020,689,0,0.009868,0.00764,0.00531,0.006946,0.00232,0.0279,...,2.37,89.385714,89.6,90.4,2.113122,84.3,93.5,88.5,90.6,Train
2,ENSG00000000003,ENST00000373020,823,0,0.007456,0.00631,0.00598,0.003799,0.00299,0.0196,...,2.63,88.17619,88.2,84.7,2.420517,84.7,93.1,86.2,89.4,Train
3,ENSG00000000003,ENST00000373020,830,0,0.007765,0.00641,0.00498,0.004869,0.00266,0.0226,...,2.4925,80.11,80.0,78.7,1.52312,77.9,83.8,78.775,81.15,Train
4,ENSG00000000003,ENST00000373020,849,0,0.006785,0.00598,0.00398,0.003115,0.00332,0.0181,...,2.1,84.657143,84.4,82.9,2.048798,82.3,90.7,83.1,85.5,Train


In [107]:
print("Column Data Types:")
print(reads_df.dtypes)

# Display the number of rows
print("\nNumber of Rows:", len(reads_df))

Column Data Types:
gene_id                 object
transcript_id           object
transcript_position      int64
label                    int64
PreTime_mean           float64
                        ...   
PostMean_min           float64
PostMean_max           float64
PostMean_p25           float64
PostMean_p75           float64
set_type                object
Length: 77, dtype: object

Number of Rows: 121838


In [108]:
train_val_df = reads_df[reads_df["set_type"].isin(["Train", "Val"])].copy()
test_df      = reads_df[reads_df["set_type"] == "Test"].copy()

X_trainval = train_val_df.drop(columns=["gene_id", "transcript_id", "transcript_position", "label", "set_type"])
y_trainval = train_val_df["label"]

X_test = test_df.drop(columns=["gene_id", "transcript_id", "transcript_position", "label", "set_type"])
y_test = test_df["label"]

In [109]:
train_val_df.head()

Unnamed: 0,gene_id,transcript_id,transcript_position,label,PreTime_mean,PreTime_median,PreTime_mode,PreTime_std,PreTime_min,PreTime_max,...,PostSD_p75,PostMean_mean,PostMean_median,PostMean_mode,PostMean_std,PostMean_min,PostMean_max,PostMean_p25,PostMean_p75,set_type
0,ENSG00000000003,ENST00000373020,512,0,0.007247,0.00599,0.00266,0.004404,0.00266,0.0169,...,1.875,92.33,92.55,92.3,2.258574,83.5,94.5,92.1,93.325,Train
1,ENSG00000000003,ENST00000373020,689,0,0.009868,0.00764,0.00531,0.006946,0.00232,0.0279,...,2.37,89.385714,89.6,90.4,2.113122,84.3,93.5,88.5,90.6,Train
2,ENSG00000000003,ENST00000373020,823,0,0.007456,0.00631,0.00598,0.003799,0.00299,0.0196,...,2.63,88.17619,88.2,84.7,2.420517,84.7,93.1,86.2,89.4,Train
3,ENSG00000000003,ENST00000373020,830,0,0.007765,0.00641,0.00498,0.004869,0.00266,0.0226,...,2.4925,80.11,80.0,78.7,1.52312,77.9,83.8,78.775,81.15,Train
4,ENSG00000000003,ENST00000373020,849,0,0.006785,0.00598,0.00398,0.003115,0.00332,0.0181,...,2.1,84.657143,84.4,82.9,2.048798,82.3,90.7,83.1,85.5,Train


In [110]:
scaler = StandardScaler()
X_trainval_scaled = scaler.fit_transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

In [111]:
def build_model(input_dim):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        BatchNormalization(),
        Dropout(0.4),

        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        # Dense(32, activation='relu'),
        # BatchNormalization(),
        # Dropout(0.2),

        # Dense(8, activation='relu'),

        Dense(16, activation='relu'),

        Dense(1, activation='sigmoid')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), #0.001
        loss='binary_crossentropy',
        metrics=[
        tf.keras.metrics.AUC(name='roc_auc'),
        tf.keras.metrics.AUC(curve='PR', name='pr_auc')
    ]
    )
    return model

In [112]:
# K = 5
# skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

# cv_auc = []
# cv_pr_auc = []

# for fold, (train_idx, val_idx) in enumerate(skf.split(X_trainval_scaled, y_trainval), 1):
#     print(f"\n===== Fold {fold}/{K} =====")

#     X_train, X_val = X_trainval_scaled[train_idx], X_trainval_scaled[val_idx]
#     y_train, y_val = y_trainval.iloc[train_idx], y_trainval.iloc[val_idx]

#     # Compute class weights for imbalance
#     classes = np.unique(y_train)
#     cw = compute_class_weight('balanced', classes=classes, y=y_train)
#     class_weights = dict(zip(classes, cw))


#     # Adjust ratio to be *less extreme* than fully balanced (e.g., 70â€“80% of computed weight)
#     # class_weights[1] = class_weights[1] * 0.5  # downscale minority class weight slightly
#     # class_weights[0] = class_weights[0] * 1.5  # upweight majority class mildly

#     model = build_model(X_train.shape[1])
#     early_stop = EarlyStopping(monitor='val_pr_auc', mode='max', patience=8, restore_best_weights=True)

#     history = model.fit(
#         X_train, y_train,
#         validation_data=(X_val, y_val),
#         epochs=100,
#         batch_size=32,
#         class_weight=class_weights,
#         callbacks=[early_stop],
#         verbose=0
#     )

#     y_val_prob = model.predict(X_val)
#     roc_auc = roc_auc_score(y_val, y_val_prob)
#     precision, recall, _ = precision_recall_curve(y_val, y_val_prob)
#     pr_auc = auc(recall, precision)

#     print(f"Fold {fold}: ROC-AUC={roc_auc:.4f}, PR-AUC={pr_auc:.4f}")
#     cv_auc.append(roc_auc)
#     cv_pr_auc.append(pr_auc)

# print("\n===== Cross-Validation Summary =====")
# print(f"Mean ROC-AUC: {np.mean(cv_auc):.4f} Â± {np.std(cv_auc):.4f}")
# print(f"Mean PR-AUC : {np.mean(cv_pr_auc):.4f} Â± {np.std(cv_pr_auc):.4f}")

In [113]:
classes = np.unique(y_trainval)
cw = compute_class_weight('balanced', classes=classes, y=y_trainval)
class_weights = dict(zip(classes, cw))

# Adjust ratio to be *less extreme* than fully balanced (e.g., 70â€“80% of computed weight)
# class_weights[1] = class_weights[1] * 0.5  # downscale minority class weight slightly
# class_weights[0] = class_weights[0] * 1.5  # upweight majority class mildly

early_stop = EarlyStopping(monitor='val_pr_auc', mode='max', patience=10, restore_best_weights=True)

final_model = build_model(X_trainval_scaled.shape[1])
final_model.fit(
    X_trainval_scaled, y_trainval,
    epochs=100,
    batch_size=32,
    class_weight=class_weights,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

y_pred_prob = final_model.predict(X_test_scaled)
y_pred = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_test, y_pred_prob)
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
pr_auc = auc(recall, precision)

print(classification_report(y_test, y_pred, digits=4))
print(f"Test ROC-AUC: {roc_auc:.4f}")
print(f"Test PR-AUC : {pr_auc:.4f}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
              precision    recall  f1-score   support

           0     0.9907    0.8535    0.9170     11652
           1     0.2075    0.8278    0.3318       540

    accuracy                         0.8524     12192
   macro avg     0.5991    0.8406    0.6244     12192
weighted avg     0.9560    0.8524    0.8911     12192

Test ROC-AUC: 0.9039
Test PR-AUC : 0.3985


In [114]:
print(class_weights)

{0: 0.5235648594703517, 1: 11.109017223910842}


In [115]:
# Full model training

df_new = df.rename(columns={
    'ID': 'transcript_id',
    'POS': 'transcript_position',
    'SEQ': '7mer'
})

X_0 = df_new.drop(columns=["gene_id", "transcript_id", "transcript_position", "label"])
y_0 = df_new['label']

scaler_final = StandardScaler()
X_0_scaled = scaler_final.fit_transform(X_0)

classes = np.unique(y_0)
cw = compute_class_weight('balanced', classes=classes, y=y_0)
class_weights = dict(zip(classes, cw))

early_stop = EarlyStopping(monitor='val_pr_auc', mode='max', patience=10, restore_best_weights=True)

final_model = build_model(X_0_scaled.shape[1])
final_model.fit(
    X_0_scaled, y_0,
    epochs=100,
    batch_size=32,
    class_weight=class_weights,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

y_0_prob = final_model.predict(X_0_scaled)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100


In [116]:
# final_model.save("../models/site_level_model.keras")

In [117]:
df_transcript = df_new[["transcript_id", "transcript_position"]]

df_transcript['score'] = y_0_prob

df_transcript.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transcript['score'] = y_0_prob


Unnamed: 0,transcript_id,transcript_position,score
0,ENST00000373020,512,0.022196
1,ENST00000373020,689,0.013801
2,ENST00000373020,823,0.008082
3,ENST00000373020,830,0.004683
4,ENST00000373020,849,0.007017


In [118]:
y_0_prob

array([[0.02219601],
       [0.01380131],
       [0.00808156],
       ...,
       [0.95560473],
       [0.60883033],
       [0.06212097]], dtype=float32)

In [119]:
df_0 = pd.read_csv(f"../data/data.info.labelled").drop(columns=['gene_id', 'label'])

df_0 = pd.merge(df_0, df_transcript, on=['transcript_id', 'transcript_position'], how='left')

In [120]:
df_0.head()

Unnamed: 0,transcript_id,transcript_position,score
0,ENST00000000233,244,0.052011
1,ENST00000000233,261,0.713535
2,ENST00000000233,316,0.048524
3,ENST00000000233,332,0.368976
4,ENST00000000233,368,0.156225


In [121]:
# df_transcript.to_csv(f"../intermediate_submissions/genetherapy_dataset0_2.csv", index=False)

In [122]:
roc_auc = roc_auc_score(y_0, y_0_prob)
precision, recall, _ = precision_recall_curve(y_0, y_0_prob)
pr_auc = auc(recall, precision)

print("Dataset 0 evaluation")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"PR-AUC : {pr_auc:.4f}")

Dataset 0 evaluation
ROC-AUC: 0.9335
PR-AUC : 0.4975
