### Helper Functions For Preprocessing Data

In [1]:
from helper_functions import *

### Preprocessing

Load Data

In [2]:
labels = pd.read_csv('../data/data.info.labelled')
labels.head(3)

Unnamed: 0,gene_id,transcript_id,transcript_position,label
0,ENSG00000004059,ENST00000000233,244,0
1,ENSG00000004059,ENST00000000233,261,0
2,ENSG00000004059,ENST00000000233,316,0


In [3]:
df = load_data_to_dataframe('../data/dataset0.json.gz')
df.head(3)

Unnamed: 0,transcript_id,position,sequence,reads
0,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0..."
1,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0...."
2,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0..."


Assign labels to the data

In [4]:
df = combine_data(df, labels)
df.head(3)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0


Extract mean reads

In [5]:
df = extract_mean_reads(df)
df.head(3)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0,"[0.008264378378378385, 4.223783783783786, 123...."
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0,"[0.006609244186046515, 3.2164244186046504, 109..."
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0,"[0.0075699999999999995, 2.94054054054054, 105...."


Train test split by gene_id

In [6]:
# Train test split by gene id
from sklearn.model_selection import train_test_split
train_gene_ids, test_gene_ids = train_test_split(df['gene_id'].unique(), test_size=0.2, random_state=4262)

train_df = df[df['gene_id'].isin(train_gene_ids)].copy()
test_df = df[df['gene_id'].isin(test_gene_ids)].copy()

display(train_df.head(3), train_df.shape)
display(test_df.head(3), test_df.shape)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0,"[0.007340399999999998, 2.9771799999999997, 108..."
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0,"[0.00898787234042553, 3.961489361702128, 118.6..."
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0,"[0.011064705882352935, 7.299607843137254, 115...."


(97073, 7)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0,"[0.008264378378378385, 4.223783783783786, 123...."
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0,"[0.006609244186046515, 3.2164244186046504, 109..."
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0,"[0.0075699999999999995, 2.94054054054054, 105...."


(24765, 7)

Scale mean reads

In [7]:
# Scale mean reads of train data first, then use the fitted scaler to scale test data
scaler, scaled_train_df = scale_mean_reads(train_df)
scaler, scaled_test_df = scale_mean_reads(test_df, scaler=scaler)

display(scaled_train_df.head(3))
display(scaled_test_df.head(3))

Scaler saved to ../artifacts/mean_reads_scaler.pkl


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0,"[0.007340399999999998, 2.9771799999999997, 108...","[-0.43079389735504386, -0.7090163040809951, -0..."
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0,"[0.00898787234042553, 3.961489361702128, 118.6...","[0.49106279705280276, -0.1994143266757514, 0.6..."
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0,"[0.011064705882352935, 7.299607843137254, 115....","[1.6531720821786586, 1.5288144651450055, 0.418..."


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0,"[0.008264378378378385, 4.223783783783786, 123....","[0.08622580166837883, -0.06361783691765782, 1...."
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0,"[0.006609244186046515, 3.2164244186046504, 109...","[-0.8399181449731496, -0.5851533870472007, -0...."
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0,"[0.0075699999999999995, 2.94054054054054, 105....","[-0.30231932947163226, -0.7279854836073506, -0..."


Extract middle sequence

In [8]:
scaled_train_df_w_middle_seq = extract_middle_sequence(scaled_train_df)
scaled_test_df_w_middle_seq = extract_middle_sequence(scaled_test_df)

display(scaled_train_df_w_middle_seq.head(3))
display(scaled_test_df_w_middle_seq.head(3))

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads,middle_sequence
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0,"[0.007340399999999998, 2.9771799999999997, 108...","[-0.43079389735504386, -0.7090163040809951, -0...",AAACT
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0,"[0.00898787234042553, 3.961489361702128, 118.6...","[0.49106279705280276, -0.1994143266757514, 0.6...",GGACC
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0,"[0.011064705882352935, 7.299607843137254, 115....","[1.6531720821786586, 1.5288144651450055, 0.418...",GGACT


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads,middle_sequence
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0,"[0.008264378378378385, 4.223783783783786, 123....","[0.08622580166837883, -0.06361783691765782, 1....",AGACC
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0,"[0.006609244186046515, 3.2164244186046504, 109...","[-0.8399181449731496, -0.5851533870472007, -0....",AAACT
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0,"[0.0075699999999999995, 2.94054054054054, 105....","[-0.30231932947163226, -0.7279854836073506, -0...",AAACA


One hot encode middle sequence (DRACH motif)

In [9]:
# Encode train data first, then use the fitted encoder to transform test data
encoder, ohe_train_df = one_hot_encode_DRACH(scaled_train_df_w_middle_seq)
encoder, ohe_test_df = one_hot_encode_DRACH(scaled_test_df_w_middle_seq, encoder=encoder)

display(ohe_train_df.head(3))
display(ohe_test_df.head(3))

DRACH Encoder saved to ../artifacts/drach_encoder.pkl


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads,middle_sequence,middle_sequence_OHE
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0,"[0.007340399999999998, 2.9771799999999997, 108...","[-0.43079389735504386, -0.7090163040809951, -0...",AAACT,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0,"[0.00898787234042553, 3.961489361702128, 118.6...","[0.49106279705280276, -0.1994143266757514, 0.6...",GGACC,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0,"[0.011064705882352935, 7.299607843137254, 115....","[1.6531720821786586, 1.5288144651450055, 0.418...",GGACT,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads,middle_sequence,middle_sequence_OHE
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0,"[0.008264378378378385, 4.223783783783786, 123....","[0.08622580166837883, -0.06361783691765782, 1....",AGACC,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0,"[0.006609244186046515, 3.2164244186046504, 109...","[-0.8399181449731496, -0.5851533870472007, -0....",AAACT,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0,"[0.0075699999999999995, 2.94054054054054, 105....","[-0.30231932947163226, -0.7279854836073506, -0...",AAACA,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### SMOTE

Check label imbalance (FOR SMOTE)

In [18]:
# Proportion of positive labels in the training set
np.mean(ohe_train_df['label'])

0.04520309457830705

#### By GENE_ID

In [19]:
# Proportion of positive labels by gene_id
prop_gene = ohe_train_df.groupby('gene_id')['label'].mean()

# Number of zeros
num_zeros = len(prop_gene[prop_gene == 0])
print('Number of gene_id with no postive labels: ', num_zeros)
print('Proportion of gene_id with no positive labels: ', num_zeros / len(prop_gene))

Number of gene_id with no postive labels:  1885
Proportion of gene_id with no positive labels:  0.6118143459915611


In [20]:
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# Get the scaled mean reads and one-hot encoded middle sequence columns as dataframe
X_resampled_list = [] 
y_resampled_list = []
gene_id_list = []

for g_id, group in ohe_train_df.groupby('gene_id'):
    X = np.vstack(group['scaled_mean_reads'].values)
    y = group['label'].values
    
    if sum(y) < 10 or sum(y - 1) < 10:  # Skip resampling if the number of positive or negative labels is less than 10
        X_resampled_list.append(X)
        y_resampled_list.append(y)
        gene_id_list.extend([g_id] * len(y))  # Use g_id for the gene_id
        continue

    # Apply SMOTE to the scaled mean reads and labels
    smote = SMOTE(random_state=4262)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Append resampled data to lists
    X_resampled_list.append(X_resampled)
    y_resampled_list.append(y_resampled)
    gene_id_list.extend([g_id] * len(y_resampled))  # Use g_id for the gene_id

# Combine all resampled data into final arrays
X_resampled_gene = np.vstack(X_resampled_list)
y_resampled_gene = np.concatenate(y_resampled_list)

# Combine into a final DataFrame if needed
resampled_gene_df = pd.DataFrame({
    'scaled_mean_reads': list(X_resampled_gene),
    'gene_id': gene_id_list,
    'label': y_resampled_gene
})


In [21]:
# Proportion of positive labels in the resampled data
print('Proportion of positive labels in the resampled data:', np.mean(resampled_gene_df['label']))

# Proportion of positive labels by gene_id
prop_gene = resampled_gene_df.groupby('gene_id')['label'].mean()

# Number of zeros
num_zeros = len(prop_gene[prop_gene == 0])
print('Number of gene_id with no postive labels: ', num_zeros)
print('Proportion of gene_id with no positive labels: ', num_zeros / len(prop_gene))

Proportion of positive labels in the resampled data: 0.04520309457830705
Number of gene_id with no postive labels:  1885
Proportion of gene_id with no positive labels:  0.6118143459915611


#### By MIDDLE_SEQUENCE (DRACH)

In [22]:
# Proportion of positive labels by middle_sequence
prop_drach = ohe_train_df.groupby('middle_sequence')['label'].mean()

# Number of zeros
num_zeros = len(prop_drach[prop_drach == 0])
print('Number of DRACH motifs with no postive labels: ', num_zeros)
print('Proportion of DRACH motifs with no positive labels: ', num_zeros / len(prop_drach))

Number of DRACH motifs with no postive labels:  0
Proportion of DRACH motifs with no positive labels:  0.0


In [23]:
# Number of positive labels by middle_sequence
ohe_train_df[ohe_train_df['label'] == 1].groupby('middle_sequence')['label'].count().sort_values()

middle_sequence
TAACA       1
TAACC       2
AAACC      10
AAACA      32
AGACC      51
TAACT      53
TGACA      59
TGACC      79
GAACC      95
AGACA     111
GAACA     136
AAACT     209
TGACT     355
GGACC     417
AGACT     429
GGACA     550
GAACT     653
GGACT    1146
Name: label, dtype: int64

In [24]:
ohe_test_df[ohe_test_df['label'] == 1].groupby('middle_sequence')['label'].count().sort_values()

middle_sequence
AAACC      1
AAACA      6
AGACC      9
TGACA      9
TGACC     11
TAACT     18
GAACC     29
AGACA     36
AAACT     46
GAACA     51
TGACT     75
AGACT     96
GAACT    130
GGACC    135
GGACA    141
GGACT    294
Name: label, dtype: int64

Apply SMOTE to each group of DRACH motif

In [25]:
from imblearn.over_sampling import SMOTE

# Get the scaled mean reads and one-hot encoded middle sequence columns as dataframe
X_resampled_list = [] 
y_resampled_list = []
middle_sequence_list = []

for middle_seq, group in ohe_train_df.groupby('middle_sequence'):
    X = np.vstack(group['scaled_mean_reads'].values)
    y = group['label'].values
    
    if sum(y) < 10: # Skip if the number of positive labels is less than 10
        X_resampled_list.append(X)
        y_resampled_list.append(y)
        middle_sequence_list.extend([middle_seq] * len(y))
        continue

    # Apply SMOTE to the scaled mean reads and labels
    smote = SMOTE(random_state=4262)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Append resampled data and one-hot encoding to lists
    X_resampled_list.append(X_resampled)
    y_resampled_list.append(y_resampled)
    middle_sequence_list.extend([middle_seq] * len(y_resampled))

# Combine all resampled data into final arrays
X_resampled = np.vstack(X_resampled_list)
y_resampled = np.concatenate(y_resampled_list)

# Combine into a final DataFrame if needed
resampled_drach_df = pd.DataFrame({
    'scaled_mean_reads': list(X_resampled),
    'middle_sequence': middle_sequence_list,
    'label': y_resampled
})

In [26]:
# Proportion of positive labels in the resampled data
print('Proportion of positive labels in the resampled data: ', np.mean(resampled_drach_df['label']))

# Proportion of positive labels by middle_sequence
prop_drach_resampled = resampled_drach_df.groupby('middle_sequence')['label'].mean()
prop_drach_resampled

Proportion of positive labels in the resampled data:  0.483202765619337


middle_sequence
AAACA    0.500000
AAACC    0.500000
AAACT    0.500000
AGACA    0.500000
AGACC    0.500000
AGACT    0.500000
GAACA    0.500000
GAACC    0.500000
GAACT    0.500000
GGACA    0.500000
GGACC    0.500000
GGACT    0.500000
TAACA    0.000271
TAACC    0.000856
TAACT    0.500000
TGACA    0.500000
TGACC    0.500000
TGACT    0.500000
Name: label, dtype: float64

One hot encode DRACH motifs

In [28]:
encoder, resampled_drach_df = one_hot_encode_DRACH(resampled_drach_df, encoder)
resampled_drach_df.head(3)

Unnamed: 0,scaled_mean_reads,middle_sequence,label,middle_sequence_OHE
0,"[2.5249757619759317, -0.7835678231993425, -0.2...",AAACA,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.6928199968356539, -1.1676038905620205, -0.7...",AAACA,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[0.3732826597732115, -0.8372266235169966, -0.2...",AAACA,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Prepare data for model input

In [29]:
# SMOTE by middle_sequence
X_resampled_drach = prepare_for_model(resampled_drach_df)
y_resampled_drach = resampled_drach_df['label'].values

# Without SMOTE
X_train = prepare_for_model(ohe_train_df)
y_train = ohe_train_df['label'].values

# Prepare test data
X_test = prepare_for_model(ohe_test_df)
y_test = ohe_test_df['label'].values

In [30]:
# print the shapes
print('X_resampled_drach shape:', X_resampled_drach.shape)
print('y_resampled_drach shape:', y_resampled_drach.shape)
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

X_resampled_drach shape: (179345, 27)
y_resampled_drach shape: (179345,)
X_train shape: (97073, 27)
y_train shape: (97073,)
X_test shape: (24765, 27)
y_test shape: (24765,)


### NN model

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import AUC

def build_model(input_shape):
    """
    Define and compile neural network model.
    """
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(150, activation='relu'),
        Dropout(0.2),  # Dropout layer for regularization
        Dense(32, activation='relu'),
        Dropout(0.2),  # Another dropout layer
        Dense(1, activation='sigmoid')
    ])
    # Keep track of AUC-PR and AUC-ROC during training
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[
            AUC(curve='PR', name='auc_pr'),  # AUC-PR
            AUC(curve='ROC', name='auc_roc')  # AUC-ROC
        ]
    )
    return model

#### Train model with and without SMOTE

In [32]:
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import ModelCheckpoint

# Set up 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=4262)
best_val_auc_pr_smote = 0
best_val_auc_pr_no_smote = 0
BEST_MODEL_PATH_SMOTE = "../artifacts/best_model_with_smote.keras"
BEST_MODEL_PATH_NO_SMOTE = "../artifacts/best_model_without_smote.keras"

# Set checkpoint to save the best model (based on validation AUC-PR)
checkpoint_smote = ModelCheckpoint(
        BEST_MODEL_PATH_SMOTE,
        save_best_only=True,
        monitor='val_auc_pr',
        mode='max'
    )

checkpoint_no_smote = ModelCheckpoint(
        BEST_MODEL_PATH_NO_SMOTE,
        save_best_only=True,
        monitor='val_auc_pr',
        mode='max'
    )

SMOTE model

In [33]:
# Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(kf.split(X_resampled_drach)):
    print(f"Training fold {fold + 1} with SMOTE")

    # Split the data with SMOTE for this fold
    X_train_smote, X_val_smote = X_resampled_drach[train_idx].copy(), X_resampled_drach[val_idx].copy()
    y_train_smote, y_val_smote = y_resampled_drach[train_idx].copy(), y_resampled_drach[val_idx].copy()

    # Initialize and compile the model
    model_smote = build_model(X_train_smote.shape[1])

    # Train the model on this fold with SMOTE data
    history_smote = model_smote.fit(
        X_train_smote, y_train_smote,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_smote, y_val_smote),
        callbacks=[checkpoint_smote]
    )

    # Track the best validation AUC-PR across folds with SMOTE
    fold_best_auc_pr_smote = max(history_smote.history['val_auc_pr'])
    if fold_best_auc_pr_smote > best_val_auc_pr_smote:
        best_val_auc_pr_smote = fold_best_auc_pr_smote

Training fold 1 with SMOTE
Epoch 1/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - auc_pr: 0.6921 - auc_roc: 0.7192 - loss: 0.6040 - val_auc_pr: 0.8304 - val_auc_roc: 0.8472 - val_loss: 0.4916
Epoch 2/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - auc_pr: 0.8160 - auc_roc: 0.8352 - loss: 0.4980 - val_auc_pr: 0.8637 - val_auc_roc: 0.8792 - val_loss: 0.4421
Epoch 3/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - auc_pr: 0.8435 - auc_roc: 0.8610 - loss: 0.4627 - val_auc_pr: 0.8776 - val_auc_roc: 0.8915 - val_loss: 0.4206
Epoch 4/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - auc_pr: 0.8569 - auc_roc: 0.8730 - loss: 0.4440 - val_auc_pr: 0.8880 - val_auc_roc: 0.9014 - val_loss: 0.4015
Epoch 5/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - auc_pr: 0.8651 - auc_roc: 0.8813 - loss: 0.4308 - val_auc_pr: 0.8902 - val_auc_roc: 0.

No SMOTE Model

In [34]:
# Cross-validation without SMOTE
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1} without SMOTE")

    # Split the data without SMOTE for this fold
    X_train_no_smote, X_val_no_smote = X_train[train_idx].copy(), X_train[val_idx].copy()
    y_train_no_smote, y_val_no_smote = y_train[train_idx].copy(), y_train[val_idx].copy()

    # Initialize and compile the model
    model_no_smote = build_model(X_train_no_smote.shape[1])

    # Train the model on this fold without SMOTE data
    history_no_smote = model_no_smote.fit(
        X_train_no_smote, y_train_no_smote,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_no_smote, y_val_no_smote),
        callbacks=[checkpoint_no_smote]
    )

    # Track the best validation AUC-PR across folds without SMOTE
    fold_best_auc_pr_no_smote = max(history_no_smote.history['val_auc_pr'])
    if fold_best_auc_pr_no_smote > best_val_auc_pr_no_smote:
        best_val_auc_pr_no_smote = fold_best_auc_pr_no_smote

Training fold 1 without SMOTE
Epoch 1/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - auc_pr: 0.1585 - auc_roc: 0.7597 - loss: 0.1709 - val_auc_pr: 0.3764 - val_auc_roc: 0.8639 - val_loss: 0.1335
Epoch 2/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - auc_pr: 0.3460 - auc_roc: 0.8465 - loss: 0.1373 - val_auc_pr: 0.3848 - val_auc_roc: 0.8664 - val_loss: 0.1333
Epoch 3/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - auc_pr: 0.3641 - auc_roc: 0.8636 - loss: 0.1330 - val_auc_pr: 0.4083 - val_auc_roc: 0.8768 - val_loss: 0.1265
Epoch 4/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - auc_pr: 0.3907 - auc_roc: 0.8594 - loss: 0.1335 - val_auc_pr: 0.4165 - val_auc_roc: 0.8817 - val_loss: 0.1254
Epoch 5/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - auc_pr: 0.3900 - auc_roc: 0.8698 - loss: 0.1288 - val_auc_pr: 0.4218 - val_auc_roc: 0

Get the metrics

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_curve, auc, average_precision_score
from tensorflow.keras.models import load_model

# Load the best model for both SMOTE and no-SMOTE versions
best_model_smote = load_model(BEST_MODEL_PATH_SMOTE)
best_model_no_smote = load_model(BEST_MODEL_PATH_NO_SMOTE)

# Generate predictions and probabilities for both models on the test data
threshold = 0.9 # Threshold for binary predictions

y_pred_smote = (best_model_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_smote = best_model_smote.predict(X_test).flatten()  # Probabilities

y_pred_no_smote = (best_model_no_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_no_smote = best_model_no_smote.predict(X_test).flatten()  # Probabilities

# Calculate metrics for SMOTE model
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_smote)
roc_auc_smote = roc_auc_score(y_test, y_proba_smote)
pr_auc_smote = auc(recall, precision)
accuracy_smote = accuracy_score(y_test, y_pred_smote)
average_precision_smote = average_precision_score(y_test, y_proba_smote)

# Calculate metrics for No-SMOTE model
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_no_smote)
roc_auc_no_smote = roc_auc_score(y_test, y_proba_no_smote)
pr_auc_no_smote = auc(recall, precision)
accuracy_no_smote = accuracy_score(y_test, y_pred_no_smote)
average_precision_no_smote = average_precision_score(y_test, y_proba_no_smote)

# Print comparison results
print("Metrics Comparison on Test Data:")
print("\nWith SMOTE:")
print(f"AUC-ROC: {roc_auc_smote:.4f}")
print(f"AUC-PR: {pr_auc_smote:.4f}")
print(f"Accuracy: {accuracy_smote:.4f}")
print(f"Average Precision Score: {average_precision_smote:.4f}")

print("\nWithout SMOTE:")
print(f"AUC-ROC: {roc_auc_no_smote:.4f}")
print(f"AUC-PR: {pr_auc_no_smote:.4f}")
print(f"Accuracy: {accuracy_no_smote:.4f}")
print(f"Average Precision Score: {average_precision_no_smote:.4f}")

[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 350us/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 334us/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356us/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336us/step
Metrics Comparison on Test Data:

With SMOTE:
AUC-ROC: 0.8528
AUC-PR: 0.3488
Accuracy: 0.9580
Average Precision Score: 0.3493

Without SMOTE:
AUC-ROC: 0.8949
AUC-PR: 0.4714
Accuracy: 0.9561
Average Precision Score: 0.4718


### Improving Model 

SMOTE seems to be overfitting the data: try regularisers

In [None]:
#first layer 64

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_curve, auc, average_precision_score
from tensorflow.keras.models import load_model

# Load the best model for both SMOTE and no-SMOTE versions
best_model_smote = load_model(BEST_MODEL_PATH_SMOTE)
best_model_no_smote = load_model(BEST_MODEL_PATH_NO_SMOTE)

# Generate predictions and probabilities for both models on the test data
threshold = 0.9 # Threshold for binary predictions

y_pred_smote = (best_model_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_smote = best_model_smote.predict(X_test).flatten()  # Probabilities

y_pred_no_smote = (best_model_no_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_no_smote = best_model_no_smote.predict(X_test).flatten()  # Probabilities

# Calculate metrics for SMOTE model
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_smote)
roc_auc_smote = roc_auc_score(y_test, y_proba_smote)
pr_auc_smote = auc(recall, precision)
accuracy_smote = accuracy_score(y_test, y_pred_smote)
average_precision_smote = average_precision_score(y_test, y_proba_smote)

# Calculate metrics for No-SMOTE model
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_no_smote)
roc_auc_no_smote = roc_auc_score(y_test, y_proba_no_smote)
pr_auc_no_smote = auc(recall, precision)
accuracy_no_smote = accuracy_score(y_test, y_pred_no_smote)
average_precision_no_smote = average_precision_score(y_test, y_proba_no_smote)

# Print comparison results
print("Metrics Comparison on Test Data:")
print("\nWith SMOTE:")
print(f"AUC-ROC: {roc_auc_smote:.4f}")
print(f"AUC-PR: {pr_auc_smote:.4f}")
print(f"Accuracy: {accuracy_smote:.4f}")
print(f"Average Precision Score: {average_precision_smote:.4f}")

print("\nWithout SMOTE:")
print(f"AUC-ROC: {roc_auc_no_smote:.4f}")
print(f"AUC-PR: {pr_auc_no_smote:.4f}")
print(f"Accuracy: {accuracy_no_smote:.4f}")
print(f"Average Precision Score: {average_precision_no_smote:.4f}")

[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Metrics Comparison on Test Data:

With SMOTE:
AUC-ROC: 0.8274
AUC-PR: 0.2948
Accuracy: 0.9558
Average Precision Score: 0.2953

Without SMOTE:
AUC-ROC: 0.8915
AUC-PR: 0.4606
Accuracy: 0.9561
Average Precision Score: 0.4611


WITHOUT K FOLD

In [39]:
# without kfold
no_smote = build_model(X_train.shape[1])
no_smote.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

with_smote = build_model(X_resampled_drach.shape[1])
with_smote.fit(X_resampled_drach, y_resampled_drach, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - auc_pr: 0.1884 - auc_roc: 0.7745 - loss: 0.1664 - val_auc_pr: 0.3641 - val_auc_roc: 0.8716 - val_loss: 0.1319
Epoch 2/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - auc_pr: 0.3865 - auc_roc: 0.8702 - loss: 0.1311 - val_auc_pr: 0.3756 - val_auc_roc: 0.8741 - val_loss: 0.1353
Epoch 3/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - auc_pr: 0.4067 - auc_roc: 0.8660 - loss: 0.1319 - val_auc_pr: 0.3740 - val_auc_roc: 0.8662 - val_loss: 0.1335
Epoch 4/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - auc_pr: 0.4150 - auc_roc: 0.8771 - loss: 0.1260 - val_auc_pr: 0.3904 - val_auc_roc: 0.8755 - val_loss: 0.1290
Epoch 5/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - auc_pr: 0.4177 - auc_roc: 0.8829 - loss: 0.1256 - val_auc_pr: 0.4029 - val_auc_roc: 0.8822 - val_loss: 0.1276
Epoch

<keras.src.callbacks.history.History at 0x21a55216950>

In [40]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_curve, auc, average_precision_score

# Generate predictions and probabilities for both models on the test data
threshold = 0.9 # Threshold for binary predictions

y_pred_smote = (with_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_smote = with_smote.predict(X_test).flatten()  # Probabilities

y_pred_no_smote = (no_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_no_smote = no_smote.predict(X_test).flatten()  # Probabilities

# Calculate metrics for SMOTE model
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_smote)
roc_auc_smote = roc_auc_score(y_test, y_proba_smote)
pr_auc_smote = auc(recall, precision)
accuracy_smote = accuracy_score(y_test, y_pred_smote)
average_precision_smote = average_precision_score(y_test, y_proba_smote)

# Calculate metrics for No-SMOTE model
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_no_smote)
roc_auc_no_smote = roc_auc_score(y_test, y_proba_no_smote)
pr_auc_no_smote = auc(recall, precision)
accuracy_no_smote = accuracy_score(y_test, y_pred_no_smote)
average_precision_no_smote = average_precision_score(y_test, y_proba_no_smote)

# Print comparison results
print("Metrics Comparison on Test Data:")
print("\nWith SMOTE:")
print(f"AUC-ROC: {roc_auc_smote:.4f}")
print(f"AUC-PR: {pr_auc_smote:.4f}")
print(f"Accuracy: {accuracy_smote:.4f}")
print(f"Average Precision Score: {average_precision_smote:.4f}")

print("\nWithout SMOTE:")
print(f"AUC-ROC: {roc_auc_no_smote:.4f}")
print(f"AUC-PR: {pr_auc_no_smote:.4f}")
print(f"Accuracy: {accuracy_no_smote:.4f}")
print(f"Average Precision Score: {average_precision_no_smote:.4f}")

[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Metrics Comparison on Test Data:

With SMOTE:
AUC-ROC: 0.8190
AUC-PR: 0.2921
Accuracy: 0.9567
Average Precision Score: 0.2931

Without SMOTE:
AUC-ROC: 0.8943
AUC-PR: 0.4663
Accuracy: 0.9561
Average Precision Score: 0.4667
