### Helper Functions For Preprocessing Data

In [1]:
from helper_functions import *

### Preprocessing

Load Data

In [14]:
labels = pd.read_csv('data.info.labelled')
labels.head(3)

Unnamed: 0,gene_id,transcript_id,transcript_position,label
0,ENSG00000004059,ENST00000000233,244,0
1,ENSG00000004059,ENST00000000233,261,0
2,ENSG00000004059,ENST00000000233,316,0


In [None]:
df = load_data_to_dataframe('dataset0.json.gz')
df.head(3)

Assign labels to the data

In [16]:
df = combine_data(df, labels)
df.head(3)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0


Extract mean reads

In [5]:
df = extract_mean_reads(df)
df.head(3)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0,"[0.008264378378378385, 4.223783783783786, 123...."
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0,"[0.006609244186046515, 3.2164244186046504, 109..."
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0,"[0.0075699999999999995, 2.94054054054054, 105...."


Train test split by gene_id

In [6]:
# Train test split by gene id
from sklearn.model_selection import train_test_split
train_gene_ids, test_gene_ids = train_test_split(df['gene_id'].unique(), test_size=0.2, random_state=4262)

train_df = df[df['gene_id'].isin(train_gene_ids)].copy()
test_df = df[df['gene_id'].isin(test_gene_ids)].copy()

Scale mean reads

In [7]:
# Scale mean reads of train data first
train_df = scale_mean_reads(train_df)
train_df.head(3)

Scaler saved to mean_reads_scaler.pkl


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0,"[0.007340399999999998, 2.9771799999999997, 108...","[-0.43079389735504386, -0.7090163040809951, -0..."
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0,"[0.00898787234042553, 3.961489361702128, 118.6...","[0.49106279705280276, -0.1994143266757514, 0.6..."
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0,"[0.011064705882352935, 7.299607843137254, 115....","[1.6531720821786586, 1.5288144651450055, 0.418..."


In [8]:
# Load the saved scaler to transform the test data
scaler = load_scaler()
test_df = scale_mean_reads(test_df, scaler=scaler)

Extract middle sequence

In [9]:
train_df = extract_middle_sequence(train_df)
train_df.head(3)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads,middle_sequence
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0,"[0.007340399999999998, 2.9771799999999997, 108...","[-0.43079389735504386, -0.7090163040809951, -0...",AAACT
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0,"[0.00898787234042553, 3.961489361702128, 118.6...","[0.49106279705280276, -0.1994143266757514, 0.6...",GGACC
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0,"[0.011064705882352935, 7.299607843137254, 115....","[1.6531720821786586, 1.5288144651450055, 0.418...",GGACT


In [10]:
test_df = extract_middle_sequence(test_df)

One hot encode middle sequence (DRACH motif)

In [11]:
train_df = one_hot_encode_DRACH(train_df)
train_df.head(3)

DRACH Encoder saved to drach_encoder.pkl


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads,middle_sequence,middle_sequence_OHE
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0,"[0.007340399999999998, 2.9771799999999997, 108...","[-0.43079389735504386, -0.7090163040809951, -0...",AAACT,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0,"[0.00898787234042553, 3.961489361702128, 118.6...","[0.49106279705280276, -0.1994143266757514, 0.6...",GGACC,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0,"[0.011064705882352935, 7.299607843137254, 115....","[1.6531720821786586, 1.5288144651450055, 0.418...",GGACT,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
# Load the saved DRACH encoder to transform the test data
encoder = load_DRACH_encoder()
test_df = one_hot_encode_DRACH(test_df, encoder=encoder)

### SMOTE

Check label imbalance (FOR SMOTE)

In [30]:
# Proportion of positive labels in the training set
np.mean(train_df['label'])

0.04520309457830705

#### By GENE_ID

In [31]:
# Proportion of positive labels by gene_id
prop_gene = train_df.groupby('gene_id')['label'].mean()

# Number of zeros
num_zeros = len(prop_gene[prop_gene == 0])
print('Number of gene_id with no postive labels: ', num_zeros)
print('Proportion of gene_id with no positive labels: ', num_zeros / len(prop_gene))

Number of gene_id with no postive labels:  1885
Proportion of gene_id with no positive labels:  0.6118143459915611


In [32]:
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# Get the scaled mean reads and one-hot encoded middle sequence columns as dataframe
X_resampled_list = [] 
y_resampled_list = []
gene_id_list = []

for g_id, group in train_df.groupby('gene_id'):
    X = np.vstack(group['scaled_mean_reads'].values)
    y = group['label'].values
    
    if sum(y) < 10 or sum(y - 1) < 10:  # Skip resampling if the number of positive or negative labels is less than 10
        X_resampled_list.append(X)
        y_resampled_list.append(y)
        gene_id_list.extend([g_id] * len(y))  # Use g_id for the gene_id
        continue

    # Apply SMOTE to the scaled mean reads and labels
    smote = SMOTE(random_state=4262)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Append resampled data to lists
    X_resampled_list.append(X_resampled)
    y_resampled_list.append(y_resampled)
    gene_id_list.extend([g_id] * len(y_resampled))  # Use g_id for the gene_id

# Combine all resampled data into final arrays
X_resampled_gene = np.vstack(X_resampled_list)
y_resampled_gene = np.concatenate(y_resampled_list)

# Combine into a final DataFrame if needed
resampled_gene_df = pd.DataFrame({
    'scaled_mean_reads': list(X_resampled_gene),
    'gene_id': gene_id_list,
    'label': y_resampled_gene
})


In [33]:
# Proportion of positive labels in the resampled data
print('Proportion of positive labels in the resampled data:', np.mean(resampled_gene_df['label']))

# Proportion of positive labels by gene_id
prop_gene = resampled_gene_df.groupby('gene_id')['label'].mean()

# Number of zeros
num_zeros = len(prop_gene[prop_gene == 0])
print('Number of gene_id with no postive labels: ', num_zeros)
print('Proportion of gene_id with no positive labels: ', num_zeros / len(prop_gene))

Proportion of positive labels in the resampled data: 0.04520309457830705
Number of gene_id with no postive labels:  1885
Proportion of gene_id with no positive labels:  0.6118143459915611


#### By MIDDLE_SEQUENCE (DRACH)

In [34]:
# Proportion of positive labels by middle_sequence
prop_drach = train_df.groupby('middle_sequence')['label'].mean()

# Number of zeros
num_zeros = len(prop_drach[prop_drach == 0])
print('Number of DRACH motifs with no postive labels: ', num_zeros)
print('Proportion of DRACH motifs with no positive labels: ', num_zeros / len(prop_drach))

Number of DRACH motifs with no postive labels:  0
Proportion of DRACH motifs with no positive labels:  0.0


In [None]:
# Number of positive labels by middle_sequence
train_df[train_df['label'] == 1].groupby('middle_sequence')['label'].count().sort_values()

middle_sequence
TAACA       1
TAACC       2
AAACC      10
AAACA      32
AGACC      51
TAACT      53
TGACA      59
TGACC      79
GAACC      95
AGACA     111
GAACA     136
AAACT     209
TGACT     355
GGACC     417
AGACT     429
GGACA     550
GAACT     653
GGACT    1146
Name: label, dtype: int64

In [75]:
test_df[test_df['label'] == 1].groupby('middle_sequence')['label'].count().sort_values()

middle_sequence
AAACC      1
AAACA      6
AGACC      9
TGACA      9
TGACC     11
TAACT     18
GAACC     29
AGACA     36
AAACT     46
GAACA     51
TGACT     75
AGACT     96
GAACT    130
GGACC    135
GGACA    141
GGACT    294
Name: label, dtype: int64

Apply SMOTE to each group of DRACH motif

In [36]:
from imblearn.over_sampling import SMOTE

# Get the scaled mean reads and one-hot encoded middle sequence columns as dataframe
X_resampled_list = [] 
y_resampled_list = []
middle_sequence_list = []

for middle_seq, group in train_df.groupby('middle_sequence'):
    X = np.vstack(group['scaled_mean_reads'].values)
    y = group['label'].values
    
    if sum(y) < 10: # Skip if the number of positive labels is less than 10
        X_resampled_list.append(X)
        y_resampled_list.append(y)
        middle_sequence_list.extend([middle_seq] * len(y))
        continue

    # Apply SMOTE to the scaled mean reads and labels
    smote = SMOTE(random_state=4262)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Append resampled data and one-hot encoding to lists
    X_resampled_list.append(X_resampled)
    y_resampled_list.append(y_resampled)
    middle_sequence_list.extend([middle_seq] * len(y_resampled))

# Combine all resampled data into final arrays
X_resampled = np.vstack(X_resampled_list)
y_resampled = np.concatenate(y_resampled_list)

# Combine into a final DataFrame if needed
resampled_drach_df = pd.DataFrame({
    'scaled_mean_reads': list(X_resampled),
    'middle_sequence': middle_sequence_list,
    'label': y_resampled
})

In [37]:
# Proportion of positive labels in the resampled data
print('Proportion of positive labels in the resampled data: ', np.mean(resampled_drach_df['label']))

# Proportion of positive labels by middle_sequence
prop_drach_resampled = resampled_drach_df.groupby('middle_sequence')['label'].mean()
prop_drach_resampled

Proportion of positive labels in the resampled data:  0.483202765619337


middle_sequence
AAACA    0.500000
AAACC    0.500000
AAACT    0.500000
AGACA    0.500000
AGACC    0.500000
AGACT    0.500000
GAACA    0.500000
GAACC    0.500000
GAACT    0.500000
GGACA    0.500000
GGACC    0.500000
GGACT    0.500000
TAACA    0.000271
TAACC    0.000856
TAACT    0.500000
TGACA    0.500000
TGACC    0.500000
TGACT    0.500000
Name: label, dtype: float64

One hot encode DRACH motifs

In [40]:
resampled_drach_df = one_hot_encode_DRACH(resampled_drach_df, encoder)
resampled_drach_df.head(3)

Unnamed: 0,scaled_mean_reads,middle_sequence,label,middle_sequence_OHE
0,"[2.5249757619759317, -0.7835678231993425, -0.2...",AAACA,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.6928199968356539, -1.1676038905620205, -0.7...",AAACA,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[0.3732826597732115, -0.8372266235169966, -0.2...",AAACA,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Prepare data for model input

In [63]:
# SMOTE by middle_sequence
X_resampled_drach = prepare_for_model(resampled_drach_df)
y_resampled_drach = resampled_drach_df['label'].values

# Without SMOTE
X_train = prepare_for_model(train_df)
y_train = train_df['label'].values

# Prepare test data
X_test = prepare_for_model(test_df)
y_test = test_df['label'].values

In [68]:
# print the shapes
print('X_resampled_drach shape:', X_resampled_drach.shape)
print('y_resampled_drach shape:', y_resampled_drach.shape)
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

X_resampled_drach shape: (179345, 27)
y_resampled_drach shape: (179345,)
X_train shape: (97073, 27)
y_train shape: (97073,)
X_test shape: (24765, 27)
y_test shape: (24765,)


### NN model

In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import AUC

def build_model(input_shape):
    """
    Define and compile neural network model.
    """
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(150, activation='relu'),
        Dropout(0.2),  # Dropout layer for regularization
        Dense(32, activation='relu'),
        Dropout(0.2),  # Another dropout layer
        Dense(1, activation='sigmoid')
    ])
    # Keep track of AUC-PR and AUC-ROC during training
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[
            AUC(curve='PR', name='auc_pr'),  # AUC-PR
            AUC(curve='ROC', name='auc_roc')  # AUC-ROC
        ]
    )
    return model

#### Train model with and without SMOTE

In [50]:
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import ModelCheckpoint

# Set up 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=4262)
best_val_auc_pr_smote = 0
best_val_auc_pr_no_smote = 0
best_model_path_smote = 'best_model_with_smote.keras'
best_model_path_no_smote = 'best_model_without_smote.keras'

# Set checkpoint to save the best model (based on validation AUC-PR)
checkpoint_smote = ModelCheckpoint(
        best_model_path_smote,
        save_best_only=True,
        monitor='val_auc_pr',
        mode='max'
    )

checkpoint_no_smote = ModelCheckpoint(
        best_model_path_no_smote,
        save_best_only=True,
        monitor='val_auc_pr',
        mode='max'
    )

SMOTE model

In [69]:
# Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(kf.split(X_resampled_drach)):
    print(f"Training fold {fold + 1} with SMOTE")

    # Split the data with SMOTE for this fold
    X_train_smote, X_val_smote = X_resampled_drach[train_idx].copy(), X_resampled_drach[val_idx].copy()
    y_train_smote, y_val_smote = y_resampled_drach[train_idx].copy(), y_resampled_drach[val_idx].copy()

    # Initialize and compile the model
    model_smote = build_model(X_train_smote.shape[1])

    # Train the model on this fold with SMOTE data
    history_smote = model_smote.fit(
        X_train_smote, y_train_smote,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_smote, y_val_smote),
        callbacks=[checkpoint_smote]
    )

    # Track the best validation AUC-PR across folds with SMOTE
    fold_best_auc_pr_smote = max(history_smote.history['val_auc_pr'])
    if fold_best_auc_pr_smote > best_val_auc_pr_smote:
        best_val_auc_pr_smote = fold_best_auc_pr_smote

Training fold 1 with SMOTE
Epoch 1/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - auc_pr: 0.7265 - auc_roc: 0.7499 - loss: 0.5798 - val_auc_pr: 0.8480 - val_auc_roc: 0.8624 - val_loss: 0.4625
Epoch 2/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - auc_pr: 0.8422 - auc_roc: 0.8565 - loss: 0.4675 - val_auc_pr: 0.8802 - val_auc_roc: 0.8922 - val_loss: 0.4161
Epoch 3/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - auc_pr: 0.8672 - auc_roc: 0.8820 - loss: 0.4284 - val_auc_pr: 0.8922 - val_auc_roc: 0.9059 - val_loss: 0.3912
Epoch 4/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - auc_pr: 0.8800 - auc_roc: 0.8939 - loss: 0.4080 - val_auc_pr: 0.9040 - val_auc_roc: 0.9129 - val_loss: 0.3751
Epoch 5/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - auc_pr: 0.8890 - auc_roc: 0.9034 - loss: 0.3908 - val_auc_pr: 0.9105 - val_auc_roc: 0.

No SMOTE Model

In [52]:
# Cross-validation without SMOTE
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1} without SMOTE")

    # Split the data without SMOTE for this fold
    X_train_no_smote, X_val_no_smote = X_train[train_idx].copy(), X_train[val_idx].copy()
    y_train_no_smote, y_val_no_smote = y_train[train_idx].copy(), y_train[val_idx].copy()

    # Initialize and compile the model
    model_no_smote = build_model(X_train_no_smote.shape[1])

    # Train the model on this fold without SMOTE data
    history_no_smote = model_no_smote.fit(
        X_train_no_smote, y_train_no_smote,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_no_smote, y_val_no_smote),
        callbacks=[checkpoint_no_smote]
    )

    # Track the best validation AUC-PR across folds without SMOTE
    fold_best_auc_pr_no_smote = max(history_no_smote.history['val_auc_pr'])
    if fold_best_auc_pr_no_smote > best_val_auc_pr_no_smote:
        best_val_auc_pr_no_smote = fold_best_auc_pr_no_smote

Training fold 1 without SMOTE
Epoch 1/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - auc_pr: 0.1864 - auc_roc: 0.7835 - loss: 0.1675 - val_auc_pr: 0.3838 - val_auc_roc: 0.8717 - val_loss: 0.1318
Epoch 2/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - auc_pr: 0.3674 - auc_roc: 0.8603 - loss: 0.1317 - val_auc_pr: 0.4032 - val_auc_roc: 0.8769 - val_loss: 0.1284
Epoch 3/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - auc_pr: 0.4045 - auc_roc: 0.8737 - loss: 0.1288 - val_auc_pr: 0.4157 - val_auc_roc: 0.8763 - val_loss: 0.1271
Epoch 4/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - auc_pr: 0.4038 - auc_roc: 0.8688 - loss: 0.1294 - val_auc_pr: 0.4259 - val_auc_roc: 0.8803 - val_loss: 0.1246
Epoch 5/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - auc_pr: 0.4236 - auc_roc: 0.8805 - loss: 0.1265 - val_auc_pr: 0.4343 - val_auc_roc: 0

Get the metrics

In [71]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_curve, auc, average_precision_score
from tensorflow.keras.models import load_model

# Load the best model for both SMOTE and no-SMOTE versions
best_model_smote = load_model(best_model_path_smote)
best_model_no_smote = load_model(best_model_path_no_smote)

# Generate predictions and probabilities for both models on the test data
threshold = 0.9 # Threshold for binary predictions

y_pred_smote = (best_model_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_smote = best_model_smote.predict(X_test).flatten()  # Probabilities

y_pred_no_smote = (best_model_no_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_no_smote = best_model_no_smote.predict(X_test).flatten()  # Probabilities

# Calculate metrics for SMOTE model
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_smote)
roc_auc_smote = roc_auc_score(y_test, y_proba_smote)
pr_auc_smote = auc(recall, precision)
accuracy_smote = accuracy_score(y_test, y_pred_smote)
average_precision_smote = average_precision_score(y_test, y_proba_smote)

# Calculate metrics for No-SMOTE model
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_no_smote)
roc_auc_no_smote = roc_auc_score(y_test, y_proba_no_smote)
pr_auc_no_smote = auc(recall, precision)
accuracy_no_smote = accuracy_score(y_test, y_pred_no_smote)
average_precision_no_smote = average_precision_score(y_test, y_proba_no_smote)

# Print comparison results
print("Metrics Comparison on Test Data:")
print("\nWith SMOTE:")
print(f"AUC-ROC: {roc_auc_smote:.4f}")
print(f"AUC-PR: {pr_auc_smote:.4f}")
print(f"Accuracy: {accuracy_smote:.4f}")
print(f"Average Precision Score: {average_precision_smote:.4f}")

print("\nWithout SMOTE:")
print(f"AUC-ROC: {roc_auc_no_smote:.4f}")
print(f"AUC-PR: {pr_auc_no_smote:.4f}")
print(f"Accuracy: {accuracy_no_smote:.4f}")
print(f"Average Precision Score: {average_precision_no_smote:.4f}")

[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Metrics Comparison on Test Data:

With SMOTE:
AUC-ROC: 0.8528
AUC-PR: 0.3382
Accuracy: 0.9577
Average Precision Score: 0.3389

Without SMOTE:
AUC-ROC: 0.8908
AUC-PR: 0.4596
Accuracy: 0.9561
Average Precision Score: 0.4601


### Improving Model 

SMOTE seems to be overfitting the data: try regularisers