### Helper Functions For Preprocessing Data

In [1]:
import gzip
import json
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def load_json_gz_to_dataframe(file_path):
    """
    Load gzipped JSON data into a DataFrame.
    """
    data = []
    with gzip.open(file_path, 'rt') as f:
        for line in f:
            json_data = json.loads(line)
            for transcript, positions in json_data.items():
                for position, sequences in positions.items():
                    position = int(position)
                    for sequence, reads in sequences.items():
                        data.append({
                            'transcript_id': transcript,
                            'position': position,
                            'sequence': sequence,
                            'reads': reads
                        })
    return pd.DataFrame(data)

def extract_mean_reads(dataset):
    """
    Compute the mean of 'reads' for each row.
    """
    dataset['mean_reads'] = dataset['reads'].apply(lambda x: np.mean(x, axis=0))
    return dataset

def scale_mean_reads(dataset, scaler=None, scaler_path='mean_reads_scaler.pkl'):
    """
    Scale the 'mean_reads' column using StandardScaler.
    """
    if scaler is None: # If no scaler is provided, fit a new one and save it
        scaler = StandardScaler()
        scaled_mean_reads = scaler.fit_transform(np.vstack(dataset['mean_reads'].values))
        dataset['scaled_mean_reads'] = list(scaled_mean_reads)
        joblib.dump(scaler, scaler_path)
        print('Scaler saved to', scaler_path)
    else: # Use the provided scaler to transform the data
        scaled_mean_reads = scaler.transform(np.vstack(dataset['mean_reads'].values))
        dataset['scaled_mean_reads'] = list(scaled_mean_reads)
    return dataset

def load_scaler(scaler_path='mean_reads_scaler.pkl'):
    """
    Load the saved scaler from the given path.
    """
    return joblib.load(scaler_path)

def drach_encoder():
    """
    Return a OneHotEncoder object with predefined DRACH motifs.
    """
    # Define DRACH motifs to be used for one-hot encoding
    D, R, A, C, H = ['A', 'G', 'T'], ['A', 'G'], ['A'], ['C'], ['A', 'C', 'T']
    drach_motifs = [d + r + a + c + h for d in D for r in R for a in A for c in C for h in H]
    encoder = OneHotEncoder(categories=[drach_motifs], handle_unknown='ignore')
    return encoder

def extract_middle_sequence(dataset):
    """
    Extract the middle 5-mers sequence from the 'sequence' column.
    """
    dataset['middle_sequence'] = dataset['sequence'].apply(lambda x: x[1:-1])
    return dataset

def one_hot_encode_DRACH(dataset, encoder=None, encoder_path='drach_encoder.pkl'):
    """
    Apply one-hot encoding to the middle 5-mers sequence
    """
    # One-hot encode the middle sequence
    if encoder is None: # If no encoder is provided, fit a new one and save it
        encoder = drach_encoder()
        one_hot_matrix = encoder.fit_transform(dataset[['middle_sequence']])
        joblib.dump(encoder, encoder_path)
        print('DRACH Encoder saved to', encoder_path)
    else:
        one_hot_matrix = encoder.transform(dataset[['middle_sequence']])
    dataset['middle_sequence_OHE'] = list(one_hot_matrix.toarray())
    return dataset

def load_DRACH_encoder(encoder_path='drach_encoder.pkl'):
    """
    Load the saved DRACH encoder from the given path.
    """
    return joblib.load(encoder_path)

def combine_data(dataset, labels):
    """
    Combine dataset with labels
    """
    # Left join dataset with labels on 'transcript_id' and 'position'
    merged_df = pd.merge(dataset, labels,
                         left_on=['transcript_id', 'position'],
                         right_on=['transcript_id', 'transcript_position'],
                         how='left')
    # Reorder gene_id to the first column and drop duplicate columns
    gene_id = merged_df['gene_id']
    merged_df = merged_df.drop(columns=['transcript_position', 'gene_id'])
    merged_df.insert(0, 'gene_id', gene_id)
    return merged_df

def prepare_for_model(dataset):
    """
    Combine 'scaled_mean_reads' and `middle_sequence_OHE` for model input.
    """
    combined_features = np.hstack([np.vstack(dataset['scaled_mean_reads']), np.vstack(dataset['middle_sequence_OHE'])])
    return combined_features

### Preprocessing

Load Data

In [2]:
labels = pd.read_csv('data.info.labelled')
labels.head(3)

Unnamed: 0,gene_id,transcript_id,transcript_position,label
0,ENSG00000004059,ENST00000000233,244,0
1,ENSG00000004059,ENST00000000233,261,0
2,ENSG00000004059,ENST00000000233,316,0


In [5]:
df = load_json_gz_to_dataframe('dataset0.json.gz')
df.head(3)

Unnamed: 0,transcript_id,position,sequence,reads
0,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0..."
1,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0...."
2,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0..."


Assign labels to the data

In [6]:
df = combine_data(df, labels)
df.head(3)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0


Extract mean reads

In [7]:
df = extract_mean_reads(df)
df.head(3)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0,"[0.008264378378378385, 4.223783783783786, 123...."
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0,"[0.006609244186046515, 3.2164244186046504, 109..."
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0,"[0.0075699999999999995, 2.94054054054054, 105...."


Train test split by gene_id

In [8]:
# Train test split by gene id
from sklearn.model_selection import train_test_split
train_gene_ids, test_gene_ids = train_test_split(df['gene_id'].unique(), test_size=0.2, random_state=4262)

train_df = df[df['gene_id'].isin(train_gene_ids)].copy()
test_df = df[df['gene_id'].isin(test_gene_ids)].copy()

Scale mean reads

In [9]:
# Scale mean reads of train data first
train_df = scale_mean_reads(train_df)
train_df.head(3)

Scaler saved to mean_reads_scaler.pkl


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0,"[0.007340399999999998, 2.9771799999999997, 108...","[-0.43079389735504386, -0.7090163040809951, -0..."
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0,"[0.00898787234042553, 3.961489361702128, 118.6...","[0.49106279705280276, -0.1994143266757514, 0.6..."
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0,"[0.011064705882352935, 7.299607843137254, 115....","[1.6531720821786586, 1.5288144651450055, 0.418..."


In [10]:
# Load the saved scaler to transform the test data
scaler = load_scaler()
test_df = scale_mean_reads(test_df, scaler=scaler)

Extract middle sequence

In [11]:
train_df = extract_middle_sequence(train_df)
train_df.head(3)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads,middle_sequence
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0,"[0.007340399999999998, 2.9771799999999997, 108...","[-0.43079389735504386, -0.7090163040809951, -0...",AAACT
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0,"[0.00898787234042553, 3.961489361702128, 118.6...","[0.49106279705280276, -0.1994143266757514, 0.6...",GGACC
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0,"[0.011064705882352935, 7.299607843137254, 115....","[1.6531720821786586, 1.5288144651450055, 0.418...",GGACT


In [12]:
test_df = extract_middle_sequence(test_df)

One hot encode middle sequence (DRACH motif)

In [13]:
train_df = one_hot_encode_DRACH(train_df)
train_df.head(3)

DRACH Encoder saved to drach_encoder.pkl


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads,middle_sequence,middle_sequence_OHE
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0,"[0.007340399999999998, 2.9771799999999997, 108...","[-0.43079389735504386, -0.7090163040809951, -0...",AAACT,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0,"[0.00898787234042553, 3.961489361702128, 118.6...","[0.49106279705280276, -0.1994143266757514, 0.6...",GGACC,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0,"[0.011064705882352935, 7.299607843137254, 115....","[1.6531720821786586, 1.5288144651450055, 0.418...",GGACT,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [14]:
# Load the saved DRACH encoder to transform the test data
encoder = load_DRACH_encoder()
test_df = one_hot_encode_DRACH(test_df, encoder=encoder)

### SMOTE

Check label imbalance (FOR SMOTE)

In [15]:
# Proportion of positive labels in the training set
np.mean(train_df['label'])

0.04520309457830705

GENE_ID

In [51]:
# Proportion of positive labels by gene_id
prop_gene_id = train_df.groupby('gene_id')['label'].mean()

# Number of zeros
num_zeros = len(prop_gene_id[prop_gene_id == 0])
print('Number of gene_id with no postive labels: ', num_zeros)
print('Proportion of gene_id with no positive labels: ', num_zeros / len(prop_gene_id))

Number of gene_id with no postive labels:  1885
Proportion of gene_id with no positive labels:  0.6118143459915611


In [61]:
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# Get the scaled mean reads and one-hot encoded middle sequence columns as dataframe
X_resampled_list = [] 
y_resampled_list = []
gene_id_list = []

for g_id, group in train_df.groupby('gene_id'):
    X = np.vstack(group['scaled_mean_reads'].values)
    y = group['label'].values
    
    if sum(y) < 10 or sum(y - 1) < 10:  # Skip resampling if the number of positive labels is less than 10
        X_resampled_list.append(X)
        y_resampled_list.append(y)
        gene_id_list.extend([g_id] * len(y))  # Use g_id for the gene_id
        continue

    # Apply SMOTE to the scaled mean reads and labels
    smote = SMOTE(random_state=4262)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Append resampled data to lists
    X_resampled_list.append(X_resampled)
    y_resampled_list.append(y_resampled)
    gene_id_list.extend([g_id] * len(y_resampled))  # Use g_id for the gene_id

# Combine all resampled data into final arrays
X_resampled_gene = np.vstack(X_resampled_list)
y_resampled_gene = np.concatenate(y_resampled_list)

# Combine into a final DataFrame if needed
resampled_gene_df = pd.DataFrame({
    'scaled_mean_reads': list(X_resampled_gene),
    'gene_id': gene_id_list,
    'label': y_resampled_gene
})


In [64]:
# Proportion of positive labels in the resampled data
print('Proportion of positive labels in the resampled data:', np.mean(resampled_gene_df['label']))

# Proportion of positive labels by gene_id
prop_gene_id = resampled_gene_df.groupby('gene_id')['label'].mean()

# Number of zeros
num_zeros = len(prop_gene_id[prop_gene_id == 0])
print('Number of gene_id with no postive labels: ', num_zeros)
print('Proportion of gene_id with no positive labels: ', num_zeros / len(prop_gene_id))

Proportion of positive labels in the resampled data: 0.04520309457830705
Number of gene_id with no postive labels:  1885
Proportion of gene_id with no positive labels:  0.6118143459915611


MIDDLE_SEQUENCE (DRACH)

In [56]:
# Proportion of positive labels by middle_sequence
prop_drach = train_df.groupby('middle_sequence')['label'].mean()

# Number of zeros
num_zeros = len(prop_drach[prop_drach == 0])
print('Number of DRACH motifs with no postive labels: ', num_zeros)
print('Proportion of DRACH motifs with no positive labels: ', num_zeros / len(prop_drach))

Number of DRACH motifs with no postive labels:  0
Proportion of DRACH motifs with no positive labels:  0.0


In [57]:
# Number of positive labels by middle_sequence
train_df[train_df['label'] == 1].groupby('middle_sequence')['label'].count().sort_values()


middle_sequence
TAACA       1
TAACC       2
AAACC      10
AAACA      32
AGACC      51
TAACT      53
TGACA      59
TGACC      79
GAACC      95
AGACA     111
GAACA     136
AAACT     209
TGACT     355
GGACC     417
AGACT     429
GGACA     550
GAACT     653
GGACT    1146
Name: label, dtype: int64

Apply SMOTE to each group of DRACH motif

In [58]:
from imblearn.over_sampling import SMOTE

# Get the scaled mean reads and one-hot encoded middle sequence columns as dataframe
X_resampled_list = [] 
y_resampled_list = []
middle_sequence_list = []

for middle_seq, group in train_df.groupby('middle_sequence'):
    X = np.vstack(group['scaled_mean_reads'].values)
    y = group['label'].values
    
    if sum(y) < 10: # Skip if the number of positive labels is less than 10
        X_resampled_list.append(X)
        y_resampled_list.append(y)
        middle_sequence_list.extend([middle_seq] * len(y))
        continue

    # Apply SMOTE to the scaled mean reads and labels
    smote = SMOTE(random_state=4262)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Append resampled data and one-hot encoding to lists
    X_resampled_list.append(X_resampled)
    y_resampled_list.append(y_resampled)
    middle_sequence_list.extend([middle_seq] * len(y_resampled))

# Combine all resampled data into final arrays
X_resampled = np.vstack(X_resampled_list)
y_resampled = np.concatenate(y_resampled_list)

# Combine into a final DataFrame if needed
resampled_df = pd.DataFrame({
    'scaled_mean_reads': list(X_resampled),
    'middle_sequence': middle_sequence_list,
    'label': y_resampled
})

In [19]:
# Proportion of positive labels in the resampled data
print('Proportion of positive labels in the resampled data: ', np.mean(resampled_df['label']))

# Proportion of positive labels by middle_sequence
prop_drach_resampled = resampled_df.groupby('middle_sequence')['label'].mean()
prop_drach_resampled

Proportion of positive labels in the resampled data:  0.483202765619337


middle_sequence
AAACA    0.500000
AAACC    0.500000
AAACT    0.500000
AGACA    0.500000
AGACC    0.500000
AGACT    0.500000
GAACA    0.500000
GAACC    0.500000
GAACT    0.500000
GGACA    0.500000
GGACC    0.500000
GGACT    0.500000
TAACA    0.000271
TAACC    0.000856
TAACT    0.500000
TGACA    0.500000
TGACC    0.500000
TGACT    0.500000
Name: label, dtype: float64

One hot encode DRACH motifs

In [20]:
resampled_df = one_hot_encode_DRACH(resampled_df)
resampled_df.head(3)

DRACH Encoder saved to drach_encoder.pkl


Unnamed: 0,scaled_mean_reads,middle_sequence,label,middle_sequence_OHE
0,"[2.5249757619759317, -0.7835678231993425, -0.2...",AAACA,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[0.6928199968356539, -1.1676038905620205, -0.7...",AAACA,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[0.3732826597732115, -0.8372266235169966, -0.2...",AAACA,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Prepare for model input

In [21]:
# With SMOTE
X_resampled = prepare_for_model(resampled_df)
y_resampled = resampled_df['label'].values

# Without SMOTE
X_train = prepare_for_model(train_df)
y_train = train_df['label'].values

# Prepare test data
X_test = prepare_for_model(test_df)
y_test = test_df['label'].values

### NN model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import AUC

def build_model(input_shape):
    """
    Define and compile neural network model.
    """
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(150, activation='relu'),
        Dropout(0.2),  # Dropout layer for regularization
        Dense(32, activation='relu'),
        Dropout(0.2),  # Another dropout layer
        Dense(1, activation='sigmoid')
    ])
    # Set AUC with Precision-Recall (PR) curve
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(curve='PR', name='auc_pr')])
    return model

#### Train model with and without SMOTE

In [None]:
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import ModelCheckpoint

# Set up 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=4262)
best_val_auc_pr_smote = 0
best_val_auc_pr_no_smote = 0
best_model_path_smote = 'best_model_with_smote.keras'
best_model_path_no_smote = 'best_model_without_smote.keras'

# Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(kf.split(X_resampled)):
    print(f"Training fold {fold + 1} with SMOTE")

    # Split the data with SMOTE for this fold
    X_train_smote, X_val_smote = X_resampled[train_idx], X_resampled[val_idx]
    y_train_smote, y_val_smote = y_resampled[train_idx], y_resampled[val_idx]

    # Initialize and compile the model
    model_smote = build_model(X_train_smote.shape[1])

    # Set up the checkpoint to monitor AUC-PR and save the best model
    checkpoint_smote = ModelCheckpoint(
        best_model_path_smote,
        save_best_only=True,
        monitor='val_auc_pr',
        mode='max'
    )

    # Train the model on this fold with SMOTE data
    history_smote = model_smote.fit(
        X_train_smote, y_train_smote,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_smote, y_val_smote),
        callbacks=[checkpoint_smote]
    )

    # Track the best validation AUC-PR across folds with SMOTE
    fold_best_auc_pr_smote = max(history_smote.history['val_auc_pr'])
    if fold_best_auc_pr_smote > best_val_auc_pr_smote:
        best_val_auc_pr_smote = fold_best_auc_pr_smote

# Cross-validation without SMOTE
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1} without SMOTE")

    # Split the data without SMOTE for this fold
    X_train_no_smote, X_val_no_smote = X_train[train_idx], X_train[val_idx]
    y_train_no_smote, y_val_no_smote = y_train[train_idx], y_train[val_idx]

    # Initialize and compile the model
    model_no_smote = build_model(X_train_no_smote.shape[1])

    # Set up the checkpoint to monitor AUC-PR and save the best model
    checkpoint_no_smote = ModelCheckpoint(
        best_model_path_no_smote,
        save_best_only=True,
        monitor='val_auc_pr',
        mode='max'
    )

    # Train the model on this fold without SMOTE data
    history_no_smote = model_no_smote.fit(
        X_train_no_smote, y_train_no_smote,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_no_smote, y_val_no_smote),
        callbacks=[checkpoint_no_smote]
    )

    # Track the best validation AUC-PR across folds without SMOTE
    fold_best_auc_pr_no_smote = max(history_no_smote.history['val_auc_pr'])
    if fold_best_auc_pr_no_smote > best_val_auc_pr_no_smote:
        best_val_auc_pr_no_smote = fold_best_auc_pr_no_smote

Training fold 1 with SMOTE
Epoch 1/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - auc_pr: 0.7317 - loss: 0.5755 - val_auc_pr: 0.8537 - val_loss: 0.4561
Epoch 2/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - auc_pr: 0.8477 - loss: 0.4593 - val_auc_pr: 0.8810 - val_loss: 0.4146
Epoch 3/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - auc_pr: 0.8703 - loss: 0.4248 - val_auc_pr: 0.8953 - val_loss: 0.3867
Epoch 4/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - auc_pr: 0.8826 - loss: 0.4059 - val_auc_pr: 0.9083 - val_loss: 0.3652
Epoch 5/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - auc_pr: 0.8925 - loss: 0.3866 - val_auc_pr: 0.9169 - val_loss: 0.3488
Training fold 2 with SMOTE
Epoch 1/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - auc_pr: 0.7258 - loss: 0.5774 - val_auc_pr: 0.8540 - val

Get the metrics

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_recall_curve
from tensorflow.keras.models import load_model
import numpy as np

# Load the best model for both SMOTE and no-SMOTE versions
best_model_smote = load_model(best_model_path_smote)
best_model_no_smote = load_model(best_model_path_no_smote)

threshold = 0.9
# Generate predictions and probabilities for both models on the test data
y_pred_smote = (best_model_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_smote = best_model_smote.predict(X_test).flatten()  # Probabilities

y_pred_no_smote = (best_model_no_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_no_smote = best_model_no_smote.predict(X_test).flatten()  # Probabilities

# Calculate metrics for SMOTE model
roc_auc_smote = roc_auc_score(y_test, y_proba_smote)
pr_auc_smote = average_precision_score(y_test, y_proba_smote)  # AUC-PR
accuracy_smote = accuracy_score(y_test, y_pred_smote)
average_precision_smote = average_precision_score(y_test, y_proba_smote)

# Calculate metrics for No-SMOTE model
roc_auc_no_smote = roc_auc_score(y_test, y_proba_no_smote)
pr_auc_no_smote = average_precision_score(y_test, y_proba_no_smote)  # AUC-PR
accuracy_no_smote = accuracy_score(y_test, y_pred_no_smote)
average_precision_no_smote = average_precision_score(y_test, y_proba_no_smote)

# Print comparison results
print("Metrics Comparison on Test Data:")
print("\nWith SMOTE:")
print(f"AUC-ROC: {roc_auc_smote:.4f}")
print(f"AUC-PR: {pr_auc_smote:.4f}")
print(f"Accuracy: {accuracy_smote:.4f}")
print(f"Average Precision Score: {average_precision_smote:.4f}")

print("\nWithout SMOTE:")
print(f"AUC-ROC: {roc_auc_no_smote:.4f}")
print(f"AUC-PR: {pr_auc_no_smote:.4f}")
print(f"Accuracy: {accuracy_no_smote:.4f}")
print(f"Average Precision Score: {average_precision_no_smote:.4f}")

[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
Metrics Comparison on Test Data:

With SMOTE:
AUC-ROC: 0.8507
AUC-PR: 0.3312
Accuracy: 0.9564
Average Precision Score: 0.3312

Without SMOTE:
AUC-ROC: 0.8907
AUC-PR: 0.4660
Accuracy: 0.9561
Average Precision Score: 0.4660


### NN model Regularisers

In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import AUC
from tensorflow.keras.regularizers import l1, l2

def build_model(input_shape):
    """
    Define and compile neural network model.
    """
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(150, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.2),  # Dropout layer for regularization
        Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.2),  # Another dropout layer
        Dense(1, activation='sigmoid')
    ])
    # Set AUC with Precision-Recall (PR) curve
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(curve='PR', name='auc_pr')])
    return model


#### Train model with L2 regulariser

In [39]:
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import ModelCheckpoint

# Set up 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=4262)
best_val_auc_pr_smote = 0
best_val_auc_pr_no_smote = 0
best_model_path_smote_L2 = 'best_model_with_smote_L2.keras'
best_model_path_no_smote_L2 = 'best_model_without_smote_L2.keras'

# Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(kf.split(X_resampled)):
    print(f"Training fold {fold + 1} with SMOTE + L2")

    # Split the data with SMOTE for this fold
    X_train_smote, X_val_smote = X_resampled[train_idx], X_resampled[val_idx]
    y_train_smote, y_val_smote = y_resampled[train_idx], y_resampled[val_idx]

    # Initialize and compile the model
    model_smote = build_model(X_train_smote.shape[1])

    # Set up the checkpoint to monitor AUC-PR and save the best model
    checkpoint_smote = ModelCheckpoint(
        best_model_path_smote,
        save_best_only=True,
        monitor='val_auc_pr',
        mode='max'
    )

    # Train the model on this fold with SMOTE data
    history_smote = model_smote.fit(
        X_train_smote, y_train_smote,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_smote, y_val_smote),
        callbacks=[checkpoint_smote]
    )

    # Track the best validation AUC-PR across folds with SMOTE
    fold_best_auc_pr_smote = max(history_smote.history['val_auc_pr'])
    if fold_best_auc_pr_smote > best_val_auc_pr_smote:
        best_val_auc_pr_smote = fold_best_auc_pr_smote

# Cross-validation without SMOTE
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1} without SMOTE + L2")

    # Split the data without SMOTE for this fold
    X_train_no_smote, X_val_no_smote = X_train[train_idx], X_train[val_idx]
    y_train_no_smote, y_val_no_smote = y_train[train_idx], y_train[val_idx]

    # Initialize and compile the model
    model_no_smote = build_model(X_train_no_smote.shape[1])

    # Set up the checkpoint to monitor AUC-PR and save the best model
    checkpoint_no_smote = ModelCheckpoint(
        best_model_path_no_smote,
        save_best_only=True,
        monitor='val_auc_pr',
        mode='max'
    )

    # Train the model on this fold without SMOTE data
    history_no_smote = model_no_smote.fit(
        X_train_no_smote, y_train_no_smote,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_no_smote, y_val_no_smote),
        callbacks=[checkpoint_no_smote]
    )

    # Track the best validation AUC-PR across folds without SMOTE
    fold_best_auc_pr_no_smote = max(history_no_smote.history['val_auc_pr'])
    if fold_best_auc_pr_no_smote > best_val_auc_pr_no_smote:
        best_val_auc_pr_no_smote = fold_best_auc_pr_no_smote

Training fold 1 with SMOTE + L2
Epoch 1/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - auc_pr: 0.6786 - loss: 0.7453 - val_auc_pr: 0.7800 - val_loss: 0.5972
Epoch 2/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - auc_pr: 0.7715 - loss: 0.5990 - val_auc_pr: 0.7986 - val_loss: 0.5802
Epoch 3/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - auc_pr: 0.7904 - loss: 0.5812 - val_auc_pr: 0.8123 - val_loss: 0.5604
Epoch 4/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - auc_pr: 0.7994 - loss: 0.5677 - val_auc_pr: 0.8213 - val_loss: 0.5487
Epoch 5/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - auc_pr: 0.8062 - loss: 0.5596 - val_auc_pr: 0.8315 - val_loss: 0.5467
Training fold 2 with SMOTE + L2
Epoch 1/5
[1m4484/4484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - auc_pr: 0.6853 - loss: 0.7513 - val_auc_pr: 0.7814

Get the metrics

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_recall_curve
from tensorflow.keras.models import load_model
import numpy as np

# Load the best model for both SMOTE and no-SMOTE versions
best_model_smote = load_model("best_model_with_smote.keras")
best_model_no_smote = load_model("best_model_without_smote.keras")

threshold = 0.9
# Generate predictions and probabilities for both models on the test data
y_pred_smote = (best_model_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_smote = best_model_smote.predict(X_test).flatten()  # Probabilities

y_pred_no_smote = (best_model_no_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_no_smote = best_model_no_smote.predict(X_test).flatten()  # Probabilities

# Calculate metrics for SMOTE model
roc_auc_smote = roc_auc_score(y_test, y_proba_smote)
pr_auc_smote = average_precision_score(y_test, y_proba_smote)  # AUC-PR
accuracy_smote = accuracy_score(y_test, y_pred_smote)
average_precision_smote = average_precision_score(y_test, y_proba_smote)

# Calculate metrics for No-SMOTE model
roc_auc_no_smote = roc_auc_score(y_test, y_proba_no_smote)
pr_auc_no_smote = average_precision_score(y_test, y_proba_no_smote)  # AUC-PR
accuracy_no_smote = accuracy_score(y_test, y_pred_no_smote)
average_precision_no_smote = average_precision_score(y_test, y_proba_no_smote)

# Print comparison results
print("Metrics Comparison on Test Data:")
print("\nWith SMOTE + L2:")
print(f"AUC-ROC: {roc_auc_smote:.4f}")
print(f"AUC-PR: {pr_auc_smote:.4f}")
print(f"Accuracy: {accuracy_smote:.4f}")
print(f"Average Precision Score: {average_precision_smote:.4f}")

print("\nWithout SMOTE + L2:")
print(f"AUC-ROC: {roc_auc_no_smote:.4f}")
print(f"AUC-PR: {pr_auc_no_smote:.4f}")
print(f"Accuracy: {accuracy_no_smote:.4f}")
print(f"Average Precision Score: {average_precision_no_smote:.4f}")

[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Metrics Comparison on Test Data:

With SMOTE + Focal Loss:
AUC-ROC: 0.8080
AUC-PR: 0.2868
Accuracy: 0.9566
Average Precision Score: 0.2868

Without SMOTE + Focal Loss:
AUC-ROC: 0.8543
AUC-PR: 0.3406
Accuracy: 0.9561
Average Precision Score: 0.3406
