### Helper Functions For Preprocessing Data

In [1]:
from helper_functions import *

### Preprocessing

Load Data

In [2]:
labels = pd.read_csv('../data/data.info.labelled')
labels.head(3)

Unnamed: 0,gene_id,transcript_id,transcript_position,label
0,ENSG00000004059,ENST00000000233,244,0
1,ENSG00000004059,ENST00000000233,261,0
2,ENSG00000004059,ENST00000000233,316,0


In [3]:
df = load_data_to_dataframe('../data/dataset0.json.gz')
df.head(3)

Unnamed: 0,transcript_id,position,sequence,reads
0,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0..."
1,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0...."
2,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0..."


Assign labels to the data

In [None]:
df = combine_data(df, labels)
df.head(3)

Train test split by gene_id

In [13]:
# Train test split by gene id
from sklearn.model_selection import train_test_split
train_gene_ids, test_gene_ids = train_test_split(df['gene_id'].unique(), test_size=0.2, random_state=4262)

train_df = df[df['gene_id'].isin(train_gene_ids)].copy()
test_df = df[df['gene_id'].isin(test_gene_ids)].copy()

display(train_df.head(3), train_df.shape)
display(test_df.head(3), test_df.shape)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label
18,ENSG00000003056,ENST00000000412,355,GAAACTA,"[[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0...",0
19,ENSG00000003056,ENST00000000412,367,GGGACCG,"[[0.00232, 1.32, 117.0, 0.0073, 7.89, 120.0, 0...",0
20,ENSG00000003056,ENST00000000412,496,AGGACTG,"[[0.00398, 2.46, 111.0, 0.016, 3.36, 125.0, 0....",0


(97073, 6)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0


(24765, 6)

Extract mean reads for test data and training data where label == 0, else flatten.

In [22]:
lab1_df = train_df[train_df['label'] == 1].copy()
lab1_df.head(3)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label
52,ENSG00000003056,ENST00000000412,2440,TGGACTG,"[[0.00531, 4.27, 119.0, 0.00764, 9.57, 114.0, ...",1
53,ENSG00000003056,ENST00000000412,2462,TGAACCC,"[[0.00777, 6.39, 123.0, 0.0154, 2.67, 96.8, 0....",1
55,ENSG00000003056,ENST00000000412,2499,TGGACAC,"[[0.00736, 3.78, 119.0, 0.00564, 15.3, 107.0, ...",1


In [None]:
def flatten_reads(row):
    gene_id = row['gene_id']
    transcript_id = row['transcript_id']
    position = row['position']
    sequence = row['sequence']
    reads = row['reads']
    label = row['label']
    return pd.DataFrame({'gene_id': [gene_id] * len(reads),
                         'transcript_id': [transcript_id] * len(reads),
                         'position': [position] * len(reads),
                         'sequence': [sequence] * len(reads),
                         'reads': reads,
                         'mean_reads': reads, 
                         'label': [label] * len(reads)})

# Test on first row
flatten_reads(lab1_df.iloc[0]).head(5)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,mean_reads,label
0,ENSG00000003056,ENST00000000412,2440,TGGACTG,"[0.00531, 4.27, 119.0, 0.00764, 9.57, 114.0, 0...","[0.00531, 4.27, 119.0, 0.00764, 9.57, 114.0, 0...",1
1,ENSG00000003056,ENST00000000412,2440,TGGACTG,"[0.00598, 2.26, 115.0, 0.00299, 1.37, 119.0, 0...","[0.00598, 2.26, 115.0, 0.00299, 1.37, 119.0, 0...",1
2,ENSG00000003056,ENST00000000412,2440,TGGACTG,"[0.00564, 2.8, 112.0, 0.00697, 4.08, 118.0, 0....","[0.00564, 2.8, 112.0, 0.00697, 4.08, 118.0, 0....",1
3,ENSG00000003056,ENST00000000412,2440,TGGACTG,"[0.0093, 2.02, 116.0, 0.011, 5.8, 116.0, 0.007...","[0.0093, 2.02, 116.0, 0.011, 5.8, 116.0, 0.007...",1
4,ENSG00000003056,ENST00000000412,2440,TGGACTG,"[0.00166, 5.25, 116.0, 0.0183, 3.1, 121.0, 0.0...","[0.00166, 5.25, 116.0, 0.0183, 3.1, 121.0, 0.0...",1


In [35]:
# flatten lab1_df
lab1_df = lab1_df.apply(flatten_reads, axis=1)
lab1_df = pd.concat(lab1_df.values, ignore_index=True)
print(lab1_df.shape)
lab1_df.head(3)

(405306, 7)


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,mean_reads,label
0,ENSG00000003056,ENST00000000412,2440,TGGACTG,"[0.00531, 4.27, 119.0, 0.00764, 9.57, 114.0, 0...","[0.00531, 4.27, 119.0, 0.00764, 9.57, 114.0, 0...",1
1,ENSG00000003056,ENST00000000412,2440,TGGACTG,"[0.00598, 2.26, 115.0, 0.00299, 1.37, 119.0, 0...","[0.00598, 2.26, 115.0, 0.00299, 1.37, 119.0, 0...",1
2,ENSG00000003056,ENST00000000412,2440,TGGACTG,"[0.00564, 2.8, 112.0, 0.00697, 4.08, 118.0, 0....","[0.00564, 2.8, 112.0, 0.00697, 4.08, 118.0, 0....",1


In [73]:
lab0_df = train_df[train_df['label'] == 0].copy()
# Extract mean reads
lab0_df = extract_mean_reads(lab0_df)

# Flatten lab0_df
# lab0_df = lab0_df.apply(flatten_reads, axis=1)
# lab0_df = pd.concat(lab0_df.values, ignore_index=True)
print(lab0_df.shape)
lab0_df.head(3)

(8290665, 7)


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,mean_reads,label
0,ENSG00000003056,ENST00000000412,355,GAAACTA,"[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0....",35.772351,0
1,ENSG00000003056,ENST00000000412,355,GAAACTA,"[0.00365, 1.25, 109.0, 0.00626, 3.18, 107.0, 0...",35.221712,0
2,ENSG00000003056,ENST00000000412,355,GAAACTA,"[0.0091, 1.93, 110.0, 0.011, 2.52, 109.0, 0.00...",35.694862,0


In [None]:
# Combine lab0 and lab1
train_df = pd.concat([lab0_df, lab1_df], ignore_index=True)
train_df.head(3)

In [75]:
test_df = extract_mean_reads(test_df)
test_df.head(3)

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0,"[0.008264378378378385, 4.223783783783786, 123...."
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0,"[0.006609244186046515, 3.2164244186046504, 109..."
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0,"[0.0075699999999999995, 2.94054054054054, 105...."


Scale mean reads

In [76]:
# Scale mean reads of train data first, then use the fitted scaler to scale test data
scaler, scaled_train_df = scale_mean_reads(train_df)
scaler, scaled_test_df = scale_mean_reads(test_df, scaler=scaler)

display(scaled_train_df.head(3))
display(scaled_test_df.head(3))

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1 and the array at index 8290665 has size 9

Extract middle sequence

In [None]:
scaled_train_df_w_middle_seq = extract_middle_sequence(scaled_train_df)
scaled_test_df_w_middle_seq = extract_middle_sequence(scaled_test_df)

display(scaled_train_df_w_middle_seq.head(3))
display(scaled_test_df_w_middle_seq.head(3))

Unnamed: 0,gene_id,transcript_id,position,sequence,reads,mean_reads,label,scaled_mean_reads,middle_sequence
0,ENSG00000003056,ENST00000000412,355,GAAACTA,"[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0....","[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0....",0,"[-1.0693612703704736, -0.7541725813545678, -0....",AAACT
1,ENSG00000003056,ENST00000000412,355,GAAACTA,"[0.00365, 1.25, 109.0, 0.00626, 3.18, 107.0, 0...","[0.00365, 1.25, 109.0, 0.00626, 3.18, 107.0, 0...",0,"[-0.823526526149904, -1.2046370087100522, -0.1...",AAACT
2,ENSG00000003056,ENST00000000412,355,GAAACTA,"[0.0091, 1.93, 110.0, 0.011, 2.52, 109.0, 0.00...","[0.0091, 1.93, 110.0, 0.011, 2.52, 109.0, 0.00...",0,"[0.18384141069378343, -0.9405716547430442, -0....",AAACT


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads,middle_sequence
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0,"[0.008264378378378385, 4.223783783783786, 123....","[0.0293866530873719, -0.04982338005385481, 1.0...",AGACC
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0,"[0.006609244186046515, 3.2164244186046504, 109...","[-0.27654529489956364, -0.441012655550927, -0....",AAACT
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0,"[0.0075699999999999995, 2.94054054054054, 105....","[-0.0989609642366464, -0.5481470289550303, -0....",AAACA


One hot encode middle sequence (DRACH motif)

In [None]:
# Encode train data first, then use the fitted encoder to transform test data
encoder, ohe_train_df = one_hot_encode_DRACH(scaled_train_df_w_middle_seq)
encoder, ohe_test_df = one_hot_encode_DRACH(scaled_test_df_w_middle_seq, encoder=encoder)

display(ohe_train_df.head(3))
display(ohe_test_df.head(3))

DRACH Encoder saved to ../artifacts/drach_encoder.pkl


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,mean_reads,label,scaled_mean_reads,middle_sequence,middle_sequence_OHE
0,ENSG00000003056,ENST00000000412,355,GAAACTA,"[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0....","[0.00232, 2.41, 109.0, 0.0222, 2.85, 111.0, 0....",0,"[-1.0693612703704736, -0.7541725813545678, -0....",AAACT,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,ENSG00000003056,ENST00000000412,355,GAAACTA,"[0.00365, 1.25, 109.0, 0.00626, 3.18, 107.0, 0...","[0.00365, 1.25, 109.0, 0.00626, 3.18, 107.0, 0...",0,"[-0.823526526149904, -1.2046370087100522, -0.1...",AAACT,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,ENSG00000003056,ENST00000000412,355,GAAACTA,"[0.0091, 1.93, 110.0, 0.011, 2.52, 109.0, 0.00...","[0.0091, 1.93, 110.0, 0.011, 2.52, 109.0, 0.00...",0,"[0.18384141069378343, -0.9405716547430442, -0....",AAACT,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Unnamed: 0,gene_id,transcript_id,position,sequence,reads,label,mean_reads,scaled_mean_reads,middle_sequence,middle_sequence_OHE
0,ENSG00000004059,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",0,"[0.008264378378378385, 4.223783783783786, 123....","[0.0293866530873719, -0.04982338005385481, 1.0...",AGACC,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
1,ENSG00000004059,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",0,"[0.006609244186046515, 3.2164244186046504, 109...","[-0.27654529489956364, -0.441012655550927, -0....",AAACT,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,ENSG00000004059,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",0,"[0.0075699999999999995, 2.94054054054054, 105....","[-0.0989609642366464, -0.5481470289550303, -0....",AAACA,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### SMOTE

Check label imbalance (FOR SMOTE)

In [None]:
# Proportion of positive labels in the training set
np.mean(ohe_train_df['label'])

0.04660848110004047

#### By GENE_ID

In [None]:
# Proportion of positive labels by gene_id
prop_gene = ohe_train_df.groupby('gene_id')['label'].mean()

# Number of zeros
num_zeros = len(prop_gene[prop_gene == 0])
print('Number of gene_id with no postive labels: ', num_zeros)
print('Proportion of gene_id with no positive labels: ', num_zeros / len(prop_gene))

Number of gene_id with no postive labels:  1885
Proportion of gene_id with no positive labels:  0.6118143459915611


In [None]:
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# Get the scaled mean reads and one-hot encoded middle sequence columns as dataframe
X_resampled_list = [] 
y_resampled_list = []
gene_id_list = []

for g_id, group in ohe_train_df.groupby('gene_id'):
    X = np.vstack(group['scaled_mean_reads'].values)
    y = group['label'].values
    
    if sum(y) < 10 or sum(y - 1) < 10:  # Skip resampling if the number of positive or negative labels is less than 10
        X_resampled_list.append(X)
        y_resampled_list.append(y)
        gene_id_list.extend([g_id] * len(y))  # Use g_id for the gene_id
        continue

    # Apply SMOTE to the scaled mean reads and labels
    smote = SMOTE(random_state=4262)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Append resampled data to lists
    X_resampled_list.append(X_resampled)
    y_resampled_list.append(y_resampled)
    gene_id_list.extend([g_id] * len(y_resampled))  # Use g_id for the gene_id

# Combine all resampled data into final arrays
X_resampled_gene = np.vstack(X_resampled_list)
y_resampled_gene = np.concatenate(y_resampled_list)

# Combine into a final DataFrame if needed
resampled_gene_df = pd.DataFrame({
    'scaled_mean_reads': list(X_resampled_gene),
    'gene_id': gene_id_list,
    'label': y_resampled_gene
})

In [None]:
# Proportion of positive labels in the resampled data
print('Proportion of positive labels in the resampled data:', np.mean(resampled_gene_df['label']))

# Proportion of positive labels by gene_id
prop_gene = resampled_gene_df.groupby('gene_id')['label'].mean()

# Number of zeros
num_zeros = len(prop_gene[prop_gene == 0])
print('Number of gene_id with no postive labels: ', num_zeros)
print('Proportion of gene_id with no positive labels: ', num_zeros / len(prop_gene))

Proportion of positive labels in the resampled data: 0.04660848110004047
Number of gene_id with no postive labels:  1885
Proportion of gene_id with no positive labels:  0.6118143459915611


#### By MIDDLE_SEQUENCE (DRACH)

In [None]:
# Proportion of positive labels by middle_sequence
prop_drach = ohe_train_df.groupby('middle_sequence')['label'].mean()

# Number of zeros
num_zeros = len(prop_drach[prop_drach == 0])
print('Number of DRACH motifs with no postive labels: ', num_zeros)
print('Proportion of DRACH motifs with no positive labels: ', num_zeros / len(prop_drach))

Number of DRACH motifs with no postive labels:  0
Proportion of DRACH motifs with no positive labels:  0.0


In [None]:
# Number of positive labels by middle_sequence
ohe_train_df[ohe_train_df['label'] == 1].groupby('middle_sequence')['label'].count().sort_values()

middle_sequence
TAACA       288
TAACC       499
AAACC       807
AGACC      4747
TAACT      5229
AAACA      5758
TGACA      8191
AGACA      9055
GAACC     11106
TGACC     11196
GAACA     11884
AAACT     19086
GGACC     32979
TGACT     34091
GGACA     41956
AGACT     42712
GAACT     54691
GGACT    111031
Name: label, dtype: int64

In [None]:
# Proportion of positive labels by middle_sequence
prop_drach.sort_values()

middle_sequence
TAACA    0.000906
AAACC    0.001746
TAACC    0.002274
AAACA    0.008102
AGACC    0.009139
TGACA    0.014103
TAACT    0.017617
AGACA    0.017665
TGACC    0.020061
GAACA    0.020153
GAACC    0.026848
AAACT    0.030940
GGACC    0.064225
TGACT    0.067482
GGACA    0.081284
AGACT    0.096119
GAACT    0.117386
GGACT    0.245299
Name: label, dtype: float64

Apply SMOTE to each group of DRACH motif

In [None]:
from imblearn.over_sampling import SMOTE

# Get the scaled mean reads and one-hot encoded middle sequence columns as dataframe
X_resampled_list = [] 
y_resampled_list = []
middle_sequence_list = []

for middle_seq, group in ohe_train_df.groupby('middle_sequence'):
    X = np.vstack(group['scaled_mean_reads'].values)
    y = group['label'].values
    
    if sum(y) < 10: # Skip if the number of positive labels is less than 10
        X_resampled_list.append(X)
        y_resampled_list.append(y)
        middle_sequence_list.extend([middle_seq] * len(y))
        continue

    # Apply SMOTE to the scaled mean reads and labels
    smote = SMOTE(random_state=4262)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Append resampled data and one-hot encoding to lists
    X_resampled_list.append(X_resampled)
    y_resampled_list.append(y_resampled)
    middle_sequence_list.extend([middle_seq] * len(y_resampled))

# Combine all resampled data into final arrays
X_resampled = np.vstack(X_resampled_list)
y_resampled = np.concatenate(y_resampled_list)

# Combine into a final DataFrame if needed
resampled_drach_df = pd.DataFrame({
    'scaled_mean_reads': list(X_resampled),
    'middle_sequence': middle_sequence_list,
    'label': y_resampled
})

In [77]:
# Proportion of positive labels in the resampled data
print('Proportion of positive labels in the resampled data: ', np.mean(resampled_drach_df['label']))

# Proportion of positive labels by middle_sequence
prop_drach_resampled = resampled_drach_df.groupby('middle_sequence')['label'].mean()
prop_drach_resampled

Proportion of positive labels in the resampled data:  0.5


middle_sequence
AAACA    0.5
AAACC    0.5
AAACT    0.5
AGACA    0.5
AGACC    0.5
AGACT    0.5
GAACA    0.5
GAACC    0.5
GAACT    0.5
GGACA    0.5
GGACC    0.5
GGACT    0.5
TAACA    0.5
TAACC    0.5
TAACT    0.5
TGACA    0.5
TGACC    0.5
TGACT    0.5
Name: label, dtype: float64

One hot encode DRACH motifs

In [78]:
encoder, resampled_drach_df = one_hot_encode_DRACH(resampled_drach_df, encoder)
resampled_drach_df.head(3)

MemoryError: Unable to allocate 2.22 GiB for an array with shape (16581330, 18) and data type float64

### Prepare data for model input

In [79]:
# SMOTE by middle_sequence
X_resampled_drach = prepare_for_model(resampled_drach_df)
y_resampled_drach = resampled_drach_df['label'].values

# Without SMOTE
X_train = prepare_for_model(ohe_train_df)
y_train = ohe_train_df['label'].values

# Prepare test data
X_test = prepare_for_model(ohe_test_df)
y_test = ohe_test_df['label'].values

MemoryError: 

In [69]:
# print the shapes
print('X_resampled_drach shape:', X_resampled_drach.shape)
print('y_resampled_drach shape:', y_resampled_drach.shape)
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

X_resampled_drach shape: (16581330, 27)
y_resampled_drach shape: (16581330,)
X_train shape: (8695971, 27)
y_train shape: (8695971,)
X_test shape: (24765, 27)
y_test shape: (24765,)


### NN model

In [70]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import AUC

def build_model(input_shape):
    """
    Define and compile neural network model.
    """
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(150, activation='relu'),
        Dropout(0.2),  # Dropout layer for regularization
        Dense(32, activation='relu'),
        Dropout(0.2),  # Another dropout layer
        Dense(1, activation='sigmoid')
    ])
    # Keep track of AUC-PR and AUC-ROC during training
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[
            AUC(curve='PR', name='auc_pr'),  # AUC-PR
            AUC(curve='ROC', name='auc_roc')  # AUC-ROC
        ]
    )
    return model

#### Train model with and without SMOTE

In [None]:
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import ModelCheckpoint

# Set up 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=4262)
best_val_auc_pr_smote = 0
best_val_auc_pr_no_smote = 0
BEST_MODEL_PATH_SMOTE = "../artifacts/best_model_with_smote.keras"
BEST_MODEL_PATH_NO_SMOTE = "../artifacts/best_model_without_smote.keras"

# Set checkpoint to save the best model (based on validation AUC-PR)
checkpoint_smote = ModelCheckpoint(
        BEST_MODEL_PATH_SMOTE,
        save_best_only=True,
        monitor='val_auc_pr',
        mode='max'
    )

checkpoint_no_smote = ModelCheckpoint(
        BEST_MODEL_PATH_NO_SMOTE,
        save_best_only=True,
        monitor='val_auc_pr',
        mode='max'
    )

SMOTE model

In [72]:
# Cross-validation with SMOTE
for fold, (train_idx, val_idx) in enumerate(kf.split(X_resampled_drach)):
    print(f"Training fold {fold + 1} with SMOTE")

    # Split the data with SMOTE for this fold
    X_train_smote, X_val_smote = X_resampled_drach[train_idx].copy(), X_resampled_drach[val_idx].copy()
    y_train_smote, y_val_smote = y_resampled_drach[train_idx].copy(), y_resampled_drach[val_idx].copy()

    # Initialize and compile the model
    model_smote = build_model(X_train_smote.shape[1])

    # Train the model on this fold with SMOTE data
    history_smote = model_smote.fit(
        X_train_smote, y_train_smote,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_smote, y_val_smote),
        callbacks=[checkpoint_smote]
    )

    # Track the best validation AUC-PR across folds with SMOTE
    fold_best_auc_pr_smote = max(history_smote.history['val_auc_pr'])
    if fold_best_auc_pr_smote > best_val_auc_pr_smote:
        best_val_auc_pr_smote = fold_best_auc_pr_smote

Training fold 1 with SMOTE


MemoryError: Unable to allocate 2.67 GiB for an array with shape (13265064, 27) and data type float64

No SMOTE Model

In [None]:
# Cross-validation without SMOTE
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Training fold {fold + 1} without SMOTE")

    # Split the data without SMOTE for this fold
    X_train_no_smote, X_val_no_smote = X_train[train_idx].copy(), X_train[val_idx].copy()
    y_train_no_smote, y_val_no_smote = y_train[train_idx].copy(), y_train[val_idx].copy()

    # Initialize and compile the model
    model_no_smote = build_model(X_train_no_smote.shape[1])

    # Train the model on this fold without SMOTE data
    history_no_smote = model_no_smote.fit(
        X_train_no_smote, y_train_no_smote,
        epochs=5,
        batch_size=32,
        validation_data=(X_val_no_smote, y_val_no_smote),
        callbacks=[checkpoint_no_smote]
    )

    # Track the best validation AUC-PR across folds without SMOTE
    fold_best_auc_pr_no_smote = max(history_no_smote.history['val_auc_pr'])
    if fold_best_auc_pr_no_smote > best_val_auc_pr_no_smote:
        best_val_auc_pr_no_smote = fold_best_auc_pr_no_smote

Training fold 1 without SMOTE
Epoch 1/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 839us/step - auc_pr: 0.1642 - auc_roc: 0.7672 - loss: 0.1731 - val_auc_pr: 0.3650 - val_auc_roc: 0.8621 - val_loss: 0.1364
Epoch 2/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 722us/step - auc_pr: 0.3477 - auc_roc: 0.8534 - loss: 0.1344 - val_auc_pr: 0.4009 - val_auc_roc: 0.8751 - val_loss: 0.1281
Epoch 3/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 732us/step - auc_pr: 0.3889 - auc_roc: 0.8633 - loss: 0.1291 - val_auc_pr: 0.4176 - val_auc_roc: 0.8788 - val_loss: 0.1287
Epoch 4/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 783us/step - auc_pr: 0.4021 - auc_roc: 0.8667 - loss: 0.1330 - val_auc_pr: 0.4190 - val_auc_roc: 0.8826 - val_loss: 0.1249
Epoch 5/5
[1m2427/2427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 810us/step - auc_pr: 0.4155 - auc_roc: 0.8730 - loss: 0.1289 - val_auc_pr: 0.4249 - val_

Get the metrics

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_curve, auc, average_precision_score
from tensorflow.keras.models import load_model

# Load the best model for both SMOTE and no-SMOTE versions
best_model_smote = load_model(BEST_MODEL_PATH_SMOTE)
best_model_no_smote = load_model(BEST_MODEL_PATH_NO_SMOTE)

# Generate predictions and probabilities for both models on the test data
threshold = 0.9 # Threshold for binary predictions

y_pred_smote = (best_model_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_smote = best_model_smote.predict(X_test).flatten()  # Probabilities

y_pred_no_smote = (best_model_no_smote.predict(X_test) > threshold).astype("int32").flatten()  # Binary predictions
y_proba_no_smote = best_model_no_smote.predict(X_test).flatten()  # Probabilities

# Calculate metrics for SMOTE model
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_smote)
roc_auc_smote = roc_auc_score(y_test, y_proba_smote)
pr_auc_smote = auc(recall, precision)
accuracy_smote = accuracy_score(y_test, y_pred_smote)
average_precision_smote = average_precision_score(y_test, y_proba_smote)

# Calculate metrics for No-SMOTE model
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_no_smote)
roc_auc_no_smote = roc_auc_score(y_test, y_proba_no_smote)
pr_auc_no_smote = auc(recall, precision)
accuracy_no_smote = accuracy_score(y_test, y_pred_no_smote)
average_precision_no_smote = average_precision_score(y_test, y_proba_no_smote)

# Print comparison results
print("Metrics Comparison on Test Data:")
print("\nWith SMOTE:")
print(f"AUC-ROC: {roc_auc_smote:.4f}")
print(f"AUC-PR: {pr_auc_smote:.4f}")
print(f"Accuracy: {accuracy_smote:.4f}")
print(f"Average Precision Score: {average_precision_smote:.4f}")

print("\nWithout SMOTE:")
print(f"AUC-ROC: {roc_auc_no_smote:.4f}")
print(f"AUC-PR: {pr_auc_no_smote:.4f}")
print(f"Accuracy: {accuracy_no_smote:.4f}")
print(f"Average Precision Score: {average_precision_no_smote:.4f}")

[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 350us/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 334us/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356us/step
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336us/step
Metrics Comparison on Test Data:

With SMOTE:
AUC-ROC: 0.8528
AUC-PR: 0.3488
Accuracy: 0.9580
Average Precision Score: 0.3493

Without SMOTE:
AUC-ROC: 0.8949
AUC-PR: 0.4714
Accuracy: 0.9561
Average Precision Score: 0.4718


### Improving Model 

SMOTE seems to be overfitting the data: try regularisers