# The data

## Loading the data

In [492]:
import pandas as pd

# Read the raw coupon-recommendation data set; the path is relative to the
# directory the notebook is run from.
data = pd.read_csv("data/in-vehicle-coupon-recommendation.csv")


## Pre-processing

As a very first step we are going to drop 2 of the features from our dataset, one of them being the feature car, because our dataset includes values of this feature for only 109 records, and the second one will be direction_opp, because it's the complete opposite of direction_same and together they would be redundant. 

In [493]:
# 'car' is almost entirely missing and 'direction_opp' mirrors 'direction_same'.
data = data.drop(columns=['car', 'direction_opp'])

### Feature Engineering

One of our features has mixed values. That is the feature age and the possible values are the following: "21, 46, 26, 31, 41, 50plus, 36, below21". For that reason we will try to engineer new columns based on this one, we will try 2 options: converting it to numeric values and categorical and we'll see which one does better for our models. 

In [494]:

def convert_age_categorical(value):
    """Map a raw ``age`` value to an ordered age-bracket label.

    Parameters
    ----------
    value : str, int, float or None
        Raw value of the ``age`` column, e.g. ``"26"``, ``"below21"``,
        ``"50plus"``. Text labels are matched case-insensitively and
        surrounding whitespace is ignored.

    Returns
    -------
    str
        One of ``"<21"``, ``"21-30"``, ``"31-40"``, ``"41-50"``, ``"51+"``,
        or ``"Unknown"`` when the value cannot be interpreted (missing
        values, NaN, unrecognised text).
    """
    if isinstance(value, str):
        value = value.strip()
        # The two non-numeric labels used by the data set.
        special = {"below21": "<21", "50plus": "51+"}.get(value.lower())
        if special is not None:
            return special

    try:
        age = int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "unknown"; anything else (e.g. a
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return "Unknown"

    if age < 21:
        return "<21"
    if age <= 30:
        return "21-30"
    if age <= 40:
        return "31-40"
    if age <= 50:
        return "41-50"
    return "51+"
        

def convert_age_numeric(value):
    """Convert a raw ``age`` value to a number.

    Parameters
    ----------
    value : str, int, float or None
        Raw value of the ``age`` column. The open-ended labels are mapped to
        representative ages: ``"below21"`` -> 20 and ``"50plus"`` -> 55
        (matched case-insensitively, surrounding whitespace ignored).

    Returns
    -------
    int or None
        The numeric age, or ``None`` when the value is missing or cannot be
        interpreted (previously a missing value raised ``AttributeError``).
    """
    if isinstance(value, str):
        special = {"below21": 20, "50plus": 55}.get(value.strip().lower())
        if special is not None:
            return special

    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and unrecognised text all mean "no age".
        return None


# Derive both age encodings from the raw 'age' column in one step.
data = data.assign(
    age_numeric=data['age'].apply(convert_age_numeric),
    age_group=data['age'].apply(convert_age_categorical),
)


We will try similar engineering with the income feature, which has the following values: "$37500 - $49999, $62500 - $74999, $12500 - $24999, $75000 - $87499, $50000 - $62499, $25000 - $37499, $100000 or More, $87500 - $99999, Less than $12500". 
As we can see, there is obviously an ordered numeric meaning behind it, so we will again try 2 ways: categorical and numeric.

We are going to drop former age and income columns after engineering new ones.

In [495]:
def convert_income_numeric(value):
    """Convert an ``income`` bracket label to a representative dollar amount.

    Closed brackets map to the midpoint of their bounds; the open-ended
    brackets use fixed representative values (6250 for "Less than $12500",
    110000 for "$100000 or More").

    Parameters
    ----------
    value : str or None
        Raw value of the ``income`` column; surrounding whitespace is ignored.

    Returns
    -------
    int, float or None
        The representative amount, or ``None`` for missing, non-string or
        unrecognised input (previously non-string input raised
        ``AttributeError``).
    """
    if not isinstance(value, str):
        return None

    label = value.strip()

    # Brackets without a lower or upper bound get a hand-picked value.
    open_ended = {
        "Less than $12500": 6250,
        "$100000 or More": 110000,
    }
    if label in open_ended:
        return open_ended[label]

    bounds = {
        "$12500 - $24999": (12500, 24999),
        "$25000 - $37499": (25000, 37499),
        "$37500 - $49999": (37500, 49999),
        "$50000 - $62499": (50000, 62499),
        "$62500 - $74999": (62500, 74999),
        "$75000 - $87499": (75000, 87499),
        "$87500 - $99999": (87500, 99999),
    }.get(label)
    if bounds is None:
        return None

    low, high = bounds
    return (low + high) / 2
    

def convert_income_categorical(value):
    """Shorten an ``income`` bracket label to a compact category name.

    Parameters
    ----------
    value : str or None
        Raw value of the ``income`` column; surrounding whitespace is ignored.

    Returns
    -------
    str or None
        The compact label (e.g. ``"12.5k-25k"``), or ``None`` for missing,
        non-string or unrecognised input (previously non-string input raised
        ``AttributeError``).
    """
    if not isinstance(value, str):
        return None

    mapping = {
        "Less than $12500": "Under 12.5k",
        "$12500 - $24999": "12.5k-25k",
        "$25000 - $37499": "25k-37k",
        "$37500 - $49999": "37k-49k",
        "$50000 - $62499": "50k-62k",
        "$62500 - $74999": "62k-74k",
        "$75000 - $87499": "75k-87k",
        "$87500 - $99999": "87k-99k",
        "$100000 or More": "100k+"
    }
    return mapping.get(value.strip(), None)

# Add both income encodings, then discard the raw columns they (and the age
# features) were derived from.
data = (
    data
    .assign(
        income_numeric=data['income'].apply(convert_income_numeric),
        income_group=data['income'].apply(convert_income_categorical),
    )
    .drop(columns=['age', 'income'])
)

After doing these steps on the whole dataset, which is ok because these are just rule based modifications so there's no risk of data leakage, we will split the dataset into train and test sets and do our further modifications in a pipeline to avoid data leakage.

In [496]:
from sklearn.model_selection import train_test_split

seed = 7

# Separate the target column 'Y' from the predictors.
y = data['Y']
X = data.drop(columns='Y')

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=seed,
)



### Cyclical encoding

For the time feature, we will use cyclical encoding because it has a circular structure.

In [497]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer

def custom_cyclical_encoder_func(X):
    """Replace the ``time`` column with its sine/cosine encoding.

    ``time`` holds 12-hour clock labels such as "2PM". The hour of day is
    projected onto the unit circle (period 24) and stored as ``hour_sin`` and
    ``hour_cos``; ``time`` and the intermediate ``hour`` column are removed.
    The input frame is not modified.
    """
    def _angle(frame):
        # Hour of day expressed as an angle on a 24-hour circle.
        return 2 * np.pi * frame['hour'] / 24

    encoded = X.assign(
        hour=lambda frame: pd.to_datetime(frame['time'], format='%I%p').dt.hour,
        hour_sin=lambda frame: np.sin(_angle(frame)),
        hour_cos=lambda frame: np.cos(_angle(frame)),
    )
    return encoded.drop(columns=['time', 'hour'])

# Wrap the encoding function so it can be used as a scikit-learn pipeline step.
custom_cyclical_encoder = FunctionTransformer(custom_cyclical_encoder_func)

# Apply it to the training split only; the result (with hour_sin / hour_cos
# instead of time) is used by the skew, correlation and bias checks below.
X_train_transformed = custom_cyclical_encoder.transform(X_train)



### Bias, Skew, Correlation

In [None]:
# Skewness of the continuous features, computed on the training split only.
skew_features = ['temperature', 'age_numeric', 'income_numeric']
print("Skew")
print(X_train_transformed[skew_features].skew())

# Pairwise correlation between the numeric features (pandas default method).
corr_features = ['temperature', 'income_numeric', 'age_numeric', 'hour_sin', 'hour_cos']
corr_matrix = X_train_transformed[corr_features].corr()
print("Correlation")
print(corr_matrix)


Modifications on features based on skew and correlation. 

We have a moderate negative (left) skew for temperature and a moderate positive (right) skew for age_numeric, so we will use the Box-Cox transformer, which automatically handles both left and right skewness of the data. It works only for strictly positive values, which is the case for both of our features. 

And based on the correlation matrix our features look good, no strong correlation between variables so we can keep all of them for now.


We will also use scaling on our numeric features.

In [499]:
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Features whose distribution is skewed and will be Box-Cox transformed.
skewed_features = ['temperature', 'age_numeric']

# Only income_numeric needs a separate scaler: the two skewed features are
# already standardised by the Box-Cox step (standardize=True below).
numeric_features_to_scale = ['income_numeric']

box_cox_transformer = PowerTransformer(method='box-cox', standardize=True)
scaler = StandardScaler()


# Preliminary version: the final preprocessor, which also handles the
# categorical features, is built inside get_fetures_ready_for_training().
# Columns not listed here are passed through unchanged.
preprocessor = ColumnTransformer(transformers = [
    ('box_cox_transform', box_cox_transformer, skewed_features),
    ('scale_only', scaler, numeric_features_to_scale),
], remainder='passthrough')


Now let's look at the bias.

In [None]:
# Bias: share (in %) of every category, per categorical feature, on the training split
bias_features = [
    'gender', 'maritalStatus', 'occupation', 'passanger', 'destination',
    'coupon', 'weather', 'education', 'income_group', 'age_group',
    'has_children', 'toCoupon_GEQ5min', 'toCoupon_GEQ15min',
    'toCoupon_GEQ25min', 'direction_same', 'expiration', 'Bar',
    'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50',
]

print("Bias")
for col in bias_features:
    shares = X_train_transformed[col].value_counts(normalize=True).mul(100).round(2)
    print(f"\n--- Value Distribution for '{col}' ---")
    print(shares.rename("percentage").to_frame())



One of the features that is imbalanced is marital status. So what we'll do is join some of the groups into one for the less represented groups to have more statistical effect and to reduce the noise in the data, this will help to avoid overfitting in our models.

We can say the same about education and occupation and a few other features. So we will do the same re-grouping for them. In case of occupation, though we also have high cardinality, so it will help with this issue as well. 

We can also see that 2 of our features, weather and direction_same, have very high bias as well. We decided to keep these variables as they are, because in our opinion they may hold important information and we don't want to lose it at an early stage. 

Lastly, we will drop toCoupon_GEQ25min, because 88% of its values are 0, and toCoupon_GEQ5min, because all of its values are the same. These features would be redundant in our analysis, and we still have toCoupon_GEQ15min, which is balanced and can give us an idea of whether the driving distance affects the decision or not.

In [501]:
def custom_balancing_func(X):
    """Drop uninformative distance flags and merge rare categories.

    Removes ``toCoupon_GEQ25min`` and ``toCoupon_GEQ5min`` and, for every
    column listed below that is present in ``X``, replaces the listed raw
    categories with their merged group label. Values that are not listed are
    left as they are. The input frame is not modified.
    """
    # column -> {merged label: [raw categories folded into it]}
    merged_groups = {
        'maritalStatus': {
            "Previously Married": ["Divorced", "Widowed"],
        },
        'education': {
            "High School or Less": ["Some High School", "High School Graduate"],
            "Some College": ["Associates degree", "Some college - no degree"],
        },
        'occupation': {
            "Professional, Scientific & Technical": [
                "Architecture & Engineering",
                "Computer & Mathematical",
                "Legal",
            ],
            "Healthcare": [
                "Healthcare Support",
                "Healthcare Practitioners & Technical",
            ],
            "Management & Business": ["Management", "Business & Financial"],
            "Admin/Sales": ["Sales & Related", "Office & Administrative Support"],
            "Education": ["Education&Training&Library"],
            "Arts/Media": ["Arts Design Entertainment Sports & Media"],
            "Social Work and Service": [
                "Life Physical Social Science",
                "Community & Social Services",
                "Personal Care & Service",
                "Food Preparation & Serving Related",
                "Protective Service",
                "Building & Grounds Cleaning & Maintenance",
            ],
            "Transportation/Manual": [
                "Construction & Extraction",
                "Installation Maintenance & Repair",
                "Transportation & Material Moving",
                "Production Occupations",
                "Farming Fishing & Forestry",
            ],
            "Student/Unemployed": ["Unemployed", "Student", "Retired"],
        },
        'passanger': {
            "Kid(s) or Partner": ["Kid(s)", "Partner"],
        },
        'CarryAway': {"less1": ["never"]},
        'RestaurantLessThan20': {"less1": ["never"]},
        'Bar': {"4+": ["4~8", "gt8"]},
        'Restaurant20To50': {"4+": ["4~8", "gt8"]},
    }

    # Invert each table into the {raw category: merged label} form that
    # Series.replace expects.
    replace_maps = {
        col: {raw: label for label, raws in groups.items() for raw in raws}
        for col, groups in merged_groups.items()
    }

    trimmed = X.drop(columns=['toCoupon_GEQ25min', 'toCoupon_GEQ5min'])
    recoded = {
        col: trimmed[col].replace(mapping)
        for col, mapping in replace_maps.items()
        if col in trimmed.columns
    }
    return trimmed.assign(**recoded)


# Wrap the regrouping function so it can be used as a scikit-learn pipeline step.
custom_balancing_function = FunctionTransformer(custom_balancing_func)


### Encoding ( One-hot and Ordinal )

For categorical data like destination, passanger, weather, coupon, gender, marital status, occupation we will use one-hot encoding, which is good for nominal data, when there is no order/ranking between the categories. 

In [502]:
from sklearn.preprocessing import OneHotEncoder



# Nominal (unordered) categorical features that get one column per category.
nominal_cat_features = ['destination', 'passanger', 'weather', 'coupon', 'gender', 'maritalStatus', 'occupation']

# handle_unknown='ignore' keeps transform() from failing on a category that
# was not present when the encoder was fitted.
one_hot_encoder = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


For the rest of the categorical data we will use a different method of encoding because these are considered ordinal data, where categories have order/ranking. Some of these features also have missing values, so we will replace them with the most frequent value of the category.

We also give the order of the categories for the processor to know the correct order and give correct importance.


In [503]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Ordered categorical features. The position of each name must match the
# position of its category list in ordinal_categories below.
ordinal_cat_features = ['expiration', 'education', 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50', 'age_group', 'income_group']

# Category order (lowest to highest) per feature. The labels are the ones left
# after custom_balancing_func has regrouped the data (e.g. '4+' for Bar, and
# no 'never' level for CarryAway / RestaurantLessThan20).
ordinal_categories = [
    ['2h', '1d'],   # expiration
    ['High School or Less', 'Some College', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'], # education
    ['never', 'less1', '1~3', '4+'],    # Bar
    ['never', 'less1', '1~3', '4~8', 'gt8'],    # CoffeeHouse
    ['less1', '1~3', '4~8', 'gt8'],    # CarryAway
    ['less1', '1~3', '4~8', 'gt8'],    # RestaurantLessThan20
    ['never', 'less1', '1~3', '4+'],     # Restaurant20To50
    ['<21', '21-30', '31-40', '41-50', '51+'], # age_group
    ['Under 12.5k', '12.5k-25k', '25k-37k', '37k-49k','50k-62k','62k-74k','75k-87k','87k-99k','100k+'] # income_group
]

# Missing values are filled with the most frequent category before encoding.
ordinal_encoder = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories))
])


### Building the pipeline

In [504]:
def get_fetures_ready_for_training(
    csv_path="data/in-vehicle-coupon-recommendation.csv",
    test_size=0.2,
    seed=7,
):
    """Load the raw data and return fully pre-processed train/test features.

    The function repeats the rule-based feature engineering (age / income
    encodings), splits the data, fits the whole preprocessing pipeline on the
    training split only and applies it to both splits.

    Parameters
    ----------
    csv_path : str, optional
        Location of the raw CSV file.
    test_size : float, optional
        Fraction of rows held out as the test split.
    seed : int, optional
        Random state used for the train/test split.

    Returns
    -------
    X_train_transformed, X_test_transformed : pandas.DataFrame
        Pre-processed features. They keep the row index of the original
        split, so they stay label-aligned with ``y_train`` / ``y_test``.
    y_train, y_test : pandas.Series
        Target values (column ``Y``).
    column_names : array of str
        Output feature names reported by the column transformer.

    Notes
    -----
    Relies on the notebook-level ``ordinal_categories`` list and on the
    converter / transformer functions defined in earlier cells.
    """
    # Rule-based feature engineering (no fitting involved, safe before the split).
    data = pd.read_csv(csv_path)
    data = data.drop(['car', 'direction_opp'], axis=1)
    data['age_numeric'] = data['age'].apply(convert_age_numeric)
    data['age_group'] = data['age'].apply(convert_age_categorical)
    data['income_numeric'] = data['income'].apply(convert_income_numeric)
    data['income_group'] = data['income'].apply(convert_income_categorical)
    data = data.drop(['age', 'income'], axis=1)

    X = data.drop('Y', axis=1)
    y = data['Y']

    # Split for final training
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )

    # transformers
    custom_cyclical_encoder = FunctionTransformer(custom_cyclical_encoder_func)
    custom_balancing_function = FunctionTransformer(custom_balancing_func)
    box_cox_transformer = PowerTransformer(method='box-cox', standardize=True)
    scaler = StandardScaler()
    one_hot_encoder = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    ordinal_encoder = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(categories=ordinal_categories))
    ])

    # columns
    skewed_features = ['temperature', 'age_numeric']
    numeric_features_to_scale = ['income_numeric']
    nominal_cat_features = ['destination', 'passanger', 'weather', 'coupon', 'gender', 'maritalStatus', 'occupation']
    ordinal_cat_features = ['expiration', 'education', 'Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50', 'age_group', 'income_group']

    # processors
    column_preprocessor = ColumnTransformer(transformers = [
        ('box_cox_transform', box_cox_transformer, skewed_features),
        ('num_scale', scaler, numeric_features_to_scale),
        ('nom', one_hot_encoder, nominal_cat_features),
        ('ord', ordinal_encoder, ordinal_cat_features)
    ], remainder='passthrough')

    preprocessor = Pipeline(steps=[
        ('cyclical', custom_cyclical_encoder),
        ('balancing', custom_balancing_function),
        ('column_transforms', column_preprocessor)
    ])

    # Fit on the training split only so nothing leaks from the test split.
    preprocessor.fit(X_train)
    column_names = preprocessor.named_steps['column_transforms'].get_feature_names_out()

    # Keep the original row labels: without them the frames would get a fresh
    # 0..n-1 index and no longer align with y_train / y_test by label.
    X_train_transformed = pd.DataFrame(
        preprocessor.transform(X_train), columns=column_names, index=X_train.index
    )
    X_test_transformed = pd.DataFrame(
        preprocessor.transform(X_test), columns=column_names, index=X_test.index
    )

    return X_train_transformed, X_test_transformed, y_train, y_test, column_names

# The models

## Logistic regression

First model that we are going to try is Logistic regression.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



# Load and transform the data
X_train_transformed, X_test_transformed, y_train, y_test, column_names = get_fetures_ready_for_training()


# Init Logistic Regression model
log_reg = LogisticRegression(
    penalty='l2',           # regularization, shrinks coefficients evenly without eliminating some features entirely
    solver='liblinear',     # good for small to medium datasets (which ours is)
    random_state=42,        # common choice, makes sure our results are reproducible across different runs
    max_iter=1000           # default is 100, increased to make sure the model has enough room to train, especially for our 12k records
)

# Train the model
log_reg.fit(X_train_transformed, y_train)

# Predict on test set
y_pred = log_reg.predict(X_test_transformed)

# Evaluate performance (note: computed on the held-out test split, although
# the printed label says "Validation")
accuracy = accuracy_score(y_test, y_pred)
print(f"\nLogistic Regression Validation Accuracy: {accuracy:.4f}")

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# finding the best parameters for the model with grid search

from sklearn.model_selection import GridSearchCV

# Search space: regularisation strength and penalty type; liblinear supports
# both the l1 and l2 penalties.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# 5-fold cross-validation on the training split only, scored by accuracy.
grid = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42),
                    param_grid, cv=5, scoring='accuracy')
grid.fit(X_train_transformed, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

In [None]:
# trying to make the performance of the model better by generating polynomial features
from sklearn.preprocessing import PolynomialFeatures


# Pairwise interaction terms only (no squared terms, no bias column); fitted
# on the training features and re-applied to the test features.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train_transformed)
X_test_poly = poly.transform(X_test_transformed)



# Initialize Logistic Regression model
log_reg = LogisticRegression(
    C=0.1,                  # controls the strength of regularisation, best suggested option by grid search
    penalty='l2',
    solver='liblinear',
    random_state=42,
    max_iter=1000
)

# Train the model
log_reg.fit(X_train_poly, y_train)

# Predict on test set
y_pred = log_reg.predict(X_test_poly)

# Accuracy on the held-out test split (printed as "Validation")
accuracy = accuracy_score(y_test, y_pred)
print(f"\nLogistic Regression Validation Accuracy: {accuracy:.4f}")

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Both the polynomial features and the suggested C=0.1 parameter increase the model performance, so we will keep this version as the final one for Logistic regression.

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report



# Load and transform the data
X_train_transformed, X_test_transformed, y_train, y_test, column_names = get_fetures_ready_for_training()


# Initialize Random Forest model (100 trees, fixed seed for reproducibility)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

# Train the model
rf_model.fit(X_train_transformed, y_train)


# Predict on test set
y_pred = rf_model.predict(X_test_transformed)


# Accuracy on the held-out test split (printed as "Validation")
accuracy = accuracy_score(y_test, y_pred)
print(f"\nRandom Forest Validation Accuracy: {accuracy:.4f}")

# Confusion matrix
# NOTE: confusion_matrix is not imported in this cell; it comes from the
# import in the Logistic regression cell above.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Tuning

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

# Parameter grid (3 * 4 * 3 * 3 * 2 * 1 = 216 possible combinations)
param_dist = {
    'n_estimators': [100, 300, 500],        # number of trees
    'max_depth': [5, 10, 15, 20],           # max depth of each tree
    'min_samples_split': [2, 5, 10],        # min number of samples required to split the node 
    'min_samples_leaf': [1, 2, 5],          # min number of samples at a leaf node
    'max_features': ['sqrt', 'log2'],       # number of features to consider when looking for the best split
    'class_weight': ['balanced']            # adjusts the weights the way that the classes are balanced
}

# Init the model
rf = RandomForestClassifier(random_state=42)

# Cross-validation strategy: 5 shuffled folds that preserve the class ratio
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Init RandomizedSearchCV; candidates are ranked by F1 score on the CV folds
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,  # number of random combinations to try
    scoring='f1',
    cv=cv,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the model (training split only)
random_search.fit(X_train_transformed, y_train)

# Best parameters and best CV score
print("Best Parameters:", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)

# Best estimator
best_rf = random_search.best_estimator_


# Predict on test set
y_pred = best_rf.predict(X_test_transformed)


# Accuracy on the held-out test split (printed as "Validation")
accuracy = accuracy_score(y_test, y_pred)
print(f"\nRandom Forest Validation Accuracy: {accuracy:.4f}")

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Training the model with the best suggested parameters.
# NOTE(review): these values are presumably copied by hand from
# random_search.best_params_ of an earlier run - confirm they still match.

best_rf_tuned = RandomForestClassifier(
    n_estimators=300,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    max_depth=20,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Train the model
best_rf_tuned.fit(X_train_transformed, y_train)


# Predict on test set
y_pred = best_rf_tuned.predict(X_test_transformed)

# Accuracy on the held-out test split (printed as "Validation")
accuracy = accuracy_score(y_test, y_pred)
print(f"\nRandom Forest Validation Accuracy: {accuracy:.4f}")

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


We also tried to 1. remove the least important features, 2. select the top k features with SelectKBest, 3. remove features with RFE (recursive feature elimination). None of these improved the model performance, so this last model is kept as the final choice.

## FNN

In [None]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



# Load and transform the data
X_train_transformed, X_test_transformed, y_train, y_test, column_names = get_fetures_ready_for_training()


# Init the FNN model: one hidden layer of 64 ReLU units and a sigmoid output
# unit for the binary target
model_fnn = Sequential([
    Input(shape=(X_train_transformed.shape[1],)),

    Dense(64, activation='relu'),

    Dense(1, activation='sigmoid')
])


# Set parameters, compile
model_fnn.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy']
)


# Train the model. The test split is passed as validation_data, so the
# per-epoch val_* metrics in the history are test-set metrics.
history_FNN = model_fnn.fit(
    X_train_transformed,
    y_train,
    epochs=150,
    validation_data=(X_test_transformed, y_test)
)


# Predict on test set (get probabilities)
y_pred_probs = model_fnn.predict(X_test_transformed)

# Convert the probabilities to labels (0 or 1)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()


# Accuracy on the held-out test split (printed as "Validation")
accuracy = accuracy_score(y_test, y_pred)
print(f"\nFNN Validation Accuracy: {accuracy:.4f}")

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import RMSprop, SGD


# Load and transform the data
X_train_transformed, X_test_transformed, y_train, y_test, column_names = get_fetures_ready_for_training()

# Carve a validation split out of the TRAINING data for the search. Selecting
# hyperparameters (and early-stopping) on the test split would leak test
# information into the model choice and make the final test score optimistic.
X_tune_train, X_tune_val, y_tune_train, y_tune_val = train_test_split(
    X_train_transformed, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)


# Build the model
def build_model(hp):
    """Build and compile one candidate network for the tuner.

    Searched hyperparameters: number of hidden layers (1-5); per layer the
    number of units, L2 rate and dropout rate; one activation shared by all
    hidden layers; the learning rate; and the optimizer.
    """
    model = Sequential()
    model.add(Input(shape=(X_train_transformed.shape[1],)))

    for i in range(hp.Int("num_hidden_layers", 1, 5)):
        model.add(Dense(
            units=hp.Int(f"units_{i}", min_value=32, max_value=512, step=32),
            activation=hp.Choice("activation", ['relu', 'tanh', 'elu']),
            kernel_regularizer=tf.keras.regularizers.l2(
                hp.Float(f"l2_rate_{i}", 1e-5, 1e-2, sampling='log')
            )
        ))
        model.add(Dropout(hp.Float(f"dropout_{i}", 0.0, 0.6, step=0.1)))

    model.add(Dense(1, activation="sigmoid"))

    learning_rate = hp.Float("learning_rate", 1e-5, 1e-2, sampling="log")
    optimizer_choice = hp.Choice("optimizer", ["adam", "rmsprop", "sgd"])

    if optimizer_choice == "adam":
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_choice == "rmsprop":
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate, momentum=0.9)

    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    return model

# NOTE(review): KerasTuner resumes from results already stored under
# fnn_tuning/vehicle_coupon unless overwrite=True is passed - clear that
# directory before re-running the search, otherwise old trials are reused.
tuner = kt.Hyperband(
    build_model,
    objective="val_accuracy",
    max_epochs=50,
    factor=3,
    directory="fnn_tuning",
    project_name="vehicle_coupon"
)

# EarlyStopping, to stop if no improvement found for 5 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Start the search (validated on the split carved out of the training data)
tuner.search(
    X_tune_train, y_tune_train,
    epochs=50,
    validation_data=(X_tune_val, y_tune_val),
    callbacks=[early_stopping]
)

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

In [None]:
# Get best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# # Print all hyperparameter values
# for param, value in best_hps.values.items():
#     print(f"{param}: {value}")


# Build a fresh model with the best hyperparameters
best_model_fnn = tuner.hypermodel.build(best_hps)

# Train it. The test split is only monitored here (no early stopping or
# model selection depends on it).
history_fnn_best = best_model_fnn.fit(X_train_transformed, y_train, validation_data=(X_test_transformed, y_test), epochs=150)


# Predict on test data (get probabilities) with the model that was just
# trained. The previous code predicted with `best_model` (the tuner's
# checkpoint), so the reported metrics did not belong to `best_model_fnn`.
y_pred_probs = best_model_fnn.predict(X_test_transformed)

# Convert the probabilities to labels (0 or 1)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()


# Accuracy on the held-out test split (printed as "Validation")
accuracy = accuracy_score(y_test, y_pred)
print(f"\nFNN Validation Accuracy: {accuracy:.4f}")

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
