In [1]:
import pandas as pd
import optuna

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    f1_score
)

from imblearn.over_sampling import SMOTE

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Load the dataset
df = pd.read_csv("datasets/3. online_shoppers_intention_clean.csv")

# 2. Column selection (currently disabled: every column of the CSV is kept)
#df = df[[
#    "Administrative", "Informational", "ProductRelated",
#    "BounceRates", "ExitRates", "PageValues", "Month",
#    "TrafficType", "VisitorType", "Revenue"
#]]

# 3. "Revenue" is the target; all remaining columns are input features
y = df["Revenue"]
X = df.drop(columns=["Revenue"])

# 4. Hold out 20% of the rows as the test set, stratified on the target
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 5. Take the validation set out of the remaining 80%.
#    test_size=0.25 of train_val -> 0.25 x 0.8 = 0.2 of all rows,
#    so the overall split is 60% train / 20% validation / 20% test.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=42,
)

# 6. Define numerical and categorical columns
# Numerical columns: these are the ones passed to the MinMaxScaler.
numerical_features = [
    # page-visit counts and the time spent on each page type
    "Administrative", "Administrative_Duration",
    "Informational", "Informational_Duration",
    "ProductRelated", "ProductRelated_Duration",
    # remaining numerical columns
    "BounceRates", "ExitRates", "PageValues",
    "SpecialDay",
]

# Categorical columns: these are the ones passed to the OneHotEncoder.
categorical_features = [
    "Month",
    "OperatingSystems", "Browser", "Region",
    "TrafficType", "VisitorType",
    "Weekend",
]

# 7. Preprocess numerical features
# The scaler is fitted on the training split only, so no statistics from the
# validation or test rows are used to determine the scaling.
min_max_scaler = MinMaxScaler().fit(X_train[numerical_features])

# Scale each split with the train-fitted scaler and wrap the resulting array
# in a DataFrame that keeps the column names and the split's original index.
X_train_scaled, X_val_scaled, X_test_scaled = (
    pd.DataFrame(
        min_max_scaler.transform(split[numerical_features]),
        columns=numerical_features,
        index=split.index,
    )
    for split in (X_train, X_val, X_test)
)

# 8. Preprocess categorical features
# sparse_output=False makes transform() return a dense NumPy array;
# handle_unknown='ignore' avoids an error when a split contains a category
# that was not present in the training data.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the training split only
one_hot_encoder.fit(X_train[categorical_features])

# Column names produced by the one-hot encoding
encoded_feature_names = one_hot_encoder.get_feature_names_out(categorical_features)

# Encode each split and wrap the result in a DataFrame that keeps the
# generated column names and the split's original index.
X_train_encoded, X_val_encoded, X_test_encoded = (
    pd.DataFrame(
        one_hot_encoder.transform(split[categorical_features]),
        columns=encoded_feature_names,
        index=split.index,
    )
    for split in (X_train, X_val, X_test)
)

# 9. Combine processed features
# Column-wise join (axis=1) of the scaled numerical frame and the one-hot
# encoded frame of each split; both frames were built with the split's index.
X_train_processed, X_val_processed, X_test_processed = (
    pd.concat([scaled, encoded], axis=1)
    for scaled, encoded in (
        (X_train_scaled, X_train_encoded),
        (X_val_scaled, X_val_encoded),
        (X_test_scaled, X_test_encoded),
    )
)

# Verify the shapes
print("Processed Training Data Shape:", X_train_processed.shape)
print("Processed Validation Data Shape:", X_val_processed.shape)
print("Processed Test Data Shape:", X_test_processed.shape)

# 10. Apply SMOTE to the training data (disabled in this run: the code below is commented out)
# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

# Verify the resampled data
# print("Resampled Training Data Shape:", X_train_resampled.shape)
# print("Resampled Training Target Distribution:\n", y_train_resampled.value_counts())

# 11. Reset indices
# Replace the row labels inherited from the original DataFrame with a fresh
# 0..n-1 index on every split (features and targets alike).
X_train_processed = X_train_processed.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

X_val_processed = X_val_processed.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

X_test_processed = X_test_processed.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# 12. Final verification
print("Final Training Data Shape:", X_train_processed.shape)
print("Final Validation Data Shape:", X_val_processed.shape)
print("Final Test Data Shape:", X_test_processed.shape)

Processed Training Data Shape: (7334, 74)
Processed Validation Data Shape: (2445, 74)
Processed Test Data Shape: (2445, 74)
Final Training Data Shape: (7334, 74)
Final Validation Data Shape: (2445, 74)
Final Test Data Shape: (2445, 74)


# WARNING: Only run the cell below if you want to re-run the hyperparameter optimisation (1000 Optuna trials); it may take hours to days to complete.

In [None]:
# 13. Define the objective function for Optuna
def objective(trial):
    """Train an SVC with trial-suggested hyperparameters and score it.

    Parameters
    ----------
    trial
        Optuna trial object; used to sample 'kernel', 'C' and, for
        non-linear kernels only, 'gamma'.

    Returns
    -------
    Accuracy of the fitted model on the validation set
    (X_val_processed / y_val). The study maximises this value.
    """
    # Sampling order is kept as kernel -> C -> gamma.
    kernel_name = trial.suggest_categorical('kernel', ['rbf', 'linear', 'sigmoid'])
    penalty = trial.suggest_float('C', 1e-5, 1e5, log=True)

    # gamma is only sampled for non-linear kernels; linear trials get the
    # fixed value 'scale' and therefore record no 'gamma' parameter.
    gamma_value = (
        'scale'
        if kernel_name == 'linear'
        else trial.suggest_float('gamma', 1e-5, 1e5, log=True)
    )

    model = SVC(
        C=penalty,
        kernel=kernel_name,
        gamma=gamma_value,
        random_state=42,
    )

    # Fit on the training split, score on the validation split
    model.fit(X_train_processed, y_train)
    return accuracy_score(y_val, model.predict(X_val_processed))

# 14. Set up the study
# The study is stored in a SQLite .db file so that a record of all trials is
# saved on disk; load_if_exists=True reuses a stored study with this name.
study_name = "svm_test_nosmote_nofeatsel"
storage_name = "sqlite:///optuna_study_svm_test_nosmote_nofeatsel.db"

study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

# 15. Optimise the objective function with 1000 trials (no time limit)
study.optimize(objective, n_trials=1000, timeout=None)

# 16. Retrieve and display the best trial
best_trial = study.best_trial

print("Best Trial:")
print(f"  Value (Accuracy): {best_trial.value}")
print("  Params:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-09-23 20:12:56,319] A new study created in RDB with name: svm_test_nosmote_nofeatsel
[I 2024-09-23 20:13:00,884] Trial 0 finished with value: 0.8822085889570552 and parameters: {'kernel': 'linear', 'C': 9.695826644515218}. Best is trial 0 with value: 0.8822085889570552.
[I 2024-09-23 20:13:08,554] Trial 1 finished with value: 0.8085889570552147 and parameters: {'kernel': 'rbf', 'C': 4589.458612326462, 'gamma': 10.260065124896771}. Best is trial 0 with value: 0.8822085889570552.
[I 2024-09-23 20:13:14,000] Trial 2 finished with value: 0.8797546012269939 and parameters: {'kernel': 'sigmoid', 'C': 2110.6995036049584, 'gamma': 0.001328590390054419}. Best is trial 0 with value: 0.8822085889570552.
[I 2024-09-23 20:13:16,113] Trial 3 finished with value: 0.7464212678936605 and parameters: {'kernel': 'sigmoid', 'C': 1.7683340776662542, 'gamma': 0.20866527711063718}. Best is trial 0 with value: 0.8822085889570552.
[I 2024-09-23 20:13:17

Best Trial:
  Value (Accuracy): 0.8846625766871166
  Params:
    kernel: linear
    C: 94129.038884143


# To skip optimisation, continue running the cells from here on: they load the saved Optuna study and train with the best hyperparameters found.

In [3]:
# 17. Train the final model on combined training and validation data with best hyperparameters
# This cell only re-opens the stored Optuna study and reads its best trial.
# The study name and storage URL are repeated here so the cell does not
# depend on the optimisation cell having been run in this session.
study_name = "svm_test_nosmote_nofeatsel"
storage_name = "sqlite:///optuna_study_svm_test_nosmote_nofeatsel.db"

study = optuna.load_study(
    storage=storage_name,
    study_name=study_name,
)

# Best trial of the study and its hyperparameters
best_trial = study.best_trial
best_params = best_trial.params

print("\nBest Hyperparameters from Optuna Study:")
for key, value in best_params.items():
    print(f"  {key}: {value}")



Best Hyperparameters from Optuna Study:
  kernel: linear
  C: 94129.038884143


In [4]:
# Stack the training and validation rows into a single fitting set
X_combined = pd.concat([X_train_processed, X_val_processed], axis=0)
y_combined = pd.concat([y_train, y_val], axis=0)

# SMOTE on the combined data (disabled: the line below is commented out)
# X_combined_resampled, y_combined_resampled = smote.fit_resample(X_combined, y_combined)

# Read the winning hyperparameters from the best trial
best_params = best_trial.params
kernel = best_params['kernel']
C = best_params['C']
# 'gamma' is only looked up for non-linear kernels; the objective does not
# sample it for linear trials, so those use 'scale'.
gamma = 'scale' if kernel == 'linear' else best_params['gamma']

# Build the final SVC with those hyperparameters and fit it on train + validation
best_svc = SVC(
    kernel=kernel,
    C=C,
    gamma=gamma,
    random_state=42,
).fit(X_combined, y_combined)

# Predict on the data the model was fitted on (train + validation) and on
# the held-out test set
y_train_pred = best_svc.predict(X_combined)
y_test_pred = best_svc.predict(X_test_processed)

# Accuracy and F1 score on the fitting data
training_accuracy = accuracy_score(y_combined, y_train_pred)
training_f1 = f1_score(y_combined, y_train_pred)

# Accuracy and F1 score on the test data
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Report all four metrics, rounded to two decimals
print(f"Training Accuracy: {training_accuracy:.2f}")
print(f"Training F1 Score: {training_f1:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

Training Accuracy: 0.88
Training F1 Score: 0.51
Test Accuracy: 0.89
Test F1 Score: 0.51


In [5]:
# 18. Check inference speed on test set
# Times best_svc.predict over the whole processed test set.
# -r 10 -n 100: the statement is timed over 10 runs of 100 loops each, and
# the reported figure is the mean ± std. dev. per loop.
%timeit -r 10 -n 100 y_test_pred = best_svc.predict(X_test_processed)

74 ms ± 1.99 ms per loop (mean ± std. dev. of 10 runs, 100 loops each)
