In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd

In [None]:
# Load the training data and discard rows that have no target label.
air_canada = pd.read_csv('./new_branded_data.csv')
air_canada = air_canada.dropna(subset=['choice'])

# Build the feature matrix: remove identifier, timestamp and target columns,
# then drop every row that still contains a missing feature value.
X = air_canada.drop(['id', 'Unnamed: 0','flight_departure_datetime', 'purchase_datetime', 'ticket_id', 'choice'], axis=1)
X = X.dropna()

# Take the target only for the rows that survived the feature dropna so that
# X and y always have the same length and the same index (otherwise
# train_test_split raises on inconsistent sample counts).
y = air_canada.loc[X.index, 'choice']

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: first hold out 20% of the rows, then cut that hold-out in
# half, giving train (80%), validation (10%) and test (10%).
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns are standardized.
num_pipeline = make_pipeline(
  StandardScaler()
)

# Categorical columns are one-hot encoded. handle_unknown='ignore' makes a
# category that was not present when fitting encode as an all-zero row instead
# of raising when validation/test/prediction data is transformed.
cat_pipeline = make_pipeline(
  OneHotEncoder(handle_unknown='ignore')
)

# Create the preprocessing transformer.
# sparse_threshold=0 forces a dense ndarray result, which is what the later
# tf.convert_to_tensor calls need (the one-hot step produces sparse output).
preprocessing = ColumnTransformer([
    ('cat', cat_pipeline, ['od']),  # Only the 'od' column is categorical
    ('num', num_pipeline, make_column_selector(dtype_exclude='object'))  # Select all non-object columns
], remainder='drop', sparse_threshold=0)

In [None]:
# Fit the preprocessing on the training split only, then reuse the fitted
# transformer for validation and test. Refitting on the validation split would
# leak its statistics, could change the one-hot column layout, and would make
# the test split be scaled with validation parameters.
X_train_prep = preprocessing.fit_transform(X_train)
X_val_prep = preprocessing.transform(X_val)
X_test_prep = preprocessing.transform(X_test)

In [None]:
# One-hot encode the y (choice) column
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)

# Fit on the training targets only and reuse that fit for the other splits so
# every split shares the same column-to-class mapping (refitting per split can
# change the column order or count when a class is absent from a split).
y_train_prep = encoder.fit_transform(pd.DataFrame(y_train))
y_val_prep = encoder.transform(pd.DataFrame(y_val))
y_test_prep = encoder.transform(pd.DataFrame(y_test))

In [None]:
# Turn every prepared feature/target array into a float32 TensorFlow tensor.
# The generator is unpacked in the same order as the tuple of inputs.
(
    X_train_tensor,
    y_train_tensor,
    X_val_tensor,
    y_val_tensor,
    X_test_tensor,
    y_test_tensor,
) = (
    tf.convert_to_tensor(prepared, dtype=tf.float32)
    for prepared in (
        X_train_prep,
        y_train_prep,
        X_val_prep,
        y_val_prep,
        X_test_prep,
        y_test_prep,
    )
)

In [None]:
# Feed-forward classifier: two ReLU hidden layers and a softmax output.
# The input and output widths are read from the prepared training arrays
# instead of being hard-coded, so the network keeps matching the data if the
# number of one-hot columns or target classes changes.
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(shape=(X_train_prep.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(y_train_prep.shape[1], activation='softmax')
])

# Compile the model (categorical cross-entropy expects one-hot targets)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
def create_model(optimizer='adam', activation='relu', n_features=24, n_classes=3):
  """Build and compile the feed-forward classifier used for the grid search.

  Args:
    optimizer: Keras optimizer (name or instance) passed to ``compile``.
    activation: Activation used by both hidden layers.
    n_features: Number of input features per row. Defaults to 24, the same
      input width as the stand-alone ``model`` defined in this notebook.
    n_classes: Number of softmax output units. Defaults to 3.

  Returns:
    A compiled ``tf.keras.Sequential`` model with categorical cross-entropy
    loss and an accuracy metric.
  """
  model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(shape=(n_features,)),  # One input per preprocessed feature
    tf.keras.layers.Dense(64, activation=activation),
    tf.keras.layers.Dense(32, activation=activation),
    tf.keras.layers.Dense(n_classes, activation='softmax')  # One output unit per class
  ])

  # Compile the model
  model.compile(optimizer=optimizer,
                loss='categorical_crossentropy',
                metrics=['accuracy'])

  return model

In [None]:
!pip install scikeras

In [None]:
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier

# Wrap the Keras model factory so scikit-learn can clone and fit it.
keras_classifier = KerasClassifier(model=create_model, verbose=0)

# Define the hyperparameter grid for neural network
param_grid = {
    'model__optimizer': ['adam', 'sgd'],          # Optimizers to try
    'model__activation': ['relu', 'tanh'],        # Activation functions for the hidden layers
    'batch_size': [10, 20],                       # Batch sizes
    'epochs': [10, 20],                           # Number of epochs
}

# GridSearchCV for KerasClassifier
# (the variable is called rnd_search, but this is an exhaustive grid search)
rnd_search = GridSearchCV(estimator=keras_classifier,
                          param_grid=param_grid,
                          scoring='accuracy',
                          cv=3,
                          verbose=3,
                          n_jobs=-1)

# Fit the model
rnd_search.fit(X_train_prep, y_train_prep)

# Print the best parameters and score
print(f"Best: {rnd_search.best_score_} using {rnd_search.best_params_}")

# Evaluate the best model on the test set.
# Score on the preprocessed features and one-hot targets: the estimator was
# fit on X_train_prep / y_train_prep, so the raw X_test / y_test frames do not
# match what it expects.
best_model = rnd_search.best_estimator_
test_accuracy = best_model.score(X_test_prep, y_test_prep)
print(f"Test Accuracy: {test_accuracy:.2f}")

In [None]:
# Fit the network for 20 epochs, reporting validation metrics after each one.
model.fit(
    x=X_train_tensor,
    y=y_train_tensor,
    epochs=20,
    validation_data=(X_val_tensor, y_val_tensor),
)

# Measure loss and accuracy on the held-out test tensors.
test_loss, test_accuracy = model.evaluate(x=X_test_tensor, y=y_test_tensor)
print(f"Test accuracy: {test_accuracy:.2f}")

In [None]:
# Load the rows that need a prediction.
air_canada_to_predict = pd.read_csv("new_branded_baseline.csv")

# Same identifier / target / timestamp columns that were removed from the
# training features.
columns_to_drop = [
    'id',
    'Unnamed: 0',
    'ticket_id',
    'choice',
    'flight_departure_datetime',
    'purchase_datetime',
]

# Features for the model, plus the 'id' column kept aside (as a one-column
# DataFrame) to label the predictions later.
X_to_predict = air_canada_to_predict.drop(columns=columns_to_drop)
y_to_predict = air_canada_to_predict[['id']]

In [None]:
# Reuse the preprocessing already fitted on the training split. Refitting here
# would rescale the features and could reorder or resize the one-hot columns,
# so the inputs would no longer match what the model was trained on.
X_to_predict_prep = preprocessing.transform(X_to_predict)

In [None]:
# Run the trained network on the prepared prediction features; the softmax
# output layer yields one score per class for every row.
predictions = model.predict(x=X_to_predict_prep)

In [None]:
# Save the predictions to a CSV file with a column named choice_prediction.

import numpy as np
import pandas as pd

# `predictions` holds one row of class scores per input row; pick the column
# with the highest score.
# NOTE(review): this is the column index of the one-hot target encoding, which
# equals the original `choice` label only if the labels are 0..k-1 — confirm.
predicted_classes = predictions.argmax(axis=1)

# Put the 'id' column and the predicted class side by side (aligned on index).
results_df = pd.concat(
    [y_to_predict, pd.Series(predicted_classes, name='choice_prediction')],
    axis=1,
)

# Write the result without the index column.
results_df.to_csv('prediction.csv', index=False)