In [11]:
import importlib.util
import subprocess
import sys

# Install xgboost only when it is not already importable, and always through
# the kernel's own interpreter: a bare `pip` on PATH may belong to a different
# Python environment than the one this notebook is running in.
if importlib.util.find_spec("xgboost") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import auc, roc_curve
# plot_roc_curve is binary-only and was removed in scikit-learn 1.2; the import
# is kept for older environments.
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from tensorflow import keras
from tensorflow.keras import layers



In [7]:
# Load the cleaned trip table and turn it into numeric feature/target arrays.
trip = pd.read_csv("../data/clean/trip.csv")

# Drop every column whose name contains the substring "id".
# NOTE(review): an earlier comment said "id" or "flag", but only "id" has ever
# been matched here; the substring test also catches names that merely contain
# "id" (not just identifier columns) -- confirm the intended column set.
id_like_cols = [col for col in trip.columns if "id" in col]
trip = trip.drop(columns=id_like_cols)

# Drop these columns: 'reason_for_travel_to', 'why_trip', 'trip_purpose_old_schema'
trip = trip.drop(columns=['trip_purpose_old_schema', 'why_trip', 'reason_for_travel_to'])

# Remove rows where trip purpose is "Not ascertained" BEFORE fitting any
# encoder/scaler, so discarded rows cannot influence category codes or the
# min/max used for normalisation.
trip = trip[trip["trip_purpose"] != "Not ascertained"].reset_index(drop=True)

# Prepare trip data for model
label_encoder = LabelEncoder()
scaler = MinMaxScaler()

for col in trip.columns.drop('trip_purpose'):
    if trip[col].dtype == object:
        # String-valued feature: replace categories with integer codes.
        trip[col] = label_encoder.fit_transform(trip[col])
    else:
        # Everything else: min-max normalise the column to [0, 1].
        trip[col] = scaler.fit_transform(trip[col].values.reshape(-1, 1)).ravel()
# NOTE(review): the scaler is still fit on all remaining rows, i.e. before the
# train/test split below, so test-set ranges leak into the features -- consider
# fitting on the training rows only.

vis = trip["trip_purpose"].astype('category')

# Encode the target last so `label_encoder.classes_` ends up holding the
# trip-purpose class names.
trip["trip_purpose"] = label_encoder.fit_transform(trip["trip_purpose"])
target = trip["trip_purpose"].to_numpy()
features = trip.drop('trip_purpose', axis=1).to_numpy()

# Fixed seed so the 80/20 partition is identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [8]:
# Hold out 10% of the training data for validation. The split is written to new
# names (x_fit / y_fit) so x_train / y_train are not overwritten: re-running
# this cell no longer shrinks the training set, and later cells keep training
# on the full training split.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=42
)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable.
keras.utils.set_random_seed(42)

# The target holds label-encoded integer classes, so max + 1 is the number of
# output units required.
n_classes = int(target.max()) + 1

# Fully connected classifier: two 64-unit sigmoid hidden layers followed by a
# softmax over the trip-purpose classes.
model = keras.Sequential()
model.add(layers.Input(shape=(features.shape[1],)))
model.add(layers.Dense(64, activation='sigmoid'))
model.add(layers.Dense(64, activation='sigmoid'))
model.add(layers.Dense(n_classes, activation='softmax'))

# NOTE(review): `keras.optimizers.legacy` is not available in every
# TensorFlow/Keras release -- confirm against the pinned version.
model.compile(optimizer=keras.optimizers.legacy.Adam(learning_rate=0.005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(
    x_fit,
    y_fit,
    epochs=500,
    batch_size=32,
    # Stop once validation loss has not improved for 2 epochs and roll back to
    # the best weights seen, rather than keeping the last (worse) ones.
    callbacks=[keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)],
    validation_data=(x_val, y_val)
)

# Class probabilities on the test set, and the arg-max class for each row.
predictions = model.predict(x_test, verbose=0)
NN_preds = np.argmax(predictions, axis=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500


In [9]:
# Exhaustive 5-fold grid search over regularisation strength, penalty type and
# solver for a logistic-regression baseline.
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

# max_iter is raised from the default of 100, which saga may not converge
# within; random_state makes the stochastic solvers repeatable.
lr_model = LogisticRegression(max_iter=1000, random_state=42)

grid_search = GridSearchCV(lr_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Best configuration, the estimator refit with it, and its test-set predictions.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
logit_pred = best_model.predict(x_test)



In [13]:
# Randomised 3-fold search (20 sampled configurations) over XGBoost
# hyperparameters.
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Named `xgb_base` (not `model`) so the Keras network stored in `model` is not
# overwritten by this cell.
xgb_base = xgb.XGBClassifier()
grid_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_grid,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    random_state=42,  # same 20 configurations are sampled on every run
)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

# With the default refit=True the search has already retrained the best
# configuration on all of x_train, so a second manual fit is unnecessary.
xgboost_model = grid_search.best_estimator_

xgboost_pred = xgboost_model.predict(x_test)
# Backward-compatible alias for the original (misspelled) variable name.
agboost_pred = xgboost_pred


In [None]:
# Support-vector classifier with fixed hyperparameters (no search is run here).
best_c, best_kernel = 10, "poly"

# `fit` returns the estimator itself, so construction and training are chained.
svc = SVC(kernel=best_kernel, C=best_c).fit(x_train, y_train)

# Predicted class labels for the held-out test set.
SVM_pred = svc.predict(x_test)

NameError: name 'x_train' is not defined

In [None]:
# Micro-averaged one-vs-rest ROC curve for every fitted model.
#
# `plot_roc_curve` cannot be used here: it only supports binary estimators, was
# removed in scikit-learn 1.2, and cannot score a Keras model. Instead, per-class
# scores are collected from each model and the curves are built with
# `roc_curve` / `auc`.
score_sources = {
    # Softmax probabilities already computed on x_test by the Keras network.
    "Neural network": predictions,
    "Logistic regression": best_model.predict_proba(x_test),
    "XGBoost": xgboost_model.predict_proba(x_test),
    # SVC is fit without probability=True, so use its per-class decision values.
    "SVM": svc.decision_function(x_test),
}

fig, ax = plt.subplots(figsize=(10, 8))
for label, scores in score_sources.items():
    scores = np.asarray(scores)
    # One-hot encode the true labels so each class column lines up with the
    # matching score column. Assumes labels are 0..K-1 with K >= 3 classes.
    y_onehot = label_binarize(y_test, classes=np.arange(scores.shape[1]))
    # Micro-average: pool every (sample, class) pair into one binary problem.
    fpr, tpr, _ = roc_curve(y_onehot.ravel(), scores.ravel())
    ax.plot(fpr, tpr, label=f"{label} (micro-avg AUC = {auc(fpr, tpr):.2f})")

# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")

ax.set_title('ROC Curve for Multi-Class Classification')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()