<a href="https://colab.research.google.com/github/nkubana0/FuelTrend/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [68]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [69]:
# Classical ML
from sklearn.linear_model import LogisticRegression

In [70]:
# Neural Networks
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

In [71]:
# 1. Load and preprocess data
def load_data():
    gasoline_url = "https://prod-energy-fuel-prices.s3.amazonaws.com/wholesalegasolineprices.csv"
    diesel_url = "https://prod-energy-fuel-prices.s3.amazonaws.com/wholesaledieselprices.csv"

    gas_df = pd.read_csv(gasoline_url)
    diesel_df = pd.read_csv(diesel_url)

    print("Gasoline DF columns:", gas_df.columns.tolist())
    print("Diesel DF columns:", diesel_df.columns.tolist())

    gas_df['Date'] = pd.to_datetime(gas_df['Date'], errors='coerce')
    diesel_df['Date'] = pd.to_datetime(diesel_df['Date'], errors='coerce')

    gas_df.rename(columns={
        "Date": "date",
        "Day-of Toronto Wholesale Gasoline  / Prix de gros de l’essence à Toronto": "gasoline_price"
        }, inplace=True)
    diesel_df.rename(columns={
        "Date": "date",
        "Day-of Toronto Wholesale Diesel  / Prix de gros du diesel à Toronto": "diesel_price"
        }, inplace=True)

    df = pd.merge(gas_df[['date', 'gasoline_price']], diesel_df[['date', 'diesel_price']], on='date', how='inner')

    return df

In [72]:
# 2. Feature Engineering
def feature_engineering(df):
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df = df.ffill()

    # 3-class target: Low / Medium / High gasoline price
    df['price_category'] = pd.qcut(df['gasoline_price'], q=3, labels=[0, 1, 2])

    # Show class balance
    print("Class balance (price_category):")
    print(df['price_category'].value_counts())
    print(df['price_category'].value_counts(normalize=True))

    return df

In [73]:
# Prepare X and y
def prepare_xy(df, target_column="price_category"):
    X = df.drop(columns=["date", "gasoline_price", target_column])
    y = df[target_column].astype(int)
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [74]:
# Classical ML: XGBClassifier
def train_logistic_regression(X_train, y_train, X_val, y_val):
    model = LogisticRegression(C=1.0, penalty='l2', solver='lbfgs', max_iter=500, multi_class='multinomial')
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    prec = precision_score(y_val, preds, average='macro')
    rec = recall_score(y_val, preds, average='macro')
    f1 = f1_score(y_val, preds, average='macro')
    return model, acc, prec, rec, f1

In [75]:
# Neural Network — Simple
def build_simple_nn(input_dim, num_classes):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [76]:
# Neural Network — Optimized
def build_optimized_nn(input_dim, optimizer, regularizer=None, dropout_rate=None, num_classes=3):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(128, activation='relu', kernel_regularizer=regularizer))
    if dropout_rate:
        model.add(Dropout(dropout_rate))
    model.add(Dense(64, activation='relu', kernel_regularizer=regularizer))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [77]:
# Save models
def save_sklearn_model(model, name):
    os.makedirs("saved_models", exist_ok=True)
    joblib.dump(model, f"saved_models/{name}.pkl")

def save_keras_model(model, name):
    os.makedirs("saved_models", exist_ok=True)
    model.save(f"saved_models/{name}.h5")

In [78]:
# Evaluate model
def evaluate_model(model, X_val, y_val):
    preds = model.predict(X_val)
    if preds.ndim > 1:  # NN outputs
        preds = np.argmax(preds, axis=1)
    acc = accuracy_score(y_val, preds)
    prec = precision_score(y_val, preds, average='macro')
    rec = recall_score(y_val, preds, average='macro')
    f1 = f1_score(y_val, preds, average='macro')
    return acc, prec, rec, f1

In [79]:
# Main execution
if __name__ == '__main__':
    # Load and prepare data
    df = load_data()
    df = feature_engineering(df)
    X_train, X_val, y_train, y_val = prepare_xy(df, target_column="price_category")

    # Logistic Regression
    lr_model, acc, prec, rec, f1 = train_logistic_regression(X_train, y_train, X_val, y_val)
    print("Logistic Regression Results:", acc, prec, rec, f1)
    save_sklearn_model(lr_model, "logistic_regression_model")

    # Simple NN
    nn_simple = build_simple_nn(X_train.shape[1], num_classes=3)
    nn_simple.fit(X_train, y_train, epochs=10, verbose=0)
    acc, prec, rec, f1 = evaluate_model(nn_simple, X_val, y_val)
    print("NN Simple Results:", acc, prec, rec, f1)
    save_keras_model(nn_simple, "nn_simple")

    # Optimized NN — Instance 1
    nn_opt1 = build_optimized_nn(
        X_train.shape[1],
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        regularizer=None,
        dropout_rate=None
    )
    es = EarlyStopping(monitor='val_loss', patience=5)
    nn_opt1.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val), callbacks=[es], verbose=0)
    acc, prec, rec, f1 = evaluate_model(nn_opt1, X_val, y_val)
    print("NN Optimized 1 Results:", acc, prec, rec, f1)
    save_keras_model(nn_opt1, "nn_optimized_1")

    # Optimized NN — Instance 2
    nn_opt2 = build_optimized_nn(
        X_train.shape[1],
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0005),
        regularizer=regularizers.l2(0.001),
        dropout_rate=0.2
    )
    nn_opt2.fit(X_train, y_train, epochs=75, validation_data=(X_val, y_val), callbacks=[es], verbose=0)
    acc, prec, rec, f1 = evaluate_model(nn_opt2, X_val, y_val)
    print("NN Optimized 2 Results:", acc, prec, rec, f1)
    save_keras_model(nn_opt2, "nn_optimized_2")

    # You can add more optimized instances (3, 4, 5) the same way!

Gasoline DF columns: ['Date', 'Day-of Toronto Wholesale Gasoline  / Prix de gros de l’essence à Toronto', 'Prior Day NY Harbor Spot / Détaillant au port de NY le jour précédent', 'Day-of Toronto less Prior Day NY Harbor / Prix de gros à Toronto moins le prix au port de NY', 'Day-of Thunder Bay Wholesale Gasoline  / Prix de gros de l’essence à Thunder Bay', 'Prior Day Edmonton Spot / Détaillant au Edmonton le jour précédent', 'Day-of Thunder Bay less Prior Day Edmonton / Prix de gros à Thunder Bay moins le prix au Edmonton']
Diesel DF columns: ['Date', 'Day-of Toronto Wholesale Diesel  / Prix de gros du diesel à Toronto', 'Prior Day NY Harbor Spot / Détaillant au port de NY le jour précédent', 'Day-of Toronto less Prior Day NY Harbor / Prix de gros à Toronto moins le prix au port de NY', 'Day-of Thunder Bay Wholesale Diesel  / Prix de gros du diesel à Thunder Bay', 'Prior Day Edmonton Spot / Détaillant au Edmonton le jour précédent', 'Day-of Thunder Bay less Prior Day Edmonton / Prix de



[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 




NN Simple Results: 0.60431654676259 0.6128162858952976 0.6028176048766303 0.5680104934021175
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NN Optimized 1 Results: 0.5539568345323741 0.3847903176071539 0.5484252268740945 0.44049038843791277
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step




NN Optimized 2 Results: 0.5575539568345323 0.7033509700176367 0.5567740427040359 0.46182465642072623
