<a href="https://colab.research.google.com/github/nkubana0/FuelTrend/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [89]:
#Imports
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

In [90]:
#Load data
def load_data():
    gasoline_url = "https://prod-energy-fuel-prices.s3.amazonaws.com/wholesalegasolineprices.csv"
    diesel_url = "https://prod-energy-fuel-prices.s3.amazonaws.com/wholesaledieselprices.csv"

    gas_df = pd.read_csv(gasoline_url)
    diesel_df = pd.read_csv(diesel_url)

    gas_df['Date'] = pd.to_datetime(gas_df['Date'], errors='coerce')
    diesel_df['Date'] = pd.to_datetime(diesel_df['Date'], errors='coerce')

    gas_df.rename(columns={
        "Date": "date",
        "Day-of Toronto Wholesale Gasoline  / Prix de gros de l’essence à Toronto": "gasoline_price"
    }, inplace=True)

    diesel_df.rename(columns={
        "Date": "date",
        "Day-of Toronto Wholesale Diesel  / Prix de gros du diesel à Toronto": "diesel_price"
    }, inplace=True)

    df = pd.merge(gas_df[['date', 'gasoline_price']], diesel_df[['date', 'diesel_price']], on='date', how='inner')

    return df

In [91]:
# 2. Feature Engineering
def feature_engineering(df):
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df = df.ffill()

    df['price_category'] = pd.qcut(df['gasoline_price'], q=3, labels=[0, 1, 2])

    print("Class balance (price_category):")
    print(df['price_category'].value_counts())
    print(df['price_category'].value_counts(normalize=True))

    return df

In [92]:
# Prepare X and y
def prepare_xy(df, target_column="price_category"):
    X = df.drop(columns=["date", "gasoline_price", target_column])
    y = df[target_column].astype(int)
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [93]:
# Save models
def save_sklearn_model(model, name):
    os.makedirs("saved_models", exist_ok=True)
    joblib.dump(model, f"saved_models/{name}.pkl")

def save_keras_model(model, name):
    os.makedirs("saved_models", exist_ok=True)
    model.save(f"saved_models/{name}.h5")

In [94]:
# Evaluate model
def evaluate_model(model, X_val, y_val):
    preds = model.predict(X_val)
    if preds.ndim > 1:
        preds = np.argmax(preds, axis=1)
    acc = accuracy_score(y_val, preds)
    prec = precision_score(y_val, preds, average='macro')
    rec = recall_score(y_val, preds, average='macro')
    f1 = f1_score(y_val, preds, average='macro')
    return acc, prec, rec, f1

In [95]:
#ML model
def train_logistic_regression(X_train, y_train, X_val, y_val):
    model = LogisticRegression(C=1.0, penalty='l2', solver='lbfgs', max_iter=500)
    model.fit(X_train, y_train)
    return model, evaluate_model(model, X_val, y_val)

In [96]:
#NN simple
def build_simple_nn(input_dim, num_classes):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [97]:
#NN optimized
def build_optimized_nn(input_dim, optimizer, regularizer=None, dropout_rate=None, num_classes=3):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(128, activation='relu', kernel_regularizer=regularizer))
    if dropout_rate:
        model.add(Dropout(dropout_rate))
    model.add(Dense(64, activation='relu', kernel_regularizer=regularizer))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [99]:
# Main execution
if __name__ == '__main__':
    df = load_data()
    df = feature_engineering(df)

    # Train/Val/Test split
    train_df, test_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['price_category'])
    X_train, X_val, y_train, y_val = prepare_xy(train_df, target_column="price_category")
    X_test = test_df.drop(columns=["date", "gasoline_price", "price_category"])
    y_test = test_df["price_category"].astype(int)

    # Logistic Regression
    lr_model, (acc, prec, rec, f1) = train_logistic_regression(X_train, y_train, X_val, y_val)
    print("Logistic Regression:", acc, prec, rec, f1)
    save_sklearn_model(lr_model, "logistic_regression_model")

    # NN Simple
    nn_simple = build_simple_nn(X_train.shape[1], 3)
    nn_simple.fit(X_train, y_train, epochs=10, verbose=0)
    acc, prec, rec, f1 = evaluate_model(nn_simple, X_val, y_val)
    print("NN Simple:", acc, prec, rec, f1)
    save_keras_model(nn_simple, "nn_simple")

    # NN Optimized Instance 1
    nn1 = build_optimized_nn(X_train.shape[1], optimizer=tf.keras.optimizers.Adam())
    nn1.fit(X_train, y_train, epochs=20, verbose=0)
    acc, prec, rec, f1 = evaluate_model(nn1, X_val, y_val)
    print("NN Opt 1:", acc, prec, rec, f1)
    save_keras_model(nn1, "nn_optimized_1")

    # NN Optimized Instance 2
    nn2 = build_optimized_nn(X_train.shape[1], optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
    es = EarlyStopping(monitor='val_loss', patience=5)
    nn2.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val), callbacks=[es], verbose=0)
    acc, prec, rec, f1 = evaluate_model(nn2, X_val, y_val)
    print("NN Opt 2:", acc, prec, rec, f1)
    save_keras_model(nn2, "nn_optimized_2")

    # NN Optimized Instance 3
    nn3 = build_optimized_nn(
        X_train.shape[1],
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0005),
        regularizer=regularizers.l2(0.001),
        dropout_rate=0.2
    )
    nn3.fit(X_train, y_train, epochs=75, validation_data=(X_val, y_val), callbacks=[es], verbose=0)
    acc, prec, rec, f1 = evaluate_model(nn3, X_val, y_val)
    print("NN Opt 3:", acc, prec, rec, f1)
    save_keras_model(nn3, "nn_optimized_3")

    # NN Optimized Instance 4
    nn4 = build_optimized_nn(
        X_train.shape[1],
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
        regularizer=regularizers.l2(0.01),
        dropout_rate=0.3
    )
    nn4.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val), callbacks=[es], verbose=0)
    acc, prec, rec, f1 = evaluate_model(nn4, X_val, y_val)
    print("NN Opt 4:", acc, prec, rec, f1)
    save_keras_model(nn4, "nn_optimized_4")

    # Predict on test set with best model (Example with LR)
    lr_preds = lr_model.predict(X_test)
    test_acc = accuracy_score(y_test, lr_preds)
    print("Test Accuracy (LR):", test_acc)

Class balance (price_category):
price_category
0    470
2    463
1    456
Name: count, dtype: int64
price_category
0    0.338373
2    0.333333
1    0.328294
Name: proportion, dtype: float64
Logistic Regression: 0.7245762711864406 0.7239090263555434 0.722542735042735 0.7231070815976476
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NN Simple: 0.6483050847457628 0.4468590076392494 0.6456196581196582 0.5235057755560762




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 




NN Opt 1: 0.6567796610169492 0.805835587246947 0.6541666666666667 0.5758808063102542




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NN Opt 2: 0.576271186440678 0.4012745098039216 0.5729700854700854 0.4689858082381447
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NN Opt 3: 0.4745762711864407 0.46204620462046203 0.47500000000000003 0.3845446950710108
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NN Opt 4: 0.3305084745762712 0.11016949152542373 0.3333333333333333 0.16560509554140126
Test Accuracy (LR): 0.7129186602870813
