In [19]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import os

In [25]:
class PricePredictionPipeline:
    def __init__(self, df, target_col="price"):
        self.df = df.copy()
        self.target_col = target_col
        self.models = {
            "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
            "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
            "XGBoost": xgb.XGBRegressor(n_estimators=100, random_state=42),
            "LightGBM": lgb.LGBMRegressor(n_estimators=100, random_state=42),
        }
        self.results = {}
        self.best_model = None
        self.best_model_name = None
        # self.label_encoders = {}
        # self.feature_cols = None
        self.pipeline = None

    def preprocess(self):
        # Convert date
        self.df['date'] = pd.to_datetime(self.df['date'])
        self.df['year'] = self.df['date'].dt.year
        self.df['month'] = self.df['date'].dt.month
        self.df['dayofweek'] = self.df['date'].dt.dayofweek

        # Drop unused columns
        drop_cols = [
            self.target_col,
            "usdprice",
            "date",
            "priceflag",
            "admin2",
            "market_id",
            "commodity_id",
            "latitude",
            "longitude",
        ]

        # # Encode categorical features
        # cat_cols = ['admin1', 'admin2', 'market', 'category', 'commodity',
        #             'unit', 'pricetype', 'currency']
        # for col in cat_cols:
        #     le = LabelEncoder()
        #     self.df[col] = le.fit_transform(self.df[col].astype(str))
        #     self.label_encoders[col] = le  # save encoder for later use

        # Define features and target
        X = self.df.drop(columns=[c for c in drop_cols if c in self.df.columns])
        y = self.df[self.target_col]

        # #identify columns types
        # cat_cols = X.select_dtypes(include="object").columns.tolist()
        # num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
        # self.feature_cols = X.columns.tolist()
        return X, y

    def split(self, X, y, test_size=0.2):
        return train_test_split(X, y, test_size=test_size, shuffle=False)

    def train_and_evaluate(self, X_train, X_test, y_train, y_test):
        # #identify columns types
        cat_cols = X_train.select_dtypes(include="object").columns.tolist()
        num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
        
        #build preprocessing transformer
        preprocessor = ColumnTransformer(
            transformers=[
                ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
                ("num", "passthrough", num_cols)
            ]
        )
        
        for name, model in self.models.items():
            try:
                pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
                pipeline.fit(X_train, y_train)
                # model.fit(X_train, y_train)
                y_pred = pipeline.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                r2score = r2_score(y_test, y_pred)
                self.results[name] = {"MAE": mae, "RMSE": rmse, "R2 Score": r2score}
                self.models[name] = pipeline
            except Exception as e:
                print(f"Failed to train {name}: {e}")

        if not self.results:
            raise RuntimeError("❌ All models failed to train. Please check data or library versions.")

        self.results = pd.DataFrame(self.results).T
        self.best_model_name = self.results['RMSE'].idxmin()
        self.best_model = self.models[self.best_model_name]
        self.pipeline = self.best_model
        return self.results

    def plot_predictions(self, X_test, y_test):
        if not self.best_model:
            raise ValueError("No best model found. Run train_and_evaluate() first.")
        y_pred = self.best_model.predict(X_test)
        plt.figure(figsize=(12,6))
        plt.plot(y_test.values, label="Actual", alpha=0.7)
        plt.plot(y_pred, label=f"Predicted ({self.best_model_name})", alpha=0.7)
        plt.legend()
        plt.title("Actual vs Predicted Prices")
        plt.show()

    def save_best_model(self, filename="best_model.pkl"):
        if not self.pipeline:
            raise ValueError("No trained model found. Run train_and_evaluate() first.")

        # package = {
        #     "model": self.best_model,
        #     "model_name": self.best_model_name,
        #     "label_encoders": self.label_encoders,
        #     "feature_cols": self.feature_cols,
        # }
        joblib.dump(self.pipeline, filename)
        print(f"✅ Best model ({self.best_model_name}) saved as {filename}")

    def load_model(self, filename="best_model.pkl"):
        if not os.path.exists(filename):
            raise FileNotFoundError(f"{filename} not found.")
        self.pipeline = joblib.load(filename)
        # self.best_model = package["model"]
        # self.best_model_name = package["model_name"]
        # self.label_encoders = package["label_encoders"]
        # self.feature_cols = package["feature_cols"]
        print(f"✅ Loaded pipeline from {filename}")
        return self.pipeline

    # def prepare_new_data(self, new_df):
    #     """Preprocess new unseen data using saved encoders & features"""
    #     new_df = new_df.copy()
    #     new_df['date'] = pd.to_datetime(new_df['date'])
    #     new_df['year'] = new_df['date'].dt.year
    #     new_df['month'] = new_df['date'].dt.month
    #     new_df['dayofweek'] = new_df['date'].dt.dayofweek

    #     # Apply saved label encoders
    #     for col, le in self.label_encoders.items():
    #         if col in new_df:
    #             new_df[col] = new_df[col].map(lambda s: s if s in le.classes_ else "<UNK>")
    #             le_classes = np.append(le.classes_, "<UNK>")
    #             le.classes_ = le_classes
    #             new_df[col] = le.transform(new_df[col].astype(str))

    #     X_new = new_df[self.feature_cols]
    #     return X_new

    def predict_new(self, new_df):
        if not self.pipeline:
            raise ValueError("No pipeline loaded. Train or load a model first.")
        # Date feature engineering
        new_df = new_df.copy()
        new_df["date"] = pd.to_datetime(new_df["date"])
        new_df["year"] = new_df["date"].dt.year
        new_df["month"] = new_df["date"].dt.month
        new_df["dayofweek"] = new_df["date"].dt.dayofweek
        new_df = new_df.drop(columns=["date"], errors="ignore")
        return self.best_model.predict(new_df)


In [37]:
# df = pd.read_csv("../data/raw/nigerian_food_prices_2002_2025.csv", skiprows=[1])

# # Inspect
# df.head()


In [29]:
# predictor = AdvancePricePredictor(model_save_path="best_price_model.pkl")
# predictor = AdvancePricePredictor(target_column="price", time_column="date")

In [33]:
# best_result = predictor.train_and_select_best(df)
# print("Best Model", best_result['best_model'])
# print("Performance", best_result['metrics'])


In [None]:
# comparison_df = predictor.get_model_comparison()
# comparison_df

In [34]:
# new_data = df.tail(10)
# predictions = predictor.predict_prices(new_data)

# print("Predicted Prices:", predictions)

In [26]:
# ---------------- USAGE ---------------- #
df = pd.read_csv("../data/raw/nigerian_food_prices_2002_2025.csv", skiprows=[1])
pipeline = PricePredictionPipeline(df)
X, y = pipeline.preprocess()
X_train, X_test, y_train, y_test = pipeline.split(X, y)
results = pipeline.train_and_evaluate(X_train, X_test, y_train, y_test)
print(results)
pipeline.save_best_model("../models/saved_models/food_price_model.pkl")   # Save with preprocessing

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 256
[LightGBM] [Info] Number of data points in the train set: 48452, number of used features: 128
[LightGBM] [Info] Start training from score 5256.745966
                          MAE         RMSE  R2 Score
RandomForest      2489.575227  5391.772992  0.752258
GradientBoosting  2623.988663  5713.382932  0.721822
XGBoost           2498.315925  5396.919361  0.751785
LightGBM          2499.253006  5408.687012  0.750701
✅ Best model (RandomForest) saved as ../models/saved_models/food_price_model.pkl




In [14]:
# Later, for prediction on unseen data:
pipeline.load_model("../models/saved_models/food_price_model.pkl")
new_data = pd.DataFrame([{
    "date": "2025-01-15",
    "admin1": "Katsina",
    # "admin2": "Jibia",
    "market": "Jibia (CBM)",
    # "market_id": 1038,
    # "latitude": 13.08,
    # "longitude": 7.24,
    "category": "cereals and tubers",
    "commodity": "Maize",
    # "commodity_id": 51,
    "unit": "50KG",
    # "priceflag": "actual",
    "pricetype": "Wholesale",
    "currency": "NGN",
    # "usdprice": 1.54
}])
preds = pipeline.predict_new(new_data)
print("Predicted Price:", preds)

✅ Loaded RandomForest from ../models/saved_models/food_price_model.pkl
Predicted Price: [23467.6075]


In [27]:
# Load the saved model file
model_path = "../models/saved_models/food_price_model.pkl"
model = joblib.load(model_path)

print("Type of object saved:", type(model))
print("\nAttributes available:", dir(model))

# If it's a pipeline, check the steps
if hasattr(model, "steps"):
    print("\nPipeline steps:")
    for step in model.steps:
        print(step)

# If it's RandomForest or another estimator
if hasattr(model, "get_params"):
    print("\nModel parameters:")
    print(model.get_params())


Type of object saved: <class 'sklearn.pipeline.Pipeline'>

Attributes available: ['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__sklearn_is_fitted__', '__sklearn_tags__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_build_request_for_signature', '_can_fit_transform', '_can_inverse_transform', '_can_transform', '_check_method_params', '_doc_link_module', '_doc_link_template', '_doc_link_url_param_generator', '_estimator_type', '_final_estimator', '_fit', '_get_default_requests', '_get_doc_link', '_get_metadata_for_step', '_get_metadata_request', '_get_param_names', '_get_params', '_get_params_html', '_h

In [28]:

# Load model
model = joblib.load("../models/saved_models/food_price_model.pkl")

# Print type
print("Type of saved object:", type(model))

# If it's a pipeline
if hasattr(model, "steps"):
    print("\n✅ It's a pipeline. Steps inside:")
    for name, step in model.steps:
        print(f" - {name}: {step}")
else:
    print("\n⚠️ Not a pipeline. Likely a raw model:")
    print(model)

# Print available attributes
print("\nAttributes / Methods available:")
print(dir(model)[:50])  # just first 50 to avoid too long output


Type of saved object: <class 'sklearn.pipeline.Pipeline'>

✅ It's a pipeline. Steps inside:
 - preprocessor: ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['admin1', 'market', 'category', 'commodity',
                                  'unit', 'pricetype', 'currency']),
                                ('num', 'passthrough', [])])
 - model: RandomForestRegressor(random_state=42)

Attributes / Methods available:
['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__sklearn_is_fitted__', '__sklearn_tags__', '__str__', '__subclasshook__', '__weakref__', '_abc_imp