In [2]:
# notebooks/02_ml_modeling.ipynb

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import joblib
import logging
import os
from datetime import datetime

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define class for ML Model
class MLModel:
    def __init__(self, train_path, test_path):
        self.train_path = train_path
        self.test_path = test_path
        self.model = None
        self.pipeline = None

    def load_data(self):
        dtype_spec = {
            'StateHoliday': str,
            'StoreType': str,
            'Assortment': str,
            'PromoInterval': str
        }
        self.train = pd.read_csv(self.train_path, dtype=dtype_spec)
        self.test = pd.read_csv(self.test_path, dtype=dtype_spec)
        self.X_train = self.train.drop(columns=['Sales'])
        self.y_train = self.train['Sales']
        self.X_test = self.test.copy()  # Sales column not available in test data
        logger.info("Data loaded successfully")

    def preprocess(self):
        # Identify numerical and categorical columns
        numerical_cols = self.X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_cols = self.X_train.select_dtypes(include=['object', 'bool']).columns.tolist()

        # Ensure all categorical columns are of string type
        self.X_train[categorical_cols] = self.X_train[categorical_cols].astype(str)
        self.X_test[categorical_cols] = self.X_test[categorical_cols].astype(str)

        # Define preprocessing steps
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Bundle preprocessing for numerical and categorical data
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_cols),
                ('cat', categorical_transformer, categorical_cols)
            ])

        self.pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                        ('model', RandomForestRegressor(n_estimators=100, random_state=42))])
        logger.info("Preprocessing pipeline created")

    def build_model(self):
        self.pipeline.fit(self.X_train, self.y_train)
        logger.info("Model trained successfully")

    def evaluate_model(self):
        predictions = self.pipeline.predict(self.X_train)
        mse = mean_squared_error(self.y_train, predictions)
        rmse = mean_squared_error(self.y_train, predictions, squared=False)
        logger.info(f"Model evaluation complete. RMSE: {rmse}")
        return rmse

    def save_model(self):
        model_dir = 'data/models'
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        model_path = os.path.join(model_dir, f"model-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.pkl")
        joblib.dump(self.pipeline, model_path)
        logger.info(f"Model saved at {model_path}")

    def run(self):
        self.load_data()
        self.preprocess()
        self.build_model()
        rmse = self.evaluate_model()
        self.save_model()
        return rmse

# Create instance of MLModel
model = MLModel('train_processed.csv', 'test_processed.csv')

# Run the model pipeline
rmse = model.run()
print(f"Model RMSE: {rmse}")

Model RMSE: 0.053470820942109865


In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
import logging
import os
from datetime import datetime

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define class for ML Model
class MLModel:
    def __init__(self, train_path, test_path):
        self.train_path = train_path
        self.test_path = test_path
        self.model = None
        self.pipeline = None

    def load_data(self):
        dtype_spec = {
            'StateHoliday': str,
            'StoreType': str,
            'Assortment': str,
            'PromoInterval': str
        }
        self.train = pd.read_csv(self.train_path, dtype=dtype_spec)
        self.test = pd.read_csv(self.test_path, dtype=dtype_spec)
        self.X_train = self.train.drop(columns=['Sales'])
        self.y_train = self.train['Sales']
        self.X_test = self.test.copy()  # Sales column not available in test data
        logger.info("Data loaded successfully")

    def preprocess(self):
        # Extract datetime features
        def extract_date_features(df):
            df['Date'] = pd.to_datetime(df['Date'])
            df['Year'] = df['Date'].dt.year
            df['Month'] = df['Date'].dt.month
            df['Day'] = df['Date'].dt.day
            df['WeekOfYear'] = df['Date'].dt.isocalendar().week
            df['Weekday'] = df['Date'].dt.weekday
            df['IsWeekend'] = df['Weekday'] >= 5
            df['MonthStart'] = df['Day'] <= 10
            df['MonthMid'] = (df['Day'] > 10) & (df['Day'] <= 20)
            df['MonthEnd'] = df['Day'] > 20
            return df

        self.X_train = extract_date_features(self.X_train)
        self.X_test = extract_date_features(self.X_test)

        # Identify numerical and categorical columns
        numerical_cols = self.X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_cols = self.X_train.select_dtypes(include=['object', 'bool']).columns.tolist()

        # Ensure all categorical columns are of string type
        self.X_train[categorical_cols] = self.X_train[categorical_cols].astype(str)
        self.X_test[categorical_cols] = self.X_test[categorical_cols].astype(str)

        # Define preprocessing steps
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Bundle preprocessing for numerical and categorical data
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_cols),
                ('cat', categorical_transformer, categorical_cols)
            ])

        self.pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                        ('model', RandomForestRegressor(n_estimators=100, random_state=42))])
        logger.info("Preprocessing pipeline created")

    def build_model(self):
        self.pipeline.fit(self.X_train, self.y_train)
        logger.info("Model trained successfully")

    def evaluate_model(self):
        predictions = self.pipeline.predict(self.X_train)
        mse = mean_squared_error(self.y_train, predictions)
        rmse = mean_squared_error(self.y_train, predictions, squared=False)
        logger.info(f"Model evaluation complete. RMSE: {rmse}")
        return rmse

    def save_model(self):
        model_dir = '../data/models'
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        model_path = os.path.join(model_dir, f"model-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.pkl")
        joblib.dump(self.pipeline, model_path)
        logger.info(f"Model saved at {model_path}")

    def run(self):
        self.load_data()
        self.preprocess()
        self.build_model()
        rmse = self.evaluate_model()
        self.save_model()
        return rmse

# Create instance of MLModel
model = MLModel('train_processed.csv', 'test_processed.csv')

# Run the model pipeline
rmse = model.run()
print(f"Model RMSE: {rmse}")

Model RMSE: 0.04896290100204645
