In [2]:
import logging
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Make the project importable without a machine-specific absolute path.
# Assumes the notebook runs from a directory directly below the project root
# (the same assumption the relative '../data/...' paths rely on).
PROJECT_ROOT = os.path.abspath('..')
SCRIPTS_DIR = os.path.join(PROJECT_ROOT, 'scripts')

# Guarded so that re-running the cell does not keep growing sys.path
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [4]:
from scripts.data_loader import load_data
from scripts.utils import setup_logger

In [5]:
# Silence FutureWarning messages that are triggered during training
import warnings
warnings.simplefilter("ignore", FutureWarning)

In [6]:
# Set up logging via the project helper imported from scripts.utils.
# The call is the last expression of the cell, so its return value is displayed.
setup_logger()

<RootLogger root (INFO)>

In [7]:
# Read the three processed CSV files with the project loader, in the order
# train, test, store
train_df, test_df, store_df = map(
    load_data,
    (
        '../data/processed/train_df.csv',
        '../data/processed/test_df.csv',
        '../data/processed/store_df.csv',
    ),
)

In [8]:
# Show column dtypes and non-null counts of the training frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007478 entries, 0 to 1007477
Data columns (total 23 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1007478 non-null  int64  
 1   DayOfWeek                  1007478 non-null  int64  
 2   Date                       1007478 non-null  object 
 3   Sales                      1007478 non-null  int64  
 4   Customers                  1007478 non-null  int64  
 5   Open                       1007478 non-null  int64  
 6   Promo                      1007478 non-null  int64  
 7   StateHoliday               1007478 non-null  object 
 8   SchoolHoliday              1007478 non-null  int64  
 9   Year                       1007478 non-null  int64  
 10  Month                      1007478 non-null  int64  
 11  Day                        1007478 non-null  int64  
 12  BeforeHoliday              1007478 non-null  object 
 13  AfterHoliday

In [9]:
# Preview the first rows of the training frame
train_df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,...,AfterHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,2015,...,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,
1,2,5,2015-07-31,6064,625,1,1,0,1,2015,...,0,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,5,2015-07-31,8314,821,1,1,0,1,2015,...,0,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,5,2015-07-31,13995,1498,1,1,0,1,2015,...,0,c,c,620.0,9.0,2009.0,0,0.0,0.0,
4,5,5,2015-07-31,4822,559,1,1,0,1,2015,...,0,a,a,29910.0,4.0,2015.0,0,0.0,0.0,


In [10]:
# List the column names of the training frame
train_df.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'Year', 'Month', 'Day',
       'BeforeHoliday', 'AfterHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval'],
      dtype='object')

In [11]:
# Parse the 'Date' column of both frames into pandas datetimes
for frame in (train_df, test_df):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [12]:
# Re-check the dtypes after converting 'Date' with pd.to_datetime
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007478 entries, 0 to 1007477
Data columns (total 23 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   Store                      1007478 non-null  int64         
 1   DayOfWeek                  1007478 non-null  int64         
 2   Date                       1007478 non-null  datetime64[ns]
 3   Sales                      1007478 non-null  int64         
 4   Customers                  1007478 non-null  int64         
 5   Open                       1007478 non-null  int64         
 6   Promo                      1007478 non-null  int64         
 7   StateHoliday               1007478 non-null  object        
 8   SchoolHoliday              1007478 non-null  int64         
 9   Year                       1007478 non-null  int64         
 10  Month                      1007478 non-null  int64         
 11  Day                        1007478 no

### Generate New Features from DateTime Columns

Weekday

In [13]:
# Day of the week derived from 'Date' (0 = Monday, 6 = Sunday)
for frame in (train_df, test_df):
    frame['Weekday'] = frame['Date'].dt.dayofweek

Weekends

In [14]:
# Binary feature for weekends (1 = weekend, 0 = weekday).
# 'Weekday' is 0 = Monday ... 6 = Sunday, so values >= 5 are Saturday/Sunday.
# Vectorised comparison instead of a per-row Python lambda.
train_df['Is_Weekend'] = (train_df['Weekday'] >= 5).astype(int)
test_df['Is_Weekend'] = (test_df['Weekday'] >= 5).astype(int)


In [15]:
import holidays

# Build the holiday calendar once, for every year present in either frame,
# padded by one year on each side. Without the padding, dates on or before the
# first generated holiday have no "previous holiday", and dates after the last
# one have no "next holiday", so the holiday-distance features become NaN.
# NOTE(review): this is the US calendar -- confirm it matches the country the
# stores operate in.
observed_years = pd.concat([train_df['Date'], test_df['Date']]).dt.year
years = list(range(int(observed_years.min()) - 1, int(observed_years.max()) + 2))
us_holidays = holidays.US(years=years)

Number of Days to the Next Holiday

In [16]:
def days_to_next_holiday(date, holiday_dates):
    """Return the number of days from `date` to the nearest later holiday.

    Only holidays strictly after `date` are considered. If `holiday_dates`
    contains no such holiday, NaN is returned.
    """
    upcoming = [candidate for candidate in holiday_dates if candidate > date]
    if not upcoming:
        return np.nan
    nearest = min(upcoming)
    gap = nearest - date
    return gap.days

# Get the list of holiday dates from the holiday object
holiday_dates = pd.to_datetime(list(us_holidays.keys()))

# The feature depends only on the calendar date, and the same date appears on
# many rows, so evaluate the function once per distinct date and map the
# result back onto every row instead of calling it row by row.
for frame in (train_df, test_df):
    distinct_dates = frame['Date'].drop_duplicates()
    days_by_date = dict(zip(
        distinct_dates,
        distinct_dates.apply(lambda x: days_to_next_holiday(x, holiday_dates)),
    ))
    frame['Days_To_Holiday'] = frame['Date'].map(days_by_date)

Number of Days After a Holiday

In [17]:
def days_after_last_holiday(date, holiday_dates):
    """Return the number of days elapsed since the most recent earlier holiday.

    Only holidays strictly before `date` are considered. If `holiday_dates`
    contains no such holiday, NaN is returned.
    """
    earlier = [candidate for candidate in holiday_dates if candidate < date]
    if not earlier:
        return np.nan
    latest = max(earlier)
    gap = date - latest
    return gap.days

# Calculate the number of days after the last holiday.
# The feature depends only on the calendar date, and the same date appears on
# many rows, so evaluate the function once per distinct date and map the
# result back onto every row instead of calling it row by row.
for frame in (train_df, test_df):
    distinct_dates = frame['Date'].drop_duplicates()
    days_by_date = dict(zip(
        distinct_dates,
        distinct_dates.apply(lambda x: days_after_last_holiday(x, holiday_dates)),
    ))
    frame['Days_After_Holiday'] = frame['Date'].map(days_by_date)

Beginning, Mid, and End of the Month

In [18]:
def month_phase(day):
    """Classify a day-of-month number as 'Beginning', 'Mid' or 'End'.

    Days up to and including 10 are 'Beginning', days 11 through 20 are 'Mid',
    and everything else is 'End'.
    """
    if day <= 10:
        return 'Beginning'
    if 11 <= day <= 20:
        return 'Mid'
    return 'End'

# Label each row with the phase of the month its date falls in
for frame in (train_df, test_df):
    day_of_month = frame['Date'].dt.day
    frame['Month_Phase'] = day_of_month.map(month_phase)

Month Start/End

In [19]:
# 0/1 flags marking the first and the last day of a month
for frame in (train_df, test_df):
    date_accessor = frame['Date'].dt
    frame['Is_Month_Start'] = date_accessor.is_month_start.astype(int)
    frame['Is_Month_End'] = date_accessor.is_month_end.astype(int)

Quarter

In [20]:
# Calendar quarter of each date
for frame in (train_df, test_df):
    frame['Quarter'] = frame['Date'].dt.quarter

Day of the Year

In [21]:
# Ordinal day within the year for each date
train_day_of_year = train_df['Date'].dt.dayofyear
test_day_of_year = test_df['Date'].dt.dayofyear

train_df['Day_of_Year'] = train_day_of_year
test_df['Day_of_Year'] = test_day_of_year

### Scaling Data:

In [32]:
from sklearn.preprocessing import StandardScaler

# The three engineered date features that get standardised here
numeric_features = ['Days_To_Holiday', 'Days_After_Holiday', 'Day_of_Year']

# Learn the scaling statistics from the training rows only
scaler = StandardScaler()
scaler.fit(train_df[numeric_features])

# Apply the fitted scaler to both frames, overwriting the columns in place.
# NOTE(review): because the columns are overwritten, re-running this cell
# scales already-scaled values -- re-run the feature cells first.
train_df[numeric_features] = scaler.transform(train_df[numeric_features])
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

### Building Models with sklearn Pipelines

#### Separate Features (X_train) and Target (y_train) in train_df

In [18]:
# y_train: the target variable, i.e. the 'Sales' column of train_df
y_train = train_df['Sales']

# X_train: every remaining column of train_df
X_train = train_df.drop('Sales', axis=1)

In [19]:
# X_test: the columns of test_df used for prediction. 'Sales' is dropped if it
# happens to be present instead of merely assuming it is absent, and drop()
# returns a new frame, so X_test is no longer an alias of test_df.
X_test = test_df.drop(columns=['Sales'], errors='ignore')

In [None]:
# Feeding every column to StandardScaler fails: the frames contain a datetime
# column ('Date') and string-valued columns that cannot be converted to float.
# Use only columns present in both frames, drop the raw 'Date' (the derived
# date features are kept), and preprocess numeric and categorical columns
# separately.
feature_cols = [col for col in X_train.columns
                if col in X_test.columns and col != 'Date']
numeric_cols = X_train[feature_cols].select_dtypes(include='number').columns.tolist()
categorical_cols = [col for col in feature_cols if col not in numeric_cols]


def select_model_features(frame):
    """Return a copy of `frame` limited to the model features.

    Categorical columns are cast to str so the encoder sees one uniform type
    per column (object columns may mix strings and numbers -- TODO confirm).
    """
    selected = frame[feature_cols].copy()
    if categorical_cols:
        selected[categorical_cols] = selected[categorical_cols].astype(str)
    return selected


preprocessor = ColumnTransformer([
    # Numeric columns: fill missing values, then standardise
    ('numeric', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]), numeric_cols),
    # Categorical columns: one-hot encode; categories unseen during fit are
    # encoded as all zeros instead of raising at predict time
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Create a pipeline (preprocessing followed by the RandomForestRegressor model)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Fit the pipeline on training data (X_train, y_train)
pipeline.fit(select_model_features(X_train), y_train)

# Make predictions on test_df (X_test)
y_test_pred = pipeline.predict(select_model_features(X_test))

# y_test_pred contains the predicted sales for the test_df
print(y_test_pred)