In [44]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

In [2]:
# Loading data
store_data = pd.read_csv('Data/WMdata_with_holidays copy.csv')
store_data.head()

Unnamed: 0.1,Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Name
0,0,1,2010-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,
1,1,1,2010-02-12,1641957.44,1,38.51,2.548,211.24217,8.106,Valentine's Day
2,2,1,2010-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,
3,3,1,2010-02-26,1409727.59,0,46.63,2.561,211.319643,8.106,
4,4,1,2010-03-05,1554806.68,0,46.5,2.625,211.350143,8.106,


In [3]:
# Check for duplicates
duplicates = store_data[store_data.duplicated()]
if not duplicates.empty:
    print("Duplicate rows found:")
    print(duplicates)
else:
    print("No duplicate rows found")

# Check for missing values
missing_values = store_data.isnull().sum()
if missing_values.any():
    print("\nMissing values found:")
    print(missing_values)
else:
    print("\nNo missing values found")

No duplicate rows found

Missing values found:
Unnamed: 0         0
Store              0
Date               0
Weekly_Sales       0
Holiday_Flag       0
Temperature        0
Fuel_Price         0
CPI                0
Unemployment       0
Holiday_Name    5535
dtype: int64


In [4]:
#Fill NaN values and drop 'Unnamed:0' column
store_data.fillna(0, inplace=True)
store_data.drop(columns = ['Unnamed: 0'], inplace=True)
store_data.head(30)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Name
0,1,2010-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,0
1,1,2010-02-12,1641957.44,1,38.51,2.548,211.24217,8.106,Valentine's Day
2,1,2010-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,0
3,1,2010-02-26,1409727.59,0,46.63,2.561,211.319643,8.106,0
4,1,2010-03-05,1554806.68,0,46.5,2.625,211.350143,8.106,0
5,1,2010-03-12,1439541.59,0,57.79,2.667,211.380643,8.106,0
6,1,2010-03-19,1472515.79,0,54.58,2.72,211.215635,8.106,0
7,1,2010-03-26,1404429.92,0,51.45,2.732,211.018042,8.106,0
8,1,2010-04-02,1594968.28,0,62.27,2.719,210.82045,7.808,Easter
9,1,2010-04-09,1545418.53,0,65.86,2.77,210.622857,7.808,0


In [5]:
# List of holidays as observed by Walmart
holidays = [
    ["2010-02-05", "Super Bowl"],
    ["2010-02-12", "Valentine's Day"],
    ["2010-04-02", "Easter"],
    ["2010-07-02", "Independence Day"],
    ["2010-09-10", "Labor Day"],
    ["2010-10-29", "Halloween"],
    ["2010-11-26", "Thanksgiving Day"],
    ["2010-12-24", "Christmas Day"],
    ["2011-12-31", "New Year's Day"],
    ["2011-02-04", "Super Bowl"],
    ["2011-02-11", "Valentine's Day"],
    ["2011-04-22", "Easter"],
    ["2011-07-01", "Independence Day"],
    ["2011-09-09", "Labor Day"],
    ["2011-10-28", "Halloween"],
    ["2011-11-25", "Thanksgiving Day"],
    ["2011-12-23", "Christmas Day"],
    ["2012-12-30", "New Year's Day"],
    ["2012-02-03", "Super Bowl"],
    ["2012-02-17", "Valentine's Day"],
    ["2012-04-06", "Easter"],
    ["2012-07-06", "Independence Day"],
    ["2012-09-07", "Labor Day"]
]

In [6]:
# Function to convert "Date" colum to datetime without timestamp
def clean_and_convert_date(df, date_column_name):
    #Convert the date column to datetime
    df[date_column_name] = pd.to_datetime(df[date_column_name], format='%Y-%m-%d', errors='coerce')
    
    return df

store_data = clean_and_convert_date(store_data, 'Date')

In [7]:
# Initialize 'Holiday_Name' column
store_data['Holiday_Name'] = None

In [8]:
# Function to get Holiday_Name
def get_holiday_name(date):
    for holiday_date, holiday_name in holidays:
        if date == pd.to_datetime(holiday_date, format='%Y-%m-%d').date():
            return holiday_name
    return None

In [9]:
# Function to populate Holiday_Name and resulting Holiday_Flag
store_data['Holiday_Name'] = store_data['Date'].apply(get_holiday_name)
store_data['Holiday_Flag'] = store_data['Holiday_Name'].apply(lambda x: 1 if x is not None else 0)
store_data.head(15)

  if date == pd.to_datetime(holiday_date, format='%Y-%m-%d').date():


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Name
0,1,2010-02-05,1643690.9,1,42.31,2.572,211.096358,8.106,Super Bowl
1,1,2010-02-12,1641957.44,1,38.51,2.548,211.24217,8.106,Valentine's Day
2,1,2010-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,
3,1,2010-02-26,1409727.59,0,46.63,2.561,211.319643,8.106,
4,1,2010-03-05,1554806.68,0,46.5,2.625,211.350143,8.106,
5,1,2010-03-12,1439541.59,0,57.79,2.667,211.380643,8.106,
6,1,2010-03-19,1472515.79,0,54.58,2.72,211.215635,8.106,
7,1,2010-03-26,1404429.92,0,51.45,2.732,211.018042,8.106,
8,1,2010-04-02,1594968.28,1,62.27,2.719,210.82045,7.808,Easter
9,1,2010-04-09,1545418.53,0,65.86,2.77,210.622857,7.808,


In [10]:
# Check data types
store_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         6435 non-null   int64         
 1   Date          6435 non-null   datetime64[ns]
 2   Weekly_Sales  6435 non-null   float64       
 3   Holiday_Flag  6435 non-null   int64         
 4   Temperature   6435 non-null   float64       
 5   Fuel_Price    6435 non-null   float64       
 6   CPI           6435 non-null   float64       
 7   Unemployment  6435 non-null   float64       
 8   Holiday_Name  945 non-null    object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(1)
memory usage: 452.6+ KB


In [11]:
X = store_data.copy()
X.drop('Weekly_Sales', axis=1, inplace=True)
X.head()

Unnamed: 0,Store,Date,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Name
0,1,2010-02-05,1,42.31,2.572,211.096358,8.106,Super Bowl
1,1,2010-02-12,1,38.51,2.548,211.24217,8.106,Valentine's Day
2,1,2010-02-19,0,39.93,2.514,211.289143,8.106,
3,1,2010-02-26,0,46.63,2.561,211.319643,8.106,
4,1,2010-03-05,0,46.5,2.625,211.350143,8.106,


In [12]:
# Define target vector
y = store_data["Weekly_Sales"].ravel()
y[:5]

array([1643690.9 , 1641957.44, 1611968.17, 1409727.59, 1554806.68])

In [13]:
X = pd.get_dummies(X, columns=['Holiday_Name'])

In [47]:
# Define numeric features and feature names
numeric_features = ['Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']

In [48]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [50]:
# Create a ColumnTransformer to scale numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('passthrough', 'passthrough', ['Date']),  
    ],
    remainder='drop'  # Drop columns not specified above
)

In [51]:
# Create a pipeline to perform preprocessing
pipeline = Pipeline([
    ('preprocessor', preprocessor),
])

In [52]:
# Define a FunctionTransformer to convert 'Date' column to a NumPy array
date_converter = FunctionTransformer(lambda X: X.to_numpy().reshape(-1, 1), validate=False)

In [53]:
# Fit and transform the training data
X_train_scaled = pipeline.fit_transform(X_train)
X_test_scaled = pipeline.transform(X_test)

DTypePromotionError: The DType <class 'numpy.dtypes.Float64DType'> could not be promoted by <class 'numpy.dtypes.DateTime64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.DateTime64DType'>)

In [None]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

In [None]:
# Visualize the features by importance
importances_df = pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
importances_df.set_index(importances_df[1], inplace=True)
importances_df.drop(columns=1, inplace=True)
importances_df.rename(columns={0: 'Feature Importances'}, inplace=True)
importances_sorted = importances_df.sort_values(by='Feature Importances')
importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)