In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import logging

# Configure inline plots for Jupyter Notebook
%matplotlib inline

In [15]:
import logging
import os
import pandas as pd
import datetime

# Set base path for data
base_path = r"C:\Users\neba\Downloads\Compressed\rossmann-store-sales"

# Create necessary directories
log_dir = "logs"
model_dir = "models"
os.makedirs(log_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

# Configure logging
logging.basicConfig(
    filename=os.path.join(log_dir, "model_training.log"),
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger()
logger.info("Logging setup complete for model training.")

try:
    # Load datasets
    logger.info("Loading datasets.")
    train_df = pd.read_csv(os.path.join(base_path, "train.csv"))
    test_df = pd.read_csv(os.path.join(base_path, "test.csv"))
    store_df = pd.read_csv(os.path.join(base_path, "store.csv"))
    logger.info("Datasets loaded successfully.")
except Exception as e:
    logger.error(f"An error occurred while loading the datasets: {e}", exc_info=True)


  train_df = pd.read_csv(os.path.join(base_path, "train.csv"))


In [20]:
# Summarize train dataset
print("Train Dataset Summary")
print(train_df.info())  # Overview of columns, data types, and non-null counts
print(train_df.describe(include='all'))  # Summary statistics for numeric and non-numeric columns

# Summarize store dataset
print("\nStore Dataset Summary")
print(store_df.info())
print(store_df.describe(include='all'))


Train Dataset Summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 13 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   Store          1017209 non-null  int64         
 1   DayOfWeek      1017209 non-null  int64         
 2   Date           1017209 non-null  datetime64[ns]
 3   Sales          1017209 non-null  int64         
 4   Customers      1017209 non-null  int64         
 5   Open           1017209 non-null  int64         
 6   Promo          1017209 non-null  int64         
 7   StateHoliday   1017209 non-null  object        
 8   SchoolHoliday  1017209 non-null  int64         
 9   Year           1017209 non-null  int32         
 10  Month          1017209 non-null  int32         
 11  Week           1017209 non-null  UInt32        
 12  Day            1017209 non-null  int32         
dtypes: UInt32(1), datetime64[ns](1), int32(3), int64(7), object(1)
me

In [21]:
# Missing values in train dataset
print("Missing Values in Train Dataset")
print(train_df.isnull().sum())

# Missing values in store dataset
print("\nMissing Values in Store Dataset")
print(store_df.isnull().sum())


Missing Values in Train Dataset
Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
Year             0
Month            0
Week             0
Day              0
dtype: int64

Missing Values in Store Dataset
Store                        0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
Promo_Jan                    0
Promo_Feb                    0
Promo_Mar                    0
Promo_Apr                    0
Promo_May                    0
Promo_Jun                    0
Promo_Jul                    0
Promo_Aug                    0
Promo_Sep                    0
Promo_Oct                    0
Promo_Nov                    0
Promo_Dec               

In [22]:
# Check data types and unique values
print("Train Data Types and Unique Values")
print(train_df.dtypes)
print("\nStore Data Types and Unique Values")
print(store_df.dtypes)


Train Data Types and Unique Values
Store                     int64
DayOfWeek                 int64
Date             datetime64[ns]
Sales                     int64
Customers                 int64
Open                      int64
Promo                     int64
StateHoliday             object
SchoolHoliday             int64
Year                      int32
Month                     int32
Week                     UInt32
Day                       int32
dtype: object

Store Data Types and Unique Values
Store                          int64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
Promo_Jan                      int64
Promo_Feb                      int64
Promo_Mar                      int64
Promo_Apr         

In [23]:
# Check value counts for categorical columns in train dataset
categorical_columns_train = train_df.select_dtypes(include=['object']).columns
for col in categorical_columns_train:
    print(f"Value counts for {col} in train dataset:")
    print(train_df[col].value_counts())
    print()

# Check value counts for categorical columns in store dataset
categorical_columns_store = store_df.select_dtypes(include=['object']).columns
for col in categorical_columns_store:
    print(f"Value counts for {col} in store dataset:")
    print(store_df[col].value_counts())
    print()


Value counts for StateHoliday in train dataset:
StateHoliday
0    855087
0    131072
a     20260
b      6690
c      4100
Name: count, dtype: int64

Value counts for StoreType in store dataset:
StoreType
a    602
d    348
c    148
b     17
Name: count, dtype: int64

Value counts for Assortment in store dataset:
Assortment
a    593
c    513
b      9
Name: count, dtype: int64

Value counts for PromoInterval in store dataset:
PromoInterval
None                544
Jan,Apr,Jul,Oct     335
Feb,May,Aug,Nov     130
Mar,Jun,Sept,Dec    106
Name: count, dtype: int64



Data Preprocessing

Because the StateHoliday column contains mixed data types (both strings and integers). To fix this, you need to standardize the column to have a uniform data type (e.g., convert everything to strings).

In [25]:
# Convert all values in 'StateHoliday' to strings
train_df['StateHoliday'] = train_df['StateHoliday'].astype(str)

# Apply LabelEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_df['StateHoliday'] = label_encoder.fit_transform(train_df['StateHoliday'])


One-Hot Encode Categorical Features
One-hot encode categorical columns like StoreType, Assortment, and PromoInterval in store_df.

In [26]:
store_df = pd.get_dummies(store_df, columns=['StoreType', 'Assortment', 'PromoInterval'], drop_first=True)


In [27]:
# Extract date features in train_df
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Week'] = train_df['Date'].dt.isocalendar().week
train_df['Day'] = train_df['Date'].dt.day


In [29]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Scale numerical features in train_df
num_features_train = ['Sales', 'Customers']
train_df[num_features_train] = scaler.fit_transform(train_df[num_features_train])


In [30]:
# Columns to scale in store_df
num_features_store = ['CompetitionDistance']

# Apply scaling to store_df
store_df[num_features_store] = scaler.fit_transform(store_df[num_features_store])


In [31]:
# Merge train_df and store_df
merged_df = train_df.merge(store_df, on='Store', how='left')

In [32]:
# Check for missing values
print(merged_df.isnull().sum())

# Validate data types
print(merged_df.dtypes)

Store                             0
DayOfWeek                         0
Date                              0
Sales                             0
Customers                         0
Open                              0
Promo                             0
StateHoliday                      0
SchoolHoliday                     0
Year                              0
Month                             0
Week                              0
Day                               0
CompetitionDistance               0
CompetitionOpenSinceMonth         0
CompetitionOpenSinceYear          0
Promo2                            0
Promo2SinceWeek                   0
Promo2SinceYear                   0
Promo_Jan                         0
Promo_Feb                         0
Promo_Mar                         0
Promo_Apr                         0
Promo_May                         0
Promo_Jun                         0
Promo_Jul                         0
Promo_Aug                         0
Promo_Sep                   

Key Observations
No Missing Values:

All columns have 0 missing values, meaning the data is clean and ready for modeling.
Data Types:

Numeric columns like Sales, Customers, and CompetitionDistance are correctly set as float64 or int64.
Categorical columns like StateHoliday have been encoded as int64.
Boolean columns (e.g., StoreType_b, PromoInterval_Jan,Apr,Jul,Oct) are appropriately bool.
Date Column:

The Date column is correctly stored as datetime64[ns]

Feature Engineering
New Features to Add
IsWeekend: Whether the day is a weekend (Saturday or Sunday).
Quarter: The quarter of the year (1, 2, 3, or 4).
IsHolidaySeason: Whether the date falls in a holiday season (November or December).

In [33]:
merged_df['IsWeekend'] = merged_df['DayOfWeek'].isin([6, 7]).astype(int)
merged_df['Quarter'] = merged_df['Month'].apply(lambda x: (x - 1) // 3 + 1)
merged_df['IsHolidaySeason'] = merged_df['Month'].isin([11, 12]).astype(int)


In [35]:
from sklearn.model_selection import train_test_split

# Define features and target
X = merged_df.drop(columns=['Sales', 'Date'])
y = merged_df['Sales']

# Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define the pipeline
rf_pipeline = Pipeline([
    ('model', RandomForestRegressor(random_state=42, n_estimators=100))
])

# Train the model
rf_pipeline.fit(X_train, y_train)

# Predictions
y_pred = rf_pipeline.predict(X_test)

# Evaluation
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Random Forest Regressor Performance:")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R² Score: {r2}")


TypeError: got an unexpected keyword argument 'squared'

In [37]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Evaluation
mse = mean_squared_error(y_test, y_pred)  # MSE
rmse = np.sqrt(mse)  # RMSE
mae = mean_absolute_error(y_test, y_pred)  # MAE
r2 = r2_score(y_test, y_pred)  # R²

print(f"Random Forest Regressor Performance:")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R² Score: {r2}")


Random Forest Regressor Performance:
RMSE: 0.11456543194228826
MAE: 0.06851764958761702
R² Score: 0.9868453666768269


In [38]:
# Add lagged sales as features
for lag in range(1, 7):  # Lags for the past 6 weeks
    merged_df[f'Lag_{lag}'] = merged_df['Sales'].shift(lag * 7)

# Drop rows with NaN (due to lagging)
merged_df.dropna(inplace=True)

# Re-split the data
X = merged_df.drop(columns=['Sales', 'Date'])
y = merged_df['Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [40]:
# Print the names of the pipeline steps
print(rf_pipeline.named_steps)

{'model': RandomForestRegressor(random_state=42)}


In [43]:
# Access the feature importances
feature_importance = rf_pipeline.named_steps['model'].feature_importances_


In [45]:
print(rf_pipeline.named_steps)

{'model': RandomForestRegressor(random_state=42)}


In [46]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Example preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['numeric_column1', 'numeric_column2']),
        ('cat', OneHotEncoder(), ['categorical_column'])
    ]
)


In [47]:
from sklearn.pipeline import Pipeline

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])
