In [1]:
# Load the necessary libraries and dataset
import pandas as pd
sales_data = pd.read_csv("/Users/zone/Desktop/Week2/Day 5/IronKaggle/sales.csv")

# DATA CLEANING

# Convert 'date' column to datetime
sales_data['date'] = pd.to_datetime(sales_data['date'])

# One-hot encoding of 'state_holiday'
sales_data_encoded = pd.get_dummies(sales_data, columns=['state_holiday'], drop_first=True)

# Drop 'Unnamed: 0' column
sales_data_cleaned = sales_data_encoded.drop(columns=['Unnamed: 0'])

#FEATURE ENGINEERING

# Extract additional features from the 'date' column
sales_data_cleaned['year'] = sales_data_cleaned['date'].dt.year
sales_data_cleaned['month'] = sales_data_cleaned['date'].dt.month
sales_data_cleaned['day'] = sales_data_cleaned['date'].dt.day
sales_data_cleaned['week_of_year'] = sales_data_cleaned['date'].dt.isocalendar().week



In [2]:
# ADITIONAL PREPROCESSING STEPS

# Check for outliers 

from scipy import stats

# Calculate the Z-score for 'sales' and 'nb_customers_on_day'
sales_z_scores = stats.zscore(sales_data_cleaned['sales'])
customers_z_scores = stats.zscore(sales_data_cleaned['nb_customers_on_day'])

# Identify outliers (Z-score > 3 or Z-score < -3)
outliers_sales = (abs(sales_z_scores) > 3)
outliers_customers = (abs(customers_z_scores) > 3)

# Remove the outliers from the dataset
sales_data_no_z_outliers = sales_data_cleaned[~(outliers_sales | outliers_customers)]

# Display the size of the dataset after removing outliers
sales_data_no_z_outliers.info(), sales_data_no_z_outliers.head()


<class 'pandas.core.frame.DataFrame'>
Index: 629897 entries, 0 to 640839
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   store_ID             629897 non-null  int64         
 1   day_of_week          629897 non-null  int64         
 2   date                 629897 non-null  datetime64[ns]
 3   nb_customers_on_day  629897 non-null  int64         
 4   open                 629897 non-null  int64         
 5   promotion            629897 non-null  int64         
 6   school_holiday       629897 non-null  int64         
 7   sales                629897 non-null  int64         
 8   state_holiday_a      629897 non-null  bool          
 9   state_holiday_b      629897 non-null  bool          
 10  state_holiday_c      629897 non-null  bool          
 11  year                 629897 non-null  int32         
 12  month                629897 non-null  int32         
 13  day                

(None,
    store_ID  day_of_week       date  nb_customers_on_day  open  promotion  \
 0       366            4 2013-04-18                  517     1          0   
 1       394            6 2015-04-11                  694     1          0   
 2       807            4 2013-08-29                  970     1          1   
 3       802            2 2013-05-28                  473     1          1   
 4       726            4 2013-10-10                 1068     1          1   
 
    school_holiday  sales  state_holiday_a  state_holiday_b  state_holiday_c  \
 0               0   4422            False            False            False   
 1               0   8297            False            False            False   
 2               0   9729            False            False            False   
 3               0   6513            False            False            False   
 4               0  10882            False            False            False   
 
    year  month  day  week_of_year  
 0  

In [3]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = sales_data_no_z_outliers.drop(columns=['sales', 'date'])
y = sales_data_no_z_outliers['sales']

# Perform an 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the size of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((503917, 13), (125980, 13), (503917,), (125980,))

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Linear Regression model
linear_model = LinearRegression()

# Train the model on the training data
linear_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = linear_model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2


(1673060.7451727511, 0.8649088693573258)

In [5]:
# Random Forest

from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the Random Forest model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model's performance
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mse_rf, r2_rf


(695351.1767549412, 0.9438539354099544)

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42),
                                   param_distributions=param_grid,
                                   n_iter=10,  # Number of combinations to try
                                   cv=3,       # 3-fold cross-validation
                                   random_state=42, n_jobs=-1)

# Fit the model
random_search.fit(X_train, y_train)  # This step is crucial

# Retrieve the best hyperparameters and evaluate performance
best_rf_model = random_search.best_estimator_  # Now this works after fitting
y_pred_rf_tuned = best_rf_model.predict(X_test)

# Calculate MSE and R²
mse_rf_tuned = mean_squared_error(y_test, y_pred_rf_tuned)
r2_rf_tuned = r2_score(y_test, y_pred_rf_tuned)

# Display the best parameters and model performance
print("Best Parameters: ", random_search.best_params_)
print(f"MSE: {mse_rf_tuned}, R²: {r2_rf_tuned}")




9 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamet

Best Parameters:  {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30}
MSE: 1305008.891728689, R²: 0.8946271812359279


In [8]:
# Load the real-life dataset 
real_life_data = pd.read_csv("/Users/zone/Desktop/Week2/Day 5/IronKaggle/REAL_DATA.csv")

# Preprocess the data (apply the same steps as the training set)
real_life_data['date'] = pd.to_datetime(real_life_data['date'], dayfirst=True)
real_life_data_encoded = pd.get_dummies(real_life_data, columns=['state_holiday'], drop_first=True)

# Align the real-life dataset with the training dataset columns

# Make sure the real-life dataset has the same columns as in the training set
missing_cols = set(X_train.columns) - set(real_life_data_encoded.columns)
for col in missing_cols:
    real_life_data_encoded[col] = 0  # Add missing columns with default values

# Ensure the order of columns is the same as in the training set
real_life_data_encoded = real_life_data_encoded[X_train.columns]

# Make predictions using the trained model
predicted_sales = best_rf_model.predict(real_life_data_encoded)

# Add predictions to the dataset and save to CSV
real_life_data['sales'] = predicted_sales
real_life_data.to_csv('predicted_sales_real_data.csv', index=False)

print("Predictions saved to 'predicted_sales_real_data.csv'")

Predictions saved to 'predicted_sales_real_data.csv'
