In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [47]:
# Load the datasets
train_df = pd.read_csv('train.csv', header = 0)
test_df = pd.read_csv('test.csv', header = 0)

In [48]:
train_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [49]:
test_df.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [50]:
# Convert 'date' columns to datetime
train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

In [51]:
# Encode the 'family' categorical data
label_encoder = LabelEncoder()
train_df['family_encoded'] = label_encoder.fit_transform(train_df['family'])
test_df['family_encoded'] = label_encoder.transform(test_df['family'])

# Normalize the 'sales' and 'onpromotion' features
sales_scaler = MinMaxScaler()
onpromotion_scaler = MinMaxScaler()
#train_df['sales'] = sales_scaler.fit_transform(train_df[['sales']])
#train_df['onpromotion'] = onpromotion_scaler.fit_transform(train_df[['onpromotion']])
#test_df['onpromotion'] = onpromotion_scaler.transform(test_df[['onpromotion']])

In [55]:
train_df.tail()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,family_encoded
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,28
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,29
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,30
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.0,8,31
3000887,3000887,2017-08-15,9,SEAFOOD,16.0,0,32


In [62]:
# Aggregate data to daily sales per family
daily_train_df = train_df.groupby(['date', 'store_nbr', 'family_encoded']).agg({
    'sales': 'sum',
    'onpromotion': 'mean'
}).reset_index()
daily_test_df = test_df.groupby(['date', 'family_encoded']).agg({
    'onpromotion': 'mean'
}).reset_index()

In [63]:
daily_train_df.head()

Unnamed: 0,date,store_nbr,family_encoded,sales,onpromotion
0,2013-01-01,1,0,0.0,0.0
1,2013-01-01,1,1,0.0,0.0
2,2013-01-01,1,2,0.0,0.0
3,2013-01-01,1,3,0.0,0.0
4,2013-01-01,1,4,0.0,0.0


In [64]:
len(daily_train_df)

3000888

In [65]:
daily_train_df = daily_train_df[daily_train_df["sales"] != 0]

# Display the DataFrame after removing zero sales rows
daily_train_df.head()

Unnamed: 0,date,store_nbr,family_encoded,sales,onpromotion
794,2013-01-01,25,2,2.0,0.0
795,2013-01-01,25,3,810.0,0.0
797,2013-01-01,25,5,180.589,0.0
799,2013-01-01,25,7,186.0,0.0
800,2013-01-01,25,8,143.0,0.0


In [66]:
len(daily_train_df)

2061758

In [67]:
# Features and target for the model
features = daily_train_df[['store_nbr', 'family_encoded', 'onpromotion']]
target = daily_train_df['sales']

In [68]:
# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

In [69]:

len(X_val)

412352

In [70]:
X_train.head()

Unnamed: 0,store_nbr,family_encoded,onpromotion
218368,30,7,0.0
2984528,45,8,26.0
833653,45,7,0.0
2722691,48,26,1.0
1327900,10,13,0.0


In [37]:
"""# Keep the 'date' column separately for error calculation
train_dates = X_train['date']
val_dates = X_val['date']"""

"# Keep the 'date' column separately for error calculation\ntrain_dates = X_train['date']\nval_dates = X_val['date']"

In [None]:
"""# Drop the 'date' column from features for model training
X_train = X_train.drop(columns=['date'])
X_val = X_val.drop(columns=['date'])"""

In [71]:
# Initialize and train the RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)




In [72]:
# Predict on the validation set
val_predictions = rf_model.predict(X_val)

# Calculate MSE
val_mse = mean_squared_error(y_val, val_predictions)

# Calculate RMSE
val_rmse = np.sqrt(val_mse)

# Calculate MAE
val_mae = mean_absolute_error(y_val, val_predictions)

# Print the results
print("Validation MSE:", val_mse)
print("Validation RMSE:", val_rmse)
print("Validation MAE:", val_mae)

Validation MSE: 275498.3743428305
Validation RMSE: 524.8793902820252
Validation MAE: 153.65243551861704


In [74]:
print('The model score is: ',rf_model.score(X_val,y_val))

The model score is:  0.8346827156181701


In [None]:
# Cross-validate the model
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_rmse_scores = np.sqrt(-cv_scores)

# Make predictions on the test set
test_features = daily_test_df[['family_encoded',  'onpromotion']]
test_predictions = rf_model.predict(test_features)

# Reverse the normalization to get actual sales predictions
predicted_sales = sales_scaler.inverse_transform(test_predictions.reshape(-1, 1))
daily_test_df['predicted_sales'] = predicted_sales

# Output results
print("CV RMSE Scores:", cv_rmse_scores)
print("Average CV RMSE:", cv_rmse_scores.mean())
print(daily_test_df.head())

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for RandomForestRegressor
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4]

}

# Initialize RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best parameters to train the RandomForestRegressor
best_rf_model = RandomForestRegressor(**best_params, random_state=42)
best_rf_model.fit(X_train, y_train)

# Cross-validate the model with best parameters
cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_rmse_scores = np.sqrt(-cv_scores)

# Make predictions on the test set with the best model
test_predictions = best_rf_model.predict(test_features)

# Reverse the normalization to get actual sales predictions
predicted_sales = sales_scaler.inverse_transform(test_predictions.reshape(-1, 1))
daily_test_df['predicted_sales'] = predicted_sales

# Output results
print("CV RMSE Scores:", cv_rmse_scores)
print("Average CV RMSE:", cv_rmse_scores.mean())
print(daily_test_df.head())

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
CV RMSE Scores: [0.10349798 0.10836597 0.10337959 0.10413689 0.10725884]
Average CV RMSE: 0.10532785626915164
        date  family_encoded  onpromotion  predicted_sales
0 2017-08-16               0     0.000000       305.057530
1 2017-08-16               1     0.000000         5.711158
2 2017-08-16               2     0.002699     26337.172191
3 2017-08-16               3     0.042635    157951.112463
4 2017-08-16               4     0.000000         3.935230
