In [None]:
%pip install tensorflow
%pip install tensorflow.keras.models
%pip install tensorflow.keras.layers
%pip install scikeras

In [2]:
import matplotlib.pyplot as plt, numpy as np, pandas as pd, seaborn as sb

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.preprocessing import LabelEncoder, StandardScaler

# NN imports
from sklearn.model_selection import GridSearchCV, cross_val_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor

# ensemble 
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import VotingRegressor

In [None]:
# Pull in the shared datasets and transformations so they can be reused in this notebook
from IPython import get_ipython

# Programmatic form of the `%run datasets.ipynb` line magic: the other notebook is executed
# through the active IPython shell.
# NOTE(review): presumably this is what defines `product_sales` used further down — confirm.
ipython_shell = get_ipython()
ipython_shell.run_line_magic('run', 'datasets.ipynb')

In [4]:
# # COULD BE USED FOR FEATURE ENGINEERING

# # Ensure 'OrderDate' and 'Ship_by_Date' are in datetime format
# merged_data['OrderDate'] = pd.to_datetime(merged_data['OrderDate'])
# merged_data['Ship_by_Date'] = pd.to_datetime(merged_data['Ship_by_Date'])

# # 'Lead_Time' in days
# merged_data['Lead_Time'] = (merged_data['Ship_by_Date'] - merged_data['OrderDate']).dt.days
# # missing values in 'Lead_Time'
# merged_data['Lead_Time'].fillna(merged_data['Lead_Time'].median(), inplace=True)

### Evaluation and parameter grids

### Make Model

In [None]:
import sys
sys.path.append('../src')
from model_utils import * # evaluate_model, param grids and hyperparameter tuning

# Define the features and target variable from 'product_sales'.
# .copy() gives X its own data, so the column assignment below neither triggers pandas'
# SettingWithCopyWarning nor risks writing through to 'product_sales'.
X = product_sales[['ProductNumber', 'order_month']].copy()
y = product_sales['OrderQuantity']

# Convert 'ProductNumber' to integer codes
label_encoder = LabelEncoder()
X['ProductNumber'] = label_encoder.fit_transform(X['ProductNumber'])

# Split BEFORE scaling so the scaler's mean/std are learned from training rows only;
# fitting it on all rows would leak validation statistics into training.
# Same test_size and random_state as before, so the same rows land in each split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features: fit on the training split, then apply the same transform everywhere
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
# Full scaled matrix (row order of X preserved) for the evaluate_model(...) calls below
X_scaled = scaler.transform(X)

In [None]:
# Hyperparameter search for the Random Forest Regressor on the scaled training split.
# The grid is looked up by the estimator's class name; the result is later unpacked as
# keyword arguments when the forest is rebuilt.
# NOTE(review): the search strategy lives in model_utils and is not visible here.
rf = RandomForestRegressor()
rf_grid = param_grids(type(rf).__name__)
rf_params = find_best_hyperparameters(rf, rf_grid, X_train_scaled, y_train)

In [10]:
# Rebuild the forest with the tuned hyperparameters and fit it on the training split
# (fit returns the estimator itself, so construction and fitting can be chained).
rf = RandomForestRegressor(**rf_params).fit(X_train_scaled, y_train)

# Predictions for the held-out validation rows
y_pred_rf = rf.predict(X_val_scaled)

In [None]:
# Report evaluation metrics for the tuned Random Forest, passing the full scaled feature
# matrix and target to evaluate_model (a model_utils helper).
# NOTE(review): which metrics it prints and whether it cross-validates are not visible
# here — confirm in ../src/model_utils.py.
evaluate_model(rf, X_scaled, y)
# Visual separator between model reports in the cell output
print('--------------------------------------------')

In [12]:
# # train the model with scaled features (Decision Tree Regressor)
# dt = DecisionTreeRegressor()
# dt_params = find_best_hyperparameters(dt, param_grids(dt), X_train_scaled, y_train)

In [13]:
# dt = DecisionTreeRegressor(**dt_params)
# dt.fit(X_train_scaled, y_train)
# y_pred_dt = dt.predict(X_val_scaled)

In [14]:
# evaluate_model(dt, X_scaled, y)
# print('--------------------------------------------')

In [15]:
# # train the model with scaled features (Linear Regression)
# lr = LinearRegression()
# lr_params = find_best_hyperparameters(lr, param_grids(lr), X_train_scaled, y_train)

In [16]:
# lr = LinearRegression(**lr_params)
# lr.fit(X_train_scaled, y_train)
# y_pred_lr = lr.predict(X_val_scaled)

In [17]:
# evaluate_model(lr, X_scaled, y)
# print('--------------------------------------------')

In [None]:
# NOTE: the NN's metrics are not better than the base models'; the cell is kept active because create_nn_model and nn_params are reused by the voting ensemble below.

# Define the neural network model
def create_nn_model(input_shape):
    """Build and compile a small fully connected regression network.

    Args:
        input_shape: Number of input features (an int); used as the width of the
            input layer.

    Returns:
        A compiled Sequential model with layers Dense(64, relu) -> Dense(32, relu)
        -> Dense(1, linear), using the Adam optimizer, mean-squared-error loss and
        mean-absolute-error as an extra metric.
    """
    model = Sequential()
    # Declare the input with an explicit Input layer: passing `input_dim` to the first
    # Dense layer is the legacy form and emits a warning under Keras 3.
    model.add(tf.keras.Input(shape=(input_shape,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    # Single linear unit: unbounded real-valued output for regression
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model

# Wrap the Keras model with KerasRegressor so it behaves like a scikit-learn estimator.
# `model=` is the current name of the builder argument (`build_fn` is deprecated);
# `input_shape` is forwarded to create_nn_model because it matches its parameter name.
input_shape = X_train_scaled.shape[1]
nn_model = KerasRegressor(model=create_nn_model, input_shape=input_shape, verbose=1)

# Perform GridSearchCV over the grid registered for this wrapper class
grid_search = GridSearchCV(estimator=nn_model, param_grid=param_grids(nn_model.__class__.__name__), cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
nn_params = grid_search.best_params_
print(f'Best Parameters: {nn_params}')

# Set the best parameters on the model (this already applies any tuned
# epochs / batch_size, so they do not need to be passed to fit() again)
nn_model.set_params(**nn_params)

# Cross-validated scores of the tuned configuration on the training split
scores = cross_val_score(nn_model, X_train_scaled, y_train, cv=5)
print(scores)

# Train the model. Only validation_split is passed at fit time so it is not stored on
# the estimator and therefore does not carry over to clones made by later evaluation.
nn_model.fit(X_train_scaled, y_train, validation_split=0.2)
# fit() returns the estimator itself; the per-epoch training history is kept on `history_`
history = nn_model.history_

# Predict using the neural network model
y_pred_nn = nn_model.predict(X_val_scaled)

# The evaluation is performed once, in the next cell, rather than twice in a row.

In [None]:
# Report evaluation metrics for the tuned neural network, passing the full scaled feature
# matrix and target to evaluate_model (a model_utils helper).
# NOTE(review): which metrics it prints and whether it refits/cross-validates the model are
# not visible here — confirm in ../src/model_utils.py.
evaluate_model(nn_model, X_scaled, y)
# Visual separator between model reports in the cell output
print('--------------------------------------------')

In [None]:
# Two side-by-side panels for the Random Forest: actual-vs-predicted bars and residuals
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
ax_compare, ax_resid = axes

# Product numbers of the validation rows, shared as the x-axis of both panels
val_products = product_sales.loc[y_val.index, 'ProductNumber']

# Left panel: overlay actual and predicted order quantities per product number
for heights, series_label in ((y_val, 'Actual'), (y_pred_rf, 'Predicted')):
    ax_compare.bar(val_products, heights, label=series_label, alpha=0.6)
ax_compare.set(
    xlabel='Product Number',
    ylabel='Order Quantity',
    title='Actual vs Predicted Order Quantity',
    ylim=(0, 2250000),
)
ax_compare.legend()

# Right panel: residuals (actual minus predicted) around a zero reference line
residuals = y_val - y_pred_rf
ax_resid.scatter(val_products, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set(xlabel='Product Number', ylabel='Residuals', title='Residual Plot')

# Render the figure
plt.tight_layout()
plt.show()

### Ensemble Methods

In [None]:
# Bagging ensemble of 10 tuned Random Forests, fitted on the scaled training split
# (fit returns the estimator itself, so construction and fitting are chained).
bagging_rf = BaggingRegressor(
    estimator=RandomForestRegressor(**rf_params),
    n_estimators=10,
    random_state=42,
).fit(X_train_scaled, y_train)

# Predictions for the held-out validation rows
y_pred_bagging_rf = bagging_rf.predict(X_val_scaled)

# Report metrics for the ensemble, followed by a separator line in the output
evaluate_model(bagging_rf, X_scaled, y)
print('--------------------------------------------')

In [32]:
# bagging_nn = BaggingRegressor(estimator=KerasRegressor(build_fn=create_nn_model, input_shape=input_shape, **nn_params), n_estimators=10, random_state=42)
# bagging_nn.fit(X_train_scaled, y_train)
# y_pred_bagging_nn = bagging_nn.predict(X_val_scaled)

# evaluate_model(bagging_nn, X_scaled, y)
# print('--------------------------------------------')

NN Bagging: 
Mean Absolute Error (MAE): 145788.9249
Mean Squared Error (MSE): 117073044061.8419
Root Mean Squared Error (RMSE): 342159.3840
R-squared (R²): 0.0013
--------------------------------------------

In [None]:
# Voting ensemble that averages the predictions of the tuned Random Forest and the tuned
# neural network. VotingRegressor is imported in the notebook's top import cell, next to
# the other ensemble imports, instead of mid-notebook.
voting_members = [
    ('rf', RandomForestRegressor(**rf_params)),
    # `model=` is the current name of the builder argument (`build_fn` is deprecated)
    ('nn', KerasRegressor(model=create_nn_model, input_shape=input_shape, **nn_params)),
]
rf_nn_voting = VotingRegressor(estimators=voting_members)
rf_nn_voting.fit(X_train_scaled, y_train)

# Predictions for the held-out validation rows
y_pred_rf_nn_voting = rf_nn_voting.predict(X_val_scaled)

# Report metrics for the ensemble, followed by a separator line in the output
evaluate_model(rf_nn_voting, X_scaled, y)
print('--------------------------------------------')

# Voting Ensemble

Mean Absolute Error (MAE): 134711.8210
Mean Squared Error (MSE): 116899234027.1837
Root Mean Squared Error (RMSE): 341905.2998
R-squared (R²): 0.0028
--------------------------------------------

In [None]:
# Two side-by-side panels for the RF + NN voting ensemble
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
bar_ax, scatter_ax = axes[0], axes[1]

# Product numbers of the validation rows, used as the x-axis in both panels
x_products = product_sales.loc[y_val.index, 'ProductNumber']

# Left panel: actual order quantities with the ensemble's predictions drawn on top
bar_style = {'alpha': 0.6}
bar_ax.bar(x_products, y_val, label='Actual', **bar_style)
bar_ax.bar(x_products, y_pred_rf_nn_voting, label='Predicted', **bar_style)
bar_ax.set(
    xlabel='Product Number',
    ylabel='Order Quantity',
    title='Actual vs Predicted Order Quantity',
    ylim=(0, 2250000),
)
bar_ax.legend()

# Right panel: residuals (actual minus predicted) with a dashed zero baseline
residuals = y_val - y_pred_rf_nn_voting
scatter_ax.scatter(x_products, residuals, alpha=0.6)
scatter_ax.axhline(y=0, color='r', linestyle='--')
scatter_ax.set(xlabel='Product Number', ylabel='Residuals', title='Residual Plot')

# Render the figure
plt.tight_layout()
plt.show()