In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('../data/processed/sales_data_processed.csv')

In [None]:
df.groupby('Date')['Sales'].sum().plot(figsize=(15, 5))
plt.title('Total Sales Over Time')
plt.ylabel('Sales')
plt.show()

In [None]:
store_types = [col for col in df.columns if 'StoreType_' in col]
df.groupby(store_types)['Sales'].mean().plot(kind='bar')
plt.title('Average Sales by Store Type')
plt.ylabel('Sales')
plt.show()

In [None]:
import pandas as pd
df = pd.read_csv('../data/processed/sales_data_processed.csv')
print(df.head())

In [None]:
print(df.dtypes)


In [None]:
print(df.isnull().sum())


In [None]:
print(df.describe())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.histplot(df['Sales'], kde=True)
plt.title('Distribution of Sales')
plt.show()


In [None]:
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
df['Sales'].plot(figsize=(15, 5))
plt.title('Sales Over Time')
plt.ylabel('Sales')
plt.show()


In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

result = seasonal_decompose(df['Sales'], model='additive', period=365)
result.plot()
plt.show()


In [None]:
sns.boxplot(x='DayOfWeek', y='Sales', data=df.reset_index())
plt.title('Sales by Day of Week')
plt.show()


In [None]:
sns.boxplot(x='StoreType', y='Sales', data=df.reset_index())
plt.title('Sales by Store Type')
plt.show()


In [None]:
sns.boxplot(x='StateHoliday', y='Sales', data=df.reset_index())
plt.title('Sales on State Holidays')
plt.show()


In [None]:
corr = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


In [None]:
sns.scatterplot(x='Customers', y='Sales', data=df.reset_index())
plt.title('Sales vs. Customers')
plt.show()


In [None]:
sns.scatterplot(x='CompetitionDistance', y='Sales', data=df.reset_index())
plt.title('Sales vs. Competition Distance')
plt.show()


In [None]:
pd.plotting.autocorrelation_plot(df['Sales'])
plt.title('Autocorrelation of Sales')
plt.show()


In [None]:
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(df['Sales'], lags=30)
plt.title('Partial Autocorrelation of Sales')
plt.show()


In [None]:
for lag in range(1, 8):
    df[f'Lag_{lag}'] = df['Sales'].shift(lag)
df.dropna(inplace=True)
lag_corr = df[[f'Lag_{lag}' for lag in range(1, 8)] + ['Sales']].corr()
print(lag_corr['Sales'])


In [None]:
sns.boxplot(y='Sales', data=df.reset_index())
plt.title('Boxplot of Sales')
plt.show()


In [2]:
# Import necessary libraries
import pandas as pd

# Load the processed data
df = pd.read_csv('../data/processed/sales_data_processed.csv')

# Ensure the 'Date' column is in datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Sort the data by 'Store' and 'Date' to maintain chronological order
df.sort_values(by=['Store', 'Date'], inplace=True)

# Create lag features for the previous 1, 2, and 3 days
df['Sales_Lag_1'] = df.groupby('Store')['Sales'].shift(1)
df['Sales_Lag_2'] = df.groupby('Store')['Sales'].shift(2)
df['Sales_Lag_3'] = df.groupby('Store')['Sales'].shift(3)

# Drop rows with missing values created by shifting
df.dropna(inplace=True)

# Save the new dataframe with lag features
df.to_csv('../data/processed/sales_data_with_lags.csv', index=False)

# Display the first few rows to verify
df.head()


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,...,StoreType_d,Assortment_b,Assortment_c,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,Sales_Lag_1,Sales_Lag_2,Sales_Lag_3
839886,1,5,2013-01-05,4997,635,1,0,1,1270.0,9.0,...,False,False,False,True,False,False,False,4486.0,4327.0,5530.0
838761,1,0,2013-01-07,7176,785,1,1,1,1270.0,9.0,...,False,False,False,True,False,False,False,4997.0,4486.0,4327.0
837656,1,1,2013-01-08,5580,654,1,1,1,1270.0,9.0,...,False,False,False,True,False,False,False,7176.0,4997.0,4486.0
836551,1,2,2013-01-09,5471,626,1,1,1,1270.0,9.0,...,False,False,False,True,False,False,False,5580.0,7176.0,4997.0
835446,1,3,2013-01-10,4892,615,1,1,1,1270.0,9.0,...,False,False,False,True,False,False,False,5471.0,5580.0,7176.0


In [3]:
# Select the features for modeling
features = [
    'Store', 'DayOfWeek', 'Promo', 'SchoolHoliday', 'CompetitionDistance',
    'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
    'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
    'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_b', 'Assortment_c',
    'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c',
    'Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3'
]

# Define the target variable
target = 'Sales'

# Create the feature matrix X and target vector y
X = df[features]
y = df[target]


In [4]:
from sklearn.model_selection import train_test_split

# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
import joblib

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, '../src/models/random_forest_model.pkl')


['../src/models/random_forest_model.pkl']

In [15]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

# Print the results
print('Mean Absolute Error (MAE):', mae)
print('Root Mean Squared Error (RMSE):', rmse)


Mean Absolute Error (MAE): 795.1872551560948
Root Mean Squared Error (RMSE): 1179.3752098443258


In [17]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Save the best model
joblib.dump(best_model, '../src/models/best_random_forest_model.pkl')

# Evaluate the best model
y_pred_best = best_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)
rmse_best = root_mean_squared_error(y_test, y_pred_best)

print('Best Model MAE:', mae_best)
print('Best Model RMSE:', rmse_best)


KeyboardInterrupt: 

In [20]:
# Get feature importances from the model
importances = model.feature_importances_

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

# Sort the DataFrame by importance
feature_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Display the feature importances
print(feature_importance_df)

                      Feature  Importance
21                Sales_Lag_1    0.561619
22                Sales_Lag_2    0.107008
1                   DayOfWeek    0.077011
2                       Promo    0.069010
23                Sales_Lag_3    0.044708
12                        Day    0.027238
4         CompetitionDistance    0.024247
11                      Month    0.019135
0                       Store    0.017959
6    CompetitionOpenSinceYear    0.008629
5   CompetitionOpenSinceMonth    0.007374
8             Promo2SinceWeek    0.006974
15                StoreType_d    0.005015
9             Promo2SinceYear    0.004896
10                       Year    0.004160
14                StoreType_c    0.003942
3               SchoolHoliday    0.003236
17               Assortment_c    0.002980
13                StoreType_b    0.002409
18             StateHoliday_a    0.000789
7                      Promo2    0.000756
16               Assortment_b    0.000597
19             StateHoliday_b    0

In [21]:
# Ensure data is sorted by 'Store' and 'Date'
df.sort_values(by=['Store', 'Date'], inplace=True)

# Create rolling mean features for a 7-day window
df['Rolling_Mean_7'] = df.groupby('Store')['Sales'].transform(lambda x: x.shift(1).rolling(window=7).mean())

# Create rolling mean features for a 14-day window
df['Rolling_Mean_14'] = df.groupby('Store')['Sales'].transform(lambda x: x.shift(1).rolling(window=14).mean())

# Drop rows with missing values
df.dropna(inplace=True)

# Update the features list
features.extend(['Rolling_Mean_7', 'Rolling_Mean_14'])

# Update X and y
X = df[features]
y = df[target]


In [22]:
import numpy as np

# Add sine and cosine transformations to capture seasonality
df['DayOfYear'] = df['Date'].dt.dayofyear
df['Sin_DayOfYear'] = np.sin(2 * np.pi * df['DayOfYear'] / 365.25)
df['Cos_DayOfYear'] = np.cos(2 * np.pi * df['DayOfYear'] / 365.25)

# Update the features list
features.extend(['Sin_DayOfYear', 'Cos_DayOfYear'])

# Update X and y
X = df[features]
y = df[target]


In [23]:
# Interaction between Promo and SchoolHoliday
df['Promo_SchoolHoliday'] = df['Promo'] * df['SchoolHoliday']

# Update the features list
features.append('Promo_SchoolHoliday')

# Update X and y
X = df[features]
y = df[target]


In [24]:
import lightgbm as lgb

# Prepare the dataset for LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define parameters
params = {
    'objective': 'regression',
    'metric': 'mae',
    'verbosity': -1,
    'seed': 42
}

# Train the model
lgbm_model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, test_data],
    early_stopping_rounds=50
)

# Save the model
lgbm_model.save_model('models/lightgbm_model.txt')

# Make predictions
y_pred_lgbm = lgbm_model.predict(X_test)

# Evaluate the model
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
rmse_lgbm = root_mean_squared_error(y_test, y_pred_lgbm)

print('LightGBM MAE:', mae_lgbm)
print('LightGBM RMSE:', rmse_lgbm)


TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

In [25]:
from prophet import Prophet

# Prepare the data
prophet_df = df[['Date', 'Sales']].rename(columns={'Date': 'ds', 'Sales': 'y'})

# Initialize the model
prophet_model = Prophet()

# Fit the model
prophet_model.fit(prophet_df)

# Make future dataframe
future = prophet_model.make_future_dataframe(periods=90)  # Predict 90 days into the future

# Predict
forecast = prophet_model.predict(future)

# Evaluate the model on the test set
test_df = prophet_df[prophet_df['ds'].isin(X_test['Date'])]
y_pred_prophet = forecast[forecast['ds'].isin(X_test['Date'])]['yhat']
y_test_prophet = test_df['y']

# Calculate metrics
mae_prophet = mean_absolute_error(y_test_prophet, y_pred_prophet)
rmse_prophet = root_mean_squared_error(y_test_prophet, y_pred_prophet)

print('Prophet MAE:', mae_prophet)
print('Prophet RMSE:', rmse_prophet)


  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


KeyboardInterrupt: 

In [27]:
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb

# Define the parameter grid
param_grid = {
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 500, 1000],
    'max_depth': [ -1, 10, 20],
    'min_child_samples': [20, 30, 50]
}

# Initialize the model
lgbm_regressor = lgb.LGBMRegressor(objective='regression', random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgbm_regressor,
    param_distributions=param_grid,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Perform the search
random_search.fit(X_train, y_train)

# Best model
best_lgbm_model = random_search.best_estimator_

# Save the best model
best_lgbm_model.booster_.save_model('models/best_lightgbm_model.txt')

# Evaluate the best model
y_pred_best_lgbm = best_lgbm_model.predict(X_test)
mae_best_lgbm = mean_absolute_error(y_test, y_pred_best_lgbm)
rmse_best_lgbm = root_mean_squared_error(y_test, y_pred_best_lgbm)

print('Best LightGBM MAE:', mae_best_lgbm)
print('Best LightGBM RMSE:', rmse_best_lgbm)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: 

In [28]:
# Add a 7-day lag to capture weekly seasonality
df['Sales_Lag_7'] = df.groupby('Store')['Sales'].shift(7)

# Add a 14-day lag to capture bi-weekly patterns
df['Sales_Lag_14'] = df.groupby('Store')['Sales'].shift(14)

# Update the features list
features.extend(['Sales_Lag_7', 'Sales_Lag_14'])

# Drop rows with missing values due to new lag features
df.dropna(inplace=True)

# Update X and y
X = df[features]
y = df[target]


In [29]:
# Add a 3-day rolling mean
df['Rolling_Mean_3'] = df.groupby('Store')['Sales'].transform(
    lambda x: x.shift(1).rolling(window=3).mean()
)

# Update the features list
features.append('Rolling_Mean_3')

# Drop rows with missing values
df.dropna(inplace=True)

# Update X and y
X = df[features]
y = df[target]


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print('Updated MAE:', mae)
print('Updated RMSE:', rmse)


Updated MAE: 648.0547438711358
Updated RMSE: 961.7145130523978


In [31]:
# Select the first 100 stores
sample_stores = df['Store'].unique()[:100]

# Filter the dataframe
df_sample = df[df['Store'].isin(sample_stores)]

# Proceed with df_sample instead of df
X = df_sample[features]
y = df_sample[target]


In [32]:
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)


                      Feature  Importance
25            Rolling_Mean_14    0.522605
21                Sales_Lag_1    0.158411
2                       Promo    0.113385
1                   DayOfWeek    0.056310
27              Cos_DayOfYear    0.020801
12                        Day    0.018809
31             Rolling_Mean_3    0.011992
24             Rolling_Mean_7    0.011691
29                Sales_Lag_7    0.009617
4         CompetitionDistance    0.009563
30               Sales_Lag_14    0.008993
22                Sales_Lag_2    0.007460
0                       Store    0.007336
26              Sin_DayOfYear    0.007137
23                Sales_Lag_3    0.006852
11                      Month    0.004406
6    CompetitionOpenSinceYear    0.004074
5   CompetitionOpenSinceMonth    0.003419
8             Promo2SinceWeek    0.003147
15                StoreType_d    0.003011
14                StoreType_c    0.002361
9             Promo2SinceYear    0.002338
17               Assortment_c    0

In [33]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(
    model, X, y, cv=tscv, scoring='neg_mean_absolute_error', n_jobs=-1
)
mean_mae = -scores.mean()
print('Cross-Validated MAE:', mean_mae)


Cross-Validated MAE: 796.0114150112211


In [34]:
# Add lag features up to 30 days
for lag in range(1, 31):
    df[f'Sales_Lag_{lag}'] = df.groupby('Store')['Sales'].shift(lag)

# Drop rows with missing values
df.dropna(inplace=True)

# Update the features list
lag_features = [f'Sales_Lag_{lag}' for lag in range(1, 31)]
features.extend(lag_features)

# Update X and y
X = df[features]
y = df[target]


In [35]:
# Rolling windows of 7, 14, 30 days
window_sizes = [7, 14, 30]
for window in window_sizes:
    df[f'Rolling_Mean_{window}'] = df.groupby('Store')['Sales'].transform(
        lambda x: x.shift(1).rolling(window=window).mean()
    )
    df[f'Rolling_Std_{window}'] = df.groupby('Store')['Sales'].transform(
        lambda x: x.shift(1).rolling(window=window).std()
    )

# Drop rows with missing values
df.dropna(inplace=True)

# Update the features list
for window in window_sizes:
    features.extend([f'Rolling_Mean_{window}', f'Rolling_Std_{window}'])

# Update X and y
X = df[features]
y = df[target]


In [36]:
# Add EWMA feature
df['EWMA_12'] = df.groupby('Store')['Sales'].transform(
    lambda x: x.shift(1).ewm(span=12, adjust=False).mean()
)

# Update the features list
features.append('EWMA_12')

# Drop rows with missing values
df.dropna(inplace=True)

# Update X and y
X = df[features]
y = df[target]


In [37]:
# Cumulative sales per store
df['Cumulative_Sales'] = df.groupby('Store')['Sales'].cumsum()

# Update the features list
features.append('Cumulative_Sales')

# Update X and y
X = df[features]
y = df[target]


In [56]:
import lightgbm as lgb
from lightgbm import early_stopping

# Split the data (if not already split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set parameters
params = {
    'objective': 'regression',
    'metric': 'mae',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1,
    'random_state': 42
}

# Define early stopping
callbacks = [early_stopping(stopping_rounds=50)]

# Train the model
lgbm_model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, test_data],
    callbacks=callbacks
)

# Make predictions
y_pred_lgbm = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)

# Evaluate the model
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
rmse_lgbm = root_mean_squared_error(y_test, y_pred_lgbm)

print('LightGBM MAE:', mae_lgbm)
print('LightGBM RMSE:', rmse_lgbm)


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 499.882	valid_1's l1: 532.42
LightGBM MAE: 532.4202688559542
LightGBM RMSE: 776.6612915362568


In [9]:
import pandas as pd

# Load the processed data
df = pd.read_csv('../data/processed/sales_data_processed.csv')

# Ensure the 'Date' column is in datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Sort the data by 'Store' and 'Date' to maintain chronological order
df.sort_values(by=['Store', 'Date'], inplace=True)

# Create lag features for previous days
df['Sales_Lag_1'] = df.groupby('Store')['Sales'].shift(1)
df['Sales_Lag_2'] = df.groupby('Store')['Sales'].shift(2)
df['Sales_Lag_3'] = df.groupby('Store')['Sales'].shift(3)
df['Sales_Lag_7'] = df.groupby('Store')['Sales'].shift(7)
df['Sales_Lag_14'] = df.groupby('Store')['Sales'].shift(14)

# Rolling mean features
df['Rolling_Mean_3'] = df.groupby('Store')['Sales'].shift(1).rolling(window=3).mean()
df['Rolling_Mean_7'] = df.groupby('Store')['Sales'].shift(1).rolling(window=7).mean()
df['Rolling_Mean_14'] = df.groupby('Store')['Sales'].shift(1).rolling(window=14).mean()
df['Rolling_Mean_30'] = df.groupby('Store')['Sales'].shift(1).rolling(window=30).mean()

# Rolling standard deviation features
df['Rolling_Std_7'] = df.groupby('Store')['Sales'].shift(1).rolling(window=7).std()
df['Rolling_Std_14'] = df.groupby('Store')['Sales'].shift(1).rolling(window=14).std()
df['Rolling_Std_30'] = df.groupby('Store')['Sales'].shift(1).rolling(window=30).std()

# EWMA feature
df['EWMA_12'] = df.groupby('Store')['Sales'].shift(1).ewm(span=12, adjust=False).mean()

# Cumulative sales per store (excluding current day)
df['Cumulative_Sales'] = df.groupby('Store')['Sales'].cumsum() - df['Sales']



In [11]:
import numpy as np

df['DayOfYear'] = df['Date'].dt.dayofyear
df['Sin_DayOfYear'] = np.sin(2 * np.pi * df['DayOfYear'] / 365.25)
df['Cos_DayOfYear'] = np.cos(2 * np.pi * df['DayOfYear'] / 365.25)
df['Promo_SchoolHoliday'] = df['Promo'] * df['SchoolHoliday']

# Drop rows with any missing values
df.dropna(inplace=True)

# List of features
features = [
    'Store', 'DayOfWeek', 'Promo', 'SchoolHoliday', 'CompetitionDistance',
    'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
    'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
    'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_b', 'Assortment_c',
    'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c',
    'Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3', 'Sales_Lag_7', 'Sales_Lag_14',
    'Rolling_Mean_3', 'Rolling_Mean_7', 'Rolling_Mean_14', 'Rolling_Mean_30',
    'Rolling_Std_7', 'Rolling_Std_14', 'Rolling_Std_30',
    'EWMA_12', 'Cumulative_Sales',
    'Sin_DayOfYear', 'Cos_DayOfYear',
    'Promo_SchoolHoliday'
]

target = 'Sales'

# Create the feature matrix X and target vector y
X = df[features]
y = df[target]


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


In [13]:
from catboost import CatBoostRegressor
import numpy as np
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

# Identify categorical features by their indices
categorical_features_indices = np.where(X.dtypes == 'object')[0]

# Initialize CatBoostRegressor
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='MAE',
    random_seed=42,
    early_stopping_rounds=50,
    verbose=False
)

# Train the model
cat_model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    cat_features=categorical_features_indices,
    use_best_model=True
)

# Make predictions
y_pred_cat = cat_model.predict(X_test)

# Evaluate the model
mae_cat = mean_absolute_error(y_test, y_pred_cat)
rmse_cat = root_mean_squared_error(y_test, y_pred_cat)

print('CatBoost MAE:', mae_cat)
print('CatBoost RMSE:', rmse_cat)


CatBoost MAE: 607.5347114512367
CatBoost RMSE: 889.5534131662988


In [24]:
import optuna
import lightgbm as lgb
from lightgbm import early_stopping

def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'mae',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42
    }
    lgbm_train = lgb.Dataset(X_train, label=y_train)

    # Define the early stopping callback
    callbacks = [early_stopping(stopping_rounds=50)]

    cv_results = lgb.cv(
        params,
        lgbm_train,
        nfold=3,
        num_boost_round=1000,
        callbacks=callbacks,
        stratified=False,
        metrics='mae',
        seed=42
    )
    return min(cv_results['mae-mean'])

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print('Best trial:', study.best_trial.params)


[I 2024-11-26 01:45:30,336] A new study created in memory with name: no-name-4782615a-70a0-49db-bd74-e65cca3d82a3


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4732
[LightGBM] [Info] Number of data points in the train set: 432472, number of used features: 38
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014581 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4732
[LightGBM] [Info] Number of data points in the train set: 432472, number of used features: 38
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4732
[LightGBM] [Info] Number of data points in the train set: 432472, number of used features: 38
[L

[W 2024-11-26 01:46:10,674] Trial 0 failed with parameters: {'learning_rate': 0.03278340675495843, 'num_leaves': 33, 'max_depth': 9, 'min_child_samples': 23, 'subsample': 0.5806422266523936, 'colsample_bytree': 0.5303104898202871} because of the following error: KeyError('mae-mean').
Traceback (most recent call last):
  File "d:\GitHub\retailiq\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\haoju\AppData\Local\Temp\ipykernel_21744\3011141253.py", line 32, in objective
    return min(cv_results['mae-mean'])
               ~~~~~~~~~~^^^^^^^^^^^^
KeyError: 'mae-mean'
[W 2024-11-26 01:46:10,677] Trial 0 failed with value None.


Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid l1: 565.666 + 0.490154


KeyError: 'mae-mean'

In [18]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model
model.fit(X_train, y_train)


In [26]:
# Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Define the early stopping callback
callbacks = [early_stopping(stopping_rounds=50)]

# LightGBM
import lightgbm as lgb
params = {
    'objective': 'regression',
    'metric': 'mae',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1,
    'random_state': 42
}

lgbm_train = lgb.Dataset(X_train, label=y_train)
lgbm_model = lgb.train(
    params,
    lgbm_train,
    num_boost_round=1000,
    valid_sets=[lgbm_train],
    callbacks=callbacks
)

# CatBoost
from catboost import CatBoostRegressor
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='MAE',
    random_seed=42,
    early_stopping_rounds=50,
    verbose=False
)
cat_model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    cat_features=categorical_features_indices,
    use_best_model=True
)


KeyboardInterrupt: 

In [None]:
import lightgbm as lgb

# Assume you have predictions from Random Forest, LightGBM, and CatBoost
y_pred_rf = model.predict(X_test)
y_pred_lgbm = lgbm_model.predict(X_test)
y_pred_cat = cat_model.predict(X_test)

# Average predictions
y_pred_ensemble = (y_pred_rf + y_pred_lgbm + y_pred_cat) / 3

# Evaluate the ensemble
mae_ensemble = mean_absolute_error(y_test, y_pred_ensemble)
rmse_ensemble = root_mean_squared_error(y_test, y_pred_ensemble)

print('Ensemble MAE:', mae_ensemble)
print('Ensemble RMSE:', rmse_ensemble)


NameError: name 'lgbm_model' is not defined

In [46]:
from sklearn.linear_model import LinearRegression

# Create a DataFrame with base model predictions
stacked_predictions = pd.DataFrame({
    'rf': y_pred_rf,
    'lgbm': y_pred_lgbm,
    'cat': y_pred_cat
})

# Train a meta-model
meta_model = LinearRegression()
meta_model.fit(stacked_predictions, y_test)

# Make meta-model predictions
y_pred_meta = meta_model.predict(stacked_predictions)

# Evaluate the meta-model
mae_meta = mean_absolute_error(y_test, y_pred_meta)
rmse_meta = root_mean_squared_error(y_test, y_pred_meta)

print('Stacked Model MAE:', mae_meta)
print('Stacked Model RMSE:', rmse_meta)


NameError: name 'y_pred_rf' is not defined

In [48]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

# Select features and target
features_for_lstm = ['Sales']  # Include necessary features
df_lstm = df[['Date', 'Store'] + features_for_lstm]

# Scale data
scaler = MinMaxScaler()
df_lstm[features_for_lstm] = scaler.fit_transform(df_lstm[features_for_lstm])

# Prepare sequences
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:(i + seq_length)])
        y.append(data[i + seq_length])
    return np.array(X), np.array(y)

seq_length = 30
X_lstm, y_lstm = create_sequences(df_lstm['Sales'].values, seq_length)

# Split into training and testing sets
split = int(0.8 * len(X_lstm))
X_train_lstm, X_test_lstm = X_lstm[:split], X_lstm[split:]
y_train_lstm, y_test_lstm = y_lstm[:split], y_lstm[split:]

# Reshape input data to (samples, time steps, features)
X_train_lstm = X_train_lstm.reshape((X_train_lstm.shape[0], seq_length, 1))
X_test_lstm = X_test_lstm.reshape((X_test_lstm.shape[0], seq_length, 1))


ValueError: Name tf.RaggedTensorSpec has already been registered for class tensorflow.python.ops.ragged.ragged_tensor.RaggedTensorSpec.

In [49]:
# Build the model
model_lstm = Sequential()
model_lstm.add(LSTM(50, activation='relu', input_shape=(seq_length, 1)))
model_lstm.add(Dense(1))
model_lstm.compile(optimizer='adam', loss='mae')

# Train the model
model_lstm.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=64, validation_split=0.1)

# Make predictions
y_pred_lstm = model_lstm.predict(X_test_lstm)

# Inverse transform the predictions and actual values
y_pred_lstm = scaler.inverse_transform(y_pred_lstm)
y_test_lstm = scaler.inverse_transform(y_test_lstm.reshape(-1, 1))

# Evaluate the model
mae_lstm = mean_absolute_error(y_test_lstm, y_pred_lstm)
rmse_lstm = root_mean_squared_error(y_test_lstm, y_pred_lstm)

print('LSTM MAE:', mae_lstm)
print('LSTM RMSE:', rmse_lstm)


NameError: name 'Sequential' is not defined