In [11]:
# Read data from the cleaned csv file
import pandas as pd

df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,index,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,Year,Month,Day,Hour
0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,2010,12,1,8
1,1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,1,8
2,2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,2010,12,1,8
3,3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,1,8
4,4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,1,8


In [13]:
# Data Preprocessing to Forecast
# Convert InvoiceDate to datetime format
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [15]:
# Aggregate sales by daily total price
daily_sales = df.groupby(df['InvoiceDate'].dt.date)['TotalPrice'].sum().reset_index()
daily_sales.rename(columns={'InvoiceDate': 'Date'}, inplace=True)

In [17]:
# Sort by date
daily_sales['Date'] = pd.to_datetime(daily_sales['Date'])
daily_sales = daily_sales.sort_values(by='Date').reset_index(drop=True)
print(daily_sales)

          Date  TotalPrice
0   2010-12-01    46376.49
1   2010-12-02    47316.53
2   2010-12-03    23921.71
3   2010-12-05    31771.60
4   2010-12-06    31215.64
..         ...         ...
300 2011-12-05    58202.21
301 2011-12-06    46144.04
302 2011-12-07    69354.21
303 2011-12-08    50519.41
304 2011-12-09   184349.28

[305 rows x 2 columns]


In [19]:
#Feature Engineering - Create total price lagged features for time series forecasting with XGBregressor

In [33]:
# Create lag features (previous sales as input variables)
def create_lag_features(data, lag_days):
    for lag in range(1, lag_days + 1):
        data[f'Lag_{lag}'] = data['TotalPrice'].shift(lag)
    return data

In [39]:
# Define the number of lag days
lag_days = 7 

In [41]:
# Create lag features
daily_sales = create_lag_features(daily_sales, lag_days)
daily_sales.head()

Unnamed: 0,Date,TotalPrice,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Lag_6,Lag_7
0,2010-12-01,46376.49,,,,,,,
1,2010-12-02,47316.53,46376.49,,,,,,
2,2010-12-03,23921.71,47316.53,46376.49,,,,,
3,2010-12-05,31771.6,23921.71,47316.53,46376.49,,,,
4,2010-12-06,31215.64,31771.6,23921.71,47316.53,46376.49,,,


In [73]:
#Drop Missing Values from Lag fetaures

In [75]:
# Drop NaN values resulting from lag feature creation
daily_sales = daily_sales.dropna().reset_index(drop=True)
daily_sales

Unnamed: 0,Date,TotalPrice,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Lag_6,Lag_7
0,2010-12-09,38231.90,39248.82,53795.31,31215.64,31771.60,23921.71,47316.53,46376.49
1,2010-12-10,33650.28,38231.90,39248.82,53795.31,31215.64,31771.60,23921.71,47316.53
2,2010-12-12,17305.77,33650.28,38231.90,39248.82,53795.31,31215.64,31771.60,23921.71
3,2010-12-13,27642.68,17305.77,33650.28,38231.90,39248.82,53795.31,31215.64,31771.60
4,2010-12-14,29322.30,27642.68,17305.77,33650.28,38231.90,39248.82,53795.31,31215.64
...,...,...,...,...,...,...,...,...,...
293,2011-12-05,58202.21,20375.96,44713.69,44533.99,41481.23,48851.68,51831.67,17300.96
294,2011-12-06,46144.04,58202.21,20375.96,44713.69,44533.99,41481.23,48851.68,51831.67
295,2011-12-07,69354.21,46144.04,58202.21,20375.96,44713.69,44533.99,41481.23,48851.68
296,2011-12-08,50519.41,69354.21,46144.04,58202.21,20375.96,44713.69,44533.99,41481.23


In [77]:
#Create the train , test split

In [79]:
# Split data into train and test sets (80% train, 20% test)
train_size = int(len(daily_sales) * 0.8)
train, test = daily_sales[:train_size], daily_sales[train_size:]

In [81]:
train.head() #just to see the how train test looks like

Unnamed: 0,Date,TotalPrice,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Lag_6,Lag_7
0,2010-12-09,38231.9,39248.82,53795.31,31215.64,31771.6,23921.71,47316.53,46376.49
1,2010-12-10,33650.28,38231.9,39248.82,53795.31,31215.64,31771.6,23921.71,47316.53
2,2010-12-12,17305.77,33650.28,38231.9,39248.82,53795.31,31215.64,31771.6,23921.71
3,2010-12-13,27642.68,17305.77,33650.28,38231.9,39248.82,53795.31,31215.64,31771.6
4,2010-12-14,29322.3,27642.68,17305.77,33650.28,38231.9,39248.82,53795.31,31215.64


In [87]:
#Model Training

In [95]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Define features and target
X_train = train.drop(columns=['Date', 'TotalPrice'])
y_train = train['TotalPrice']
X_test = test.drop(columns=['Date', 'TotalPrice'])
y_test = test['TotalPrice']

# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

In [97]:
# Initialize XGBoost model
xgb_model = XGBRegressor(objective='reg:squarederror')

In [99]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [103]:
# Get best parameters
best_params = grid_search.best_params_

In [113]:
# Train final model with best parameters
best_xgb_model = XGBRegressor(objective='reg:squarederror', **best_params)
best_xgb_model.fit(X_train, y_train)

# Print model summary
xgb_model

In [117]:
#save the pickeled model
import pickle
pickle.dump(best_xgb_model, open('best_xgb_model.pkl', 'wb'))

In [121]:
# Predict on test set
y_pred = best_xgb_model.predict(X_test)
y_pred

array([27355.312, 25995.9  , 27777.93 , 27429.043, 32512.244, 34571.168,
       24891.053, 46852.055, 41645.945, 35742.574, 32948.434, 25621.258,
       25506.938, 29228.92 , 30465.76 , 28747.121, 27518.854, 27020.938,
       25666.693, 27950.193, 36130.34 , 34220.973, 34576.34 , 29942.445,
       25514.914, 29708.078, 28948.697, 28628.627, 33859.965, 28480.895,
       26022.35 , 29166.578, 31099.285, 30415.523, 35891.367, 36166.977,
       29013.299, 32051.418, 39142.523, 36892.914, 36921.785, 36239.176,
       28542.467, 33442.105, 34218.105, 34389.19 , 36648.566, 26921.006,
       28856.19 , 29066.79 , 34066.227, 35864.22 , 30525.   , 25983.926,
       26305.898, 43516.594, 33324.086, 29240.723, 31947.938, 30455.57 ],
      dtype=float32)