In [11]:
# Read data from the cleaned csv file
import pandas as pd

# Load the cleaned transaction table from disk and preview its first rows.
# NOTE(review): the preview shows a leftover 'index' column, presumably written
# by an earlier reset_index() before export; confirm whether it is still needed.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')
df.head()

Unnamed: 0,index,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,Year,Month,Day,Hour
0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,2010,12,1,8
1,1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,1,8
2,2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,2010,12,1,8
3,3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,1,8
4,4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,1,8


In [13]:
# Preprocessing for forecasting:
# parse the InvoiceDate strings read from the CSV into pandas datetimes so the
# `.dt` accessor can be used in later cells.
df['InvoiceDate'] = df['InvoiceDate'].pipe(pd.to_datetime)

In [15]:
# Sum TotalPrice per calendar date of InvoiceDate, then expose the grouping
# key as a regular column named 'Date'.
daily_sales = (
    df.groupby(df['InvoiceDate'].dt.date)['TotalPrice']
    .sum()
    .reset_index()
    .rename(columns={'InvoiceDate': 'Date'})
)

In [17]:
# Turn the grouped date objects back into datetimes, order the rows
# chronologically and renumber the index from 0.
daily_sales = (
    daily_sales
    .assign(Date=pd.to_datetime(daily_sales['Date']))
    .sort_values('Date')
    .reset_index(drop=True)
)
print(daily_sales)

          Date  TotalPrice
0   2010-12-01    46376.49
1   2010-12-02    47316.53
2   2010-12-03    23921.71
3   2010-12-05    31771.60
4   2010-12-06    31215.64
..         ...         ...
300 2011-12-05    58202.21
301 2011-12-06    46144.04
302 2011-12-07    69354.21
303 2011-12-08    50519.41
304 2011-12-09   184349.28

[305 rows x 2 columns]


In [19]:
# Feature engineering: create lagged TotalPrice features for time-series forecasting with XGBRegressor

In [21]:
# Create lag features (previous sales as input variables)
def create_lag_features(data, lag_days, column='TotalPrice'):
    """Return a new frame with ``Lag_1`` ... ``Lag_<lag_days>`` columns added.

    Each ``Lag_k`` column holds ``data[column]`` shifted down by ``k`` rows,
    so the first ``k`` entries of ``Lag_k`` are missing.  Lags are positional
    (row based), not calendar based: if the frame skips dates, ``Lag_1`` is
    simply the previous row, whatever its date.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.  It is not modified; the lag columns are
        added to a copy.
    lag_days : int
        Number of lag columns to create.  Values below 1 add no columns.
    column : str, default 'TotalPrice'
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the lag columns appended (an existing column
        with the same ``Lag_k`` name is overwritten).
    """
    # Build every shifted series first, then attach them in one `assign` call
    # so the caller's frame is never mutated.
    lagged = {
        f'Lag_{lag}': data[column].shift(lag)
        for lag in range(1, lag_days + 1)
    }
    return data.assign(**lagged)

In [23]:
# Number of lag columns to build (Lag_1 ... Lag_7). The value is passed to
# create_lag_features, which shifts TotalPrice by 1..lag_days rows, so each
# "day" here means one row of daily_sales.
lag_days = 7 

In [25]:
# Attach the Lag_1..Lag_<lag_days> columns to the daily totals and preview the
# result; the leading rows contain NaN where no earlier row exists.
daily_sales = create_lag_features(data=daily_sales, lag_days=lag_days)
daily_sales.head()

Unnamed: 0,Date,TotalPrice,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Lag_6,Lag_7
0,2010-12-01,46376.49,,,,,,,
1,2010-12-02,47316.53,46376.49,,,,,,
2,2010-12-03,23921.71,47316.53,46376.49,,,,,
3,2010-12-05,31771.6,23921.71,47316.53,46376.49,,,,
4,2010-12-06,31215.64,31771.6,23921.71,47316.53,46376.49,,,
