# Multi-Layer Accurate Sales Forecasting
## Future Interns - Task 1 (Production Solution)

This solution uses **XGBoost** with automated hyperparameter tuning and **Cyclical Feature Engineering** to achieve maximum accuracy in sales demand prediction.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
import warnings

# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation/convergence warnings from pandas,
# scikit-learn and xgboost -- consider narrowing the filter when debugging.
warnings.filterwarnings('ignore')
# Apply seaborn's "whitegrid" style to the plots drawn below.
sns.set(style="whitegrid")

### 1. Robust Data Preprocessing

In [None]:
# Locate the dataset: prefer the repo-relative copy, otherwise fall back to
# the absolute path used on the original author's machine.
_repo_copy = os.path.join('..', 'data', 'Online Retail.xlsx')
FILE_PATH = (
    _repo_copy
    if os.path.exists(_repo_copy)
    else r'c:\Users\Admin\OneDrive\Desktop\dataset\Online Retail.xlsx'
)

# Load the transactions and derive the revenue of each invoice line.
df = pd.read_excel(FILE_PATH)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Sales'] = df['Quantity'] * df['UnitPrice']

# Keep only rows with a positive quantity, a positive unit price and a
# non-missing CustomerID.
has_positive_qty = df['Quantity'] > 0
has_positive_price = df['UnitPrice'] > 0
clean_df = df[has_positive_qty & has_positive_price].dropna(subset=['CustomerID'])

# Aggregate to one revenue total per calendar day, indexed by that date.
sales_by_day = clean_df.groupby(clean_df['InvoiceDate'].dt.date)['Sales'].sum()
daily_sales = sales_by_day.reset_index()
daily_sales.columns = ['Date', 'Sales']
daily_sales['Date'] = pd.to_datetime(daily_sales['Date'])
daily_sales = daily_sales.sort_values('Date').set_index('Date')

# Handle Outliers & Gaps
# Cap daily totals at Q3 + 3 * IQR, then reindex to a continuous daily
# calendar; days without any recorded sales become 0.
sales_q1 = daily_sales['Sales'].quantile(0.25)
sales_q3 = daily_sales['Sales'].quantile(0.75)
upper_cap = sales_q3 + 3 * (sales_q3 - sales_q1)
daily_sales['Sales'] = daily_sales['Sales'].clip(upper=upper_cap)
daily_sales = daily_sales.asfreq('D').fillna(0)

daily_sales.plot(figsize=(12, 4), title='Cleaned Sales Baseline')

### 2. Cyclical Feature Engineering
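We use sine/cosine transformations so that the model treats calendar features as cyclical: December is adjacent to January (seasonality), and Sunday is adjacent to Monday. Lagged sales (1, 7 and 30 days back) and a 7-day rolling mean of prior sales are added as well.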

In [None]:
def create_advanced_features(df, lags=(1, 7, 30), rolling_window=7):
    """Build calendar, cyclical, lag and rolling-mean features for daily sales.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index (``.dayofweek`` / ``.month`` must be
        available) and a numeric ``'Sales'`` column. It is not modified.
    lags : iterable of int, default (1, 7, 30)
        For each value ``k`` a column ``lag_k`` holding ``Sales`` shifted by
        ``k`` rows is added.
    rolling_window : int, default 7
        Window length of the rolling mean stored in ``rolling_<window>``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the added columns ``day``, ``month``,
        ``day_sin``, ``day_cos``, ``month_sin``, ``month_cos``, the lag
        columns and the rolling mean. Rows containing any NaN (the warm-up
        rows of the lag/rolling features) are dropped.
    """
    data = df.copy()
    data['day'] = data.index.dayofweek
    data['month'] = data.index.month

    # Cyclical encodings: map each period onto the unit circle so the last
    # value of a cycle sits next to the first one (Sunday/Monday,
    # December/January) instead of at opposite ends of an integer scale.
    day_angle = 2 * np.pi * data['day'] / 7
    data['day_sin'] = np.sin(day_angle)
    data['day_cos'] = np.cos(day_angle)
    month_angle = 2 * np.pi * data['month'] / 12
    data['month_sin'] = np.sin(month_angle)
    data['month_cos'] = np.cos(month_angle)

    for lag in lags:
        data[f'lag_{lag}'] = data['Sales'].shift(lag)

    # Shift by one row first so the rolling mean only uses earlier rows and
    # never includes the value being predicted.
    data[f'rolling_{rolling_window}'] = (
        data['Sales'].shift(1).rolling(rolling_window).mean()
    )
    return data.dropna()

# Build the modelling table, then separate the target from the predictors:
# 'Sales' is the target and every remaining column is used as a feature.
model_data = create_advanced_features(daily_sales)
y = model_data['Sales']
X = model_data.drop(columns='Sales')

### 3. Hyperparameter Tuning & Training

In [None]:
# Hold out the most recent days as a validation set; everything before them
# is used for tuning and training (chronological split, no shuffling).
HOLDOUT_DAYS = 30
if len(X) <= HOLDOUT_DAYS:
    raise ValueError(
        f"Need more than {HOLDOUT_DAYS} rows to hold out a validation set, got {len(X)}"
    )
split = len(X) - HOLDOUT_DAYS
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

# Candidate hyperparameters; the previously hard-coded configuration
# (1000 trees, learning rate 0.05, depth 5, subsample 0.8) is part of the grid.
param_distributions = {
    'n_estimators': [200, 500, 1000],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Randomized search scored by MAE. TimeSeriesSplit keeps every CV fold in
# time order, so a candidate is always validated on data that comes after
# the data it was fitted on.
search = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_distributions=param_distributions,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=TimeSeriesSplit(n_splits=5),
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, y_train)

# With the default refit=True the best configuration is refitted on the
# whole training window and exposed as best_estimator_.
tuned_model = search.best_estimator_
print(f"Best params: {search.best_params_}")

preds = tuned_model.predict(X_test)
print(f"Validation MAE: {mean_absolute_error(y_test, preds):.2f}")

### 4. Forecast Visualization

In [None]:
# Overlay the held-out actual sales and the model's predictions on one axis.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test.index, y_test.values, label='Actual')
ax.plot(y_test.index, preds, label='Tuned XGBoost Output', color='red')
ax.set_title('Final Validation Performance')
ax.legend()
plt.show()