# Sales Prediction Model using XGBoost

In this notebook, we'll build a sales prediction model using XGBoost. We'll:
1. Load and preprocess the data
2. Prepare features and target
3. Train the XGBoost model
4. Evaluate model performance

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import plotly.express as px

In [None]:
# Columns removed right after loading: identifier columns and the postal code
columns_to_drop = ['Row ID', 'Order ID', 'Customer ID', 'Postal Code']

# Read the cleaned dataset and discard those columns in one step
df = pd.read_csv('../data/cleaned_superstore_sales.csv').drop(columns=columns_to_drop)

# Parse both date columns to datetime. format='mixed' lets pandas infer the
# format of each value individually; dayfirst=True resolves ambiguous dates
# as day/month rather than month/day.
date_columns = ['Order Date', 'Ship Date']
df = df.assign(**{
    name: pd.to_datetime(df[name], format='mixed', dayfirst=True)
    for name in date_columns
})

print("Dataset shape after dropping columns:", df.shape)

In [None]:
# Extract date features
def extract_date_features(df, date_column):
    """Return a new frame with calendar features derived from ``date_column``.

    Four columns are appended after the existing ones, in this order, each
    named ``<date_column>_<Part>``:

    * ``Year``      - ``.dt.year``
    * ``Month``     - ``.dt.month``
    * ``Quarter``   - ``.dt.quarter``
    * ``DayOfWeek`` - ``.dt.dayofweek`` (pandas convention: Monday is 0)

    The input frame is not modified; the features are added to a copy, so
    re-running a cell that calls this function cannot silently alter a frame
    that other cells still reference.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_column``. The column must support the ``.dt``
        accessor (i.e. be datetime-like).
    date_column : str
        Name of the datetime column to derive the features from.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the four feature columns added. ``date_column``
        itself is kept.
    """
    dates = df[date_column]
    # assign() builds a new DataFrame instead of writing into the caller's
    # object; keyword order determines the order of the appended columns.
    return df.assign(**{
        f'{date_column}_Year': dates.dt.year,
        f'{date_column}_Month': dates.dt.month,
        f'{date_column}_Quarter': dates.dt.quarter,
        f'{date_column}_DayOfWeek': dates.dt.dayofweek,
    })

# Derive the calendar features for every date column ('Order Date', then 'Ship Date')
for date_col in date_columns:
    df = extract_date_features(df, date_col)

# The raw datetime columns are dropped once their features have been extracted
df = df.drop(columns=date_columns)

In [None]:
# Categorical columns that are replaced by integer codes
# NOTE(review): any other non-numeric column still present in df would reach
# the model un-encoded - confirm against the cleaned dataset's schema.
categorical_columns = [
    'Ship Mode', 'Customer Segment', 'Category', 'Sub-Category',
    'Region', 'Country', 'State', 'City',
]

# One LabelEncoder per column, stored in a dict keyed by column name
label_encoders = {name: LabelEncoder() for name in categorical_columns}

# Fit each encoder on its column and overwrite the column with the codes
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# 'Sales' is the target; every other remaining column is a feature
y = df['Sales']
X = df.drop(columns='Sales')

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Initialize and train XGBoost model
# eval_metric is configured on the estimator: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0, where it raises a TypeError.
model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric='rmse'
)

# Train the model
# eval_set reports RMSE on both the training and the test split after each
# boosting round (verbose=True prints it). No early stopping is configured,
# so the evaluation sets are used for monitoring only.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=True
)

In [None]:
# Predict sales for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions: R² first, then RMSE as the square root of the MSE
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(
    mean_squared_error(y_test, y_pred)
)

# Report both metrics
print(f'Root Mean Squared Error: ${rmse:.2f}')
print(f'R² Score: {r2:.4f}')

In [None]:
# Plot feature importance
# Pair every feature name with the importance score reported by the fitted model
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

fig = px.bar(
    feature_importance.head(15),
    x='importance',
    y='feature',
    title='Top 15 Most Important Features',
    orientation='h'
)
# Plotly lays out the categories of a horizontal bar chart from the bottom up,
# which would put the most important feature at the bottom. Ordering the
# y-axis by ascending total places the largest bar at the top instead.
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

In [None]:
# Put the true and the predicted sales side by side for plotting
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Scatter of predicted against actual sales
axis_labels = {'Actual': 'Actual Sales ($)', 'Predicted': 'Predicted Sales ($)'}
fig = px.scatter(
    comparison_df,
    x='Actual',
    y='Predicted',
    title='Actual vs Predicted Sales',
    labels=axis_labels,
)

# Reference diagonal (y = x) spanning the range of the actual values;
# points on this line are predicted exactly
actual_range = [comparison_df['Actual'].min(), comparison_df['Actual'].max()]
fig.add_scatter(
    x=actual_range,
    y=actual_range,
    mode='lines',
    name='Perfect Prediction',
    line={'dash': 'dash'},
)

fig.show()