In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [None]:
# Read the raw Superstore export
df = pd.read_csv('data/Superstore.csv')

# Parse both date columns into datetime values
for date_col in ('Order Date', 'Ship Date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Derive calendar features from the order date.
# The dict's insertion order fixes the order in which the columns are appended.
date_parts = {'Year': 'year', 'Month': 'month', 'Day': 'day', 'DayOfWeek': 'dayofweek'}
for feature_name, accessor_attr in date_parts.items():
    df[feature_name] = getattr(df['Order Date'].dt, accessor_attr)

# Discard identifier, free-text and raw date columns that are not used as features
columns_to_drop = [
    'Row ID', 'Order ID', 'Ship Date', 'Customer Name',
    'Product ID', 'Product Name', 'Country', 'Order Date',
    'Postal Code', 'Customer ID',
]
df = df.drop(columns_to_drop, axis=1)

# Label-encode the categorical columns: each distinct value becomes an integer code
categorical_columns = ['Ship Mode', 'Segment', 'Category', 'Sub-Category', 'Region', 'State']
for col in categorical_columns:
    df[col] = df[col].astype('category').cat.codes

print("Data shape after initial preprocessing:", df.shape)

In [None]:
def remove_outliers(df, columns, n_std=3):
    """
    Remove outlier rows from the specified columns using the z-score method.

    Columns are processed in the order given, and each column's z-scores are
    computed on the rows that survived the previous columns. A row is dropped
    when the absolute z-score of its value is at least ``n_std``.

    Rows whose z-score is undefined are kept rather than dropped: a missing
    value is not an outlier, and a constant column (zero standard deviation)
    has no outliers at all. Without this, a single NaN or a constant column
    would silently empty the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied, never modified.
    columns : iterable of str
        Names of the numeric columns to screen.
    n_std : float, default 3
        Z-score threshold at or beyond which a row counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the outlier rows (original index labels kept).
    """
    df_clean = df.copy()

    for column in columns:
        rows_before = len(df_clean)

        if rows_before:
            # Work on a plain float array so SciPy never sees a pandas object.
            values = df_clean[column].to_numpy(dtype=float, na_value=np.nan)
            with np.errstate(divide='ignore', invalid='ignore'):
                # nan_policy='omit' keeps one NaN from poisoning every z-score.
                z_scores = np.asarray(
                    stats.zscore(values, nan_policy='omit'), dtype=float
                )
                # NaN compares False, so undefined z-scores are never outliers.
                is_outlier = np.abs(z_scores) >= n_std
            df_clean = df_clean[~is_outlier]

        # Report the rows removed by THIS column, not the running total.
        print(f"Removed {rows_before - len(df_clean)} outliers from {column}")

    return df_clean

# Numeric columns that are screened for outliers
numeric_columns = ['Sales', 'Quantity', 'Discount', 'Profit']

# Filter the preprocessed frame with a threshold of 3 standard deviations
df_cleaned = remove_outliers(df, numeric_columns, n_std=3)

# Report the shape before and after filtering
for shape_label, frame in (
    ("\nOriginal dataset shape:", df),
    ("Cleaned dataset shape:", df_cleaned),
):
    print(shape_label, frame.shape)

In [None]:
# Sales is the target; every remaining column is a feature
y = df_cleaned['Sales']
X = df_cleaned.drop(columns='Sales')

# Hold out 20% of the rows for testing, then split 20% of the remainder off
# as a validation set
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to all three splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Report the size of each split
for split_label, split_array in (
    ("Training set shape:", X_train_scaled),
    ("Validation set shape:", X_valid_scaled),
    ("Test set shape:", X_test_scaled),
):
    print(split_label, split_array.shape)

In [None]:
# Create and configure the XGBoost model.
# NOTE: `eval_metric` and `early_stopping_rounds` are estimator (constructor)
# parameters. Passing them to `fit()` was deprecated in XGBoost 1.6 and removed
# in 2.0, where it raises a TypeError, so they are set here instead.
model = XGBRegressor(
    n_estimators=1000,          # Upper bound on trees; early stopping usually ends sooner
    learning_rate=0.01,         # Small step size, paired with the large tree budget
    max_depth=4,                # Shallow trees to limit overfitting
    min_child_weight=5,         # Minimum hessian sum per leaf, to limit overfitting
    subsample=0.8,              # Use only 80% of rows per tree
    colsample_bytree=0.8,       # Use only 80% of features per tree
    reg_alpha=0.1,              # L1 regularization
    reg_lambda=1.0,             # L2 regularization
    random_state=42,
    eval_metric='rmse',         # Metric reported for every eval_set entry
    early_stopping_rounds=50    # Stop if no improvement for 50 rounds
)

# Train the model with early stopping.
# When eval_set has several entries, XGBoost uses the LAST one (the validation
# split here) to decide when to stop; the training split is only monitored.
model.fit(
    X_train_scaled,
    y_train,
    eval_set=[(X_train_scaled, y_train), (X_valid_scaled, y_valid)],
    verbose=100                 # Log every 100th round instead of up to 1000 lines
)

In [None]:
# Predict sales for the held-out test split
y_pred = model.predict(X_test_scaled)

# Residuals are shared by both metrics below
residuals = y_test - y_pred
squared_residuals = residuals ** 2

# Root mean squared error on the test split
rmse = np.sqrt(np.mean(squared_residuals))
print(f"\nTest Set RMSE: {rmse:.2f}")

# R-squared: 1 - (residual sum of squares / total sum of squares)
ss_residual = np.sum(squared_residuals)
ss_total = np.sum((y_test - y_test.mean()) ** 2)
r2 = 1 - ss_residual / ss_total
print(f"R-squared Score: {r2:.4f}")

# Pair each feature name with the importance reported by the fitted model
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
top_features = feature_importance.sort_values('importance', ascending=False).head(10)
print("\nTop 10 Most Important Features:")
print(top_features)