In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor, DMatrix

In [2]:
# Directory that holds the engineered CSV files.
# The default is the original author's local checkout; set the
# FORECASTING_RETAIL_DIR environment variable to run the notebook elsewhere
# without editing any cell.
DATA_DIR = Path(os.environ.get(
    'FORECASTING_RETAIL_DIR',
    '/Users/luanagiusto/PycharmProjects/forecasting_retail',
))

# Kept as plain strings so every downstream use (read_csv, f-strings) sees the
# same type as before.
train_file_path = str(DATA_DIR / 'train_eng.csv')
test_file_path = str(DATA_DIR / 'test_eng.csv')

In [3]:
# Load the engineered train and test CSVs into DataFrames.
#
# NOTE(review): the keys in both dtype dictionaries are still template
# placeholders ('column_6_name' / 'column_7_name'). Presumably they match no
# real column, so the dtype overrides have no effect until they are replaced
# with actual column names -- confirm against the CSV headers.
dtype_spec_test = {
    'column_6_name': 'str',  # Replace 'column_6_name' with the actual name of column 6 in merged_test.csv
}

dtype_spec_train = {
    'column_7_name': 'str',  # Replace 'column_7_name' with the actual name of column 7 in merged_train.csv
}

# Each file is read with its own dtype specification. Previously the train
# file was read with the *test* spec and dtype_spec_train was never used.
# low_memory=False makes pandas infer each column's dtype from the whole file.
train_eng_xgboost = pd.read_csv(train_file_path, dtype=dtype_spec_train, low_memory=False)
test_eng_xgboost = pd.read_csv(test_file_path, dtype=dtype_spec_test, low_memory=False)

In [4]:
# Preview the first rows of the training frame as a quick sanity check of the load.
train_eng_xgboost.head()

Unnamed: 0,store,dayofweek,date,sales,customers,open,promo,stateholiday,schoolholiday,competitiondistance,...,"promointerval_Jan,Apr,Jul,Oct","promointerval_Mar,Jun,Sept,Dec",promo2_start_date,promo2_start_month,promo2_start_year,promo2_start_day,promo2_start_day_of_week,quarter,week_of_year,is_weekend
0,1,5,2015-07-31,5263,555,1,1,0,1,1270.0,...,False,False,,,,,,3,31,False
1,2,5,2015-07-31,6064,625,1,1,0,1,570.0,...,True,False,2010-03-29,3.0,2010.0,29.0,0.0,3,31,False
2,3,5,2015-07-31,8314,821,1,1,0,1,14130.0,...,True,False,2011-04-04,4.0,2011.0,4.0,0.0,3,31,False
3,4,5,2015-07-31,13995,1498,1,1,0,1,620.0,...,False,False,,,,,,3,31,False
4,5,5,2015-07-31,4822,559,1,1,0,1,29910.0,...,False,False,,,,,,3,31,False


In [None]:
# Preview the first rows of the test frame as a quick sanity check of the load.
test_eng_xgboost.head()

In [None]:

# Directory that holds the engineered CSV files.
# The default is the original author's local checkout; set the
# FORECASTING_RETAIL_DIR environment variable to run the notebook elsewhere
# without editing any cell.
DATA_DIR = Path(os.environ.get(
    'FORECASTING_RETAIL_DIR',
    '/Users/luanagiusto/PycharmProjects/forecasting_retail',
))
train_file_path = str(DATA_DIR / 'train_eng.csv')
test_file_path = str(DATA_DIR / 'test_eng.csv')

# Load the data
# NOTE(review): the keys in both dtype dictionaries are still template
# placeholders; presumably they match no real column, so the overrides have
# no effect until real column names are filled in -- confirm.
dtype_spec_test = {
    'column_6_name': 'str',  # Replace 'column_6_name' with the actual name of column 6 in merged_test.csv
}

dtype_spec_train = {
    'column_7_name': 'str',  # Replace 'column_7_name' with the actual name of column 7 in merged_train.csv
}

# Each file is read with its own dtype specification (the train file was
# previously read with the test spec, leaving dtype_spec_train unused).
train_eng_xgboost = pd.read_csv(train_file_path, dtype=dtype_spec_train, low_memory=False)
test_eng_xgboost = pd.read_csv(test_file_path, dtype=dtype_spec_test, low_memory=False)

# Define the custom RMSPE Objective function for XGBoost
def rmspe_objective(preds, dtrain):
    """Gradient and hessian of the squared-percentage-error loss.

    The per-row loss is ``0.5 * ((pred - label) / label) ** 2``, whose
    derivatives with respect to ``pred`` are::

        grad = (pred - label) / label**2
        hess = 1 / label**2

    Rows whose label is exactly zero have an undefined percentage error; they
    get ``grad = hess = 0`` so they neither produce inf/NaN nor influence the
    fit.

    Parameters
    ----------
    preds : array-like
        Current predictions, one per row. Not modified.
    dtrain : object with ``get_label()``
        Training data holder (e.g. ``xgboost.DMatrix``) providing the labels.

    Returns
    -------
    tuple of numpy.ndarray
        ``(grad, hess)``, each with the same length as ``preds``.
    """
    labels = np.asarray(dtrain.get_label(), dtype=float)
    # Work on a float view/copy; the caller's prediction buffer is never
    # rescaled in place.
    preds = np.asarray(preds, dtype=float)

    # 1 / label**2 wherever it is defined, 0 elsewhere.
    nonzero = labels != 0
    inv_label_sq = np.zeros_like(labels)
    inv_label_sq[nonzero] = 1.0 / np.square(labels[nonzero])

    grad = (preds - labels) * inv_label_sq
    hess = inv_label_sq

    return grad, hess

# Define RMSPE evaluation metric
def rmspe_eval(y_true, y_pred):
    """Root mean squared percentage error over rows with a non-zero target.

    Rows where ``y_true`` is zero are left out of the average: their
    percentage error is undefined, and dividing by a tiny epsilon instead
    would let a single such row dominate the score.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``("rmspe", value)``; ``value`` is ``0.0`` when no row has a non-zero
        target.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    scored = y_true != 0
    if not scored.any():
        return "rmspe", 0.0

    pct_err = (y_true[scored] - y_pred[scored]) / y_true[scored]
    return "rmspe", float(np.sqrt(np.mean(np.square(pct_err))))

# Ensure 'sales' and 'customers' columns are in the train set
dep_var = 'sales'
if dep_var not in train_eng_xgboost.columns or 'customers' not in train_eng_xgboost.columns:
    raise KeyError(f"Columns {dep_var} and 'customers' must be present in the training set")

# Columns that are not usable as model inputs:
#   * 'Unnamed: 0' -- looks like the row index written out by an earlier
#     to_csv call (it equals the row number in the train preview).
#   * 'date', 'promo2_start_date' -- raw date strings. Left in, they are picked
#     up as object columns and one-hot encoded into one dummy per distinct
#     date, which inflates the matrix and cannot match dates that only occur in
#     the test frame. NOTE(review): the frame already carries derived calendar
#     columns (quarter, week_of_year, is_weekend, promo2_start_*); confirm they
#     cover what the model needs.
# errors='ignore' keeps this safe if a frame lacks any of them.
non_feature_cols = ['Unnamed: 0', 'date', 'promo2_start_date']

# Separate features (X) and target variable (y) for the train set.
# 'customers' is excluded together with the target -- presumably because it is
# not known for the period being predicted; confirm.
X_train = (
    train_eng_xgboost
    .drop(columns=[dep_var, 'customers'])
    .drop(columns=non_feature_cols, errors='ignore')
)
y_train = train_eng_xgboost[dep_var]

# Ensure 'sales' and 'customers' are removed from the test set
X_test = test_eng_xgboost.drop(columns=['sales', 'customers'] + non_feature_cols, errors='ignore')

# Encode categorical variables (if any). The list is taken from the actual
# feature frame so dropped columns can never be requested.
cat_names = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
X_train = pd.get_dummies(X_train, columns=cat_names, drop_first=True)

# The test frame is encoded WITHOUT drop_first: which level is "first" depends
# on the levels present in that frame, so dropping it independently could
# remove a different baseline than in train. The reindex below keeps exactly
# the dummy columns the training frame has. Only columns that exist in the test
# frame are requested, to avoid a KeyError.
test_cat_names = [col for col in cat_names if col in X_test.columns]
X_test = pd.get_dummies(X_test, columns=test_cat_names)

# Align test set columns with train set columns to avoid mismatch: extra test
# columns are discarded and missing ones are filled with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Optionally, scale the continuous features. The scaler is fitted on the
# training features only and then applied unchanged to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Split data into training and validation sets (80/20, fixed seed for
# reproducibility).
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

# Convert the datasets to DMatrix format. The custom objective has the native
# (preds, dtrain) signature, so training goes through xgb.train rather than the
# scikit-learn wrapper (which calls objectives as (y_true, y_pred)).
dtrain = DMatrix(X_train_split, label=y_train_split)
dval = DMatrix(X_val_split, label=y_val_split)
dtest = DMatrix(X_test_scaled)


def _rmspe_metric(preds, dmatrix):
    """Adapt rmspe_eval(y_true, y_pred) to xgb.train's (preds, DMatrix) metric signature."""
    return rmspe_eval(dmatrix.get_label(), preds)


# Parameters for XGBoost (native API names). 'rmspe' is not a built-in
# eval_metric, so it is supplied as a custom metric below and the default
# metric is switched off.
params = {
    'learning_rate': 0.1,
    'max_depth': 6,
    'seed': 42,
    'disable_default_eval_metric': 1,
}
num_boost_round = 100  # native-API counterpart of n_estimators

# NOTE(review): with this objective the hessian is 1 / label**2, which is tiny
# for sales in the thousands. Default regularisation (min_child_weight, lambda)
# may therefore suppress splits -- check the validation curve in evals_result,
# and consider training on log1p(sales) if the model does not learn.

# Train the model with custom RMSPE objective function.
# `custom_metric` needs xgboost >= 1.6 (older releases call it `feval`).
evals_result = {}
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dval, 'validation_0')],
    obj=rmspe_objective,
    custom_metric=_rmspe_metric,
    evals_result=evals_result,
    verbose_eval=True,
)

# Extract 'Id' column from the test set
test_ids = test_eng_xgboost['id']

# Predict on the test set (a Booster predicts from a DMatrix)
y_test_pred = xgb_model.predict(dtest)

# Ensure the predictions are non-negative (since sales cannot be negative)
y_test_pred = np.maximum(0, y_test_pred)

# Create a submission dataframe with 'Id' from the test set and 'Sales' from the predictions
sample_submission_xgboost = pd.DataFrame({
    'Id': test_ids,
    'Sales': y_test_pred
})

# Output the sample submission file next to the test CSV, so the location
# follows test_file_path instead of being hard-coded a third time.
submission_path = str(Path(test_file_path).with_name('sample_submission_xgboost.csv'))
sample_submission_xgboost.to_csv(submission_path, index=False)

print(f'Sample submission saved to {submission_path}')