In [13]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [14]:
# Directory containing train.csv and test.csv. Set the BLACK_FRIDAY_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset the original local path is used, so existing behaviour is unchanged.
DATA_DIR = Path(os.environ.get(
    'BLACK_FRIDAY_DATA_DIR',
    '/Users/nickmac/Desktop/Dev Stuff/Kaggle/Black Friday',
))

# Both files are read with User_ID as the index, so User_ID is not a feature column.
X_full = pd.read_csv(DATA_DIR / 'train.csv', index_col='User_ID')

X_test = pd.read_csv(DATA_DIR / 'test.csv', index_col='User_ID')

In [15]:
# Remove rows with a missing target, then separate the target from the predictors.
# Non-mutating calls are used on purpose: X_full is left untouched, so this cell
# can be re-run without raising KeyError on an already-dropped 'Purchase' column.
train_labeled = X_full.dropna(axis=0, subset=['Purchase'])
y = train_labeled['Purchase']
X = train_labeled.drop(columns=['Purchase'])

# Break off validation set from training data (80% train / 20% validation,
# seeded so the split is the same on every run).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=42)

In [16]:
# Partition the training columns by dtype: int64/float64 columns are handled
# as numerical features, object columns as categorical features.
numerical_cols, categorical_cols = (
    X_train.select_dtypes(include=dtypes).columns
    for dtypes in (['int64', 'float64'], ['object'])
)

In [17]:
# Numerical features: fill missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode; handle_unknown='ignore' keeps transform from raising
# on categories that were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own transformer.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [18]:
# Base XGBoost regressor; its hyperparameters are filled in by the searches below.
xgb_model = XGBRegressor(random_state=42)

# Candidate values for the randomized search. Each key carries the
# 'regressor__' prefix so it is applied to the pipeline step named 'regressor'.
param_distributions = dict(
    # Number of trees
    regressor__n_estimators=[50, 100, 200, 500],
    # Step size shrinkage
    regressor__learning_rate=[0.01, 0.1, 0.2],
    # Maximum tree depth
    regressor__max_depth=[3, 5, 7],
    # Fraction of samples used for fitting each tree
    regressor__subsample=[0.8, 0.9, 1.0],
    # Fraction of features used for fitting each tree
    regressor__colsample_bytree=[0.8, 0.9, 1.0],
)

In [19]:
# Pipeline with Randomized Search
# Preprocessing and model are searched together as one pipeline, scored by
# (negated) mean absolute error over 5 cross-validation folds.
random_search = RandomizedSearchCV(
    estimator=Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', xgb_model)
    ]),
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter combinations to try
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
    n_jobs=-1,        # run the 500 fits in parallel, matching the grid-search cell
    random_state=42,  # fix which candidates are sampled so a re-run reproduces the result
)

random_search.fit(X_train, y_train)

# best_score_ is the negated MAE (higher is better for scikit-learn scorers),
# so flip the sign back before reporting it.
print("Best parameters found by random search:", random_search.best_params_)
print("Best MAE found by random search:", -random_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
# Grid Search (Refine around best parameters from random search)
best_random_params = random_search.best_params_
best_n_estimators = best_random_params['regressor__n_estimators']
best_learning_rate = best_random_params['regressor__learning_rate']
best_max_depth = best_random_params['regressor__max_depth']

# Each parameter is probed one step below and above the best value. Candidates
# that fall outside the valid range are dropped: with a best learning rate of
# 0.01 the lower neighbour would be -0.04, which XGBoost cannot train with.
# Learning rates are rounded to remove float noise such as 0.15000000000000002.
param_grid = {
    'regressor__n_estimators': [
        n for n in (best_n_estimators - 25, best_n_estimators, best_n_estimators + 25)
        if n >= 1
    ],
    'regressor__learning_rate': [
        round(lr, 4)
        for lr in (best_learning_rate - 0.05, best_learning_rate, best_learning_rate + 0.05)
        if lr > 0
    ],
    'regressor__max_depth': [
        depth for depth in (best_max_depth - 1, best_max_depth, best_max_depth + 1)
        if depth >= 1
    ],
}

# Start from the best pipeline of the random search; parameters not listed in
# param_grid keep the values that search selected.
grid_search = GridSearchCV(estimator=random_search.best_estimator_, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Keep the winning pipeline from the grid search and report how it scored.
best_model = grid_search.best_estimator_

grid_best_params = grid_search.best_params_
# best_score_ holds the negated MAE, so negate it again for display.
grid_best_mae = -grid_search.best_score_

print("Best parameters found by grid search:", grid_best_params)
print("Best MAE found by grid search:", grid_best_mae)

In [None]:
# Predict on the held-out validation split with the tuned pipeline.
preds = best_model.predict(X_valid)

# Mean absolute error between the true validation targets and the predictions.
mae_valid = mean_absolute_error(y_true=y_valid, y_pred=preds)
print(f"Mean Absolute Error (Validation Set): {mae_valid}")

----------

In [None]:
# # Convert processed data back to DataFrames
# X_train_processed = pd.DataFrame(X_train_processed, columns=preprocessor.get_feature_names_out())
# X_valid_processed = pd.DataFrame(X_valid_processed, columns=preprocessor.get_feature_names_out())

In [None]:
# # Fit and transform training data
# X_train_processed = preprocessor.fit_transform(X_train)

# # Transform validation data (using the same preprocessor)
# X_valid_processed = preprocessor.transform(X_valid)

In [None]:
# # XGBoost Model
# xgb_model = XGBRegressor(n_estimators=500, learning_rate=0.1, random_state=42)

# # Create the full pipeline
# full_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('regressor', xgb_model) 
# ])

In [None]:
# # Fit the pipeline (includes preprocessing and model training)
# full_pipeline.fit(X_train, y_train, regressor__early_stopping_rounds=5, regressor__eval_set=[(X_valid, y_valid)], regressor__verbose=False)


# # Get predictions on the validation set
# preds = full_pipeline.predict(X_valid)

In [5]:
# # Separate numerical and categorical columns
# X_train_num = X_train.select_dtypes(exclude='object')
# X_train_cat = X_train.select_dtypes(include='object')

# X_valid_num = X_valid.select_dtypes(exclude='object')
# X_valid_cat = X_valid.select_dtypes(include='object')

In [6]:
# # 1. Numerical Imputation (Mean)
# num_imputer = SimpleImputer(strategy='mean')  
# imputed_X_train_num = pd.DataFrame(num_imputer.fit_transform(X_train_num))
# imputed_X_valid_num = pd.DataFrame(num_imputer.transform(X_valid_num))

# # Reassign column names after imputation
# imputed_X_train_num.columns = X_train_num.columns
# imputed_X_valid_num.columns = X_valid_num.columns

In [7]:
# # 2. Categorical Imputation (Most Frequent)
# cat_imputer = SimpleImputer(strategy='most_frequent')
# imputed_X_train_cat = pd.DataFrame(cat_imputer.fit_transform(X_train_cat))
# imputed_X_valid_cat = pd.DataFrame(cat_imputer.transform(X_valid_cat))

# # Reassign column names after imputation
# imputed_X_train_cat.columns = X_train_cat.columns
# imputed_X_valid_cat.columns = X_valid_cat.columns

In [11]:
# # One-Hot Encoding
# OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(imputed_X_train_cat))
# OH_cols_valid = pd.DataFrame(OH_encoder.transform(imputed_X_valid_cat))

# # Restore index after encoding
# OH_cols_train.index = imputed_X_train_cat.index
# OH_cols_valid.index = imputed_X_valid_cat.index

# # Ensure all columns have string type
# OH_cols_train.columns = OH_cols_train.columns.astype(str)
# OH_cols_valid.columns = OH_cols_valid.columns.astype(str)

In [12]:
# # Combine numerical and OH encoded data
# final_X_train = pd.concat([imputed_X_train_num, OH_cols_train], axis=1)
# final_X_valid = pd.concat([imputed_X_valid_num, OH_cols_valid], axis=1)

To Do

- Get all numerical and categorical variables
- Impute numerical variables (mean strategy) and categorical variables (most-frequent strategy)
- You also need to process the test data