# Goal of the notebook

The goal of the notebook is to build a dataframe that will get information related to the oil market, holidays and on the stores.

We will then apply an xg boost to this transformed data frame.

In [18]:
import pandas as pd

df_holidays = pd.read_csv('./data/holidays_events.csv')
df_oil = pd.read_csv('./data/oil.csv')
df_stores = pd.read_csv('./data/stores.csv')
df_transactions = pd.read_csv('./data/transactions.csv')
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")

In [19]:
df_test.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [20]:
df_transactions = df_transactions.groupby(['date'], as_index=False)['transactions'].sum().rename(columns = {"transactions" : "tot_transactions"}).merge(df_transactions)
df_transactions['store_weight'] = df_transactions['transactions'] / df_transactions['tot_transactions']
df_transactions.head()

Unnamed: 0,date,tot_transactions,store_nbr,transactions,store_weight
0,2013-01-01,770,25,770,1.0
1,2013-01-02,93215,1,2111,0.022647
2,2013-01-02,93215,2,2358,0.025296
3,2013-01-02,93215,3,3487,0.037408
4,2013-01-02,93215,4,1922,0.020619


In [21]:
local_holiday = df_holidays[(df_holidays['transferred'] == False) &
                             (df_holidays['type'] == "Holiday")].merge(df_stores[['store_nbr', 'city']].rename(columns={'city': 'locale_name'}).drop_duplicates())
local_holiday['is_impacted_by_local_holiday'] = True

df_transactions_local= local_holiday[['date', 'store_nbr','is_impacted_by_local_holiday']].merge(df_transactions, on = ['store_nbr', 'date'], how = "right")
df_transactions_local['is_impacted_by_local_holiday'] = df_transactions_local['is_impacted_by_local_holiday'].fillna(False)
local_holiday_bis = df_holidays[(df_holidays['transferred'] == False) &
                             (df_holidays['type'] == "Transfer")]\
                              .merge(df_stores[['store_nbr', 'city']].rename(columns={'city': 'locale_name'}).drop_duplicates())
local_holiday_bis['is_impacted_by_local_holiday'] = True

df_transactions_local_bis= local_holiday[['date', 'store_nbr','is_impacted_by_local_holiday']]\
   .merge(df_transactions, on = ['store_nbr', 'date'], how = "right")
df_transactions_local_bis['is_impacted_by_local_holiday'] = df_transactions_local_bis['is_impacted_by_local_holiday'].fillna(False)
df_transactions_local_bis = df_transactions_local_bis[df_transactions_local_bis['is_impacted_by_local_holiday']]

df_transactions_local = pd.concat([df_transactions_local, df_transactions_local_bis])


regional_holiday = df_holidays[(df_holidays['transferred'] == False) &
                             (df_holidays['type'] == "Holiday")].merge(df_stores[['store_nbr', 'state']].rename(columns={'state': 'locale_name'}).drop_duplicates())
regional_holiday['is_impacted_by_regional_holiday'] = True

df_transactions_regional= regional_holiday[['date', 'store_nbr','is_impacted_by_regional_holiday']].merge(df_transactions, on = ['store_nbr', 'date'], how = "right")
df_transactions_regional['is_impacted_by_regional_holiday'] = df_transactions_regional['is_impacted_by_regional_holiday'].fillna(False)

date_min, date_max = (df_oil[df_oil['dcoilwtico'] > 60].sort_values(by = "date")['date'].min(),
       df_oil[df_oil['dcoilwtico'] > 60].sort_values(by = "date")['date'].max())

df_transactions_local['oil_impact'] = 0

df_transactions_local.loc[(df_transactions_local['date'] > date_max), "oil_impact"] = 1

  df_transactions_local['is_impacted_by_local_holiday'] = df_transactions_local['is_impacted_by_local_holiday'].fillna(False)
  df_transactions_local_bis['is_impacted_by_local_holiday'] = df_transactions_local_bis['is_impacted_by_local_holiday'].fillna(False)
  df_transactions_regional['is_impacted_by_regional_holiday'] = df_transactions_regional['is_impacted_by_regional_holiday'].fillna(False)


In [22]:
df_transactions_ = df_transactions_local.merge(df_transactions_regional[['date', 'store_nbr','is_impacted_by_regional_holiday']].drop_duplicates()).drop_duplicates()

In [23]:
df_transactions_[df_transactions_['is_impacted_by_regional_holiday']].head()

Unnamed: 0,date,store_nbr,is_impacted_by_local_holiday,tot_transactions,transactions,store_weight,oil_impact,is_impacted_by_regional_holiday
4106,2013-04-01,12,False,81678,1313,0.016075,0,True
4107,2013-04-01,13,False,81678,1125,0.013774,0,True
8062,2013-06-25,15,False,70549,1469,0.020822,0,True
9989,2013-08-05,43,True,82080,1411,0.017191,0,True
14327,2013-11-06,5,False,76148,1400,0.018385,0,True


In [24]:
df_holidays__ = df_holidays[(df_holidays["locale"] == "National") & (df_holidays["transferred"] == False) & 
                            (df_holidays["type"].isin(['Holiday', 'Additional', 'Bridge', 'Work Day', 'Event']))].copy()

# Add a column with True to indicate the presence of each type
df_holidays__['is_present'] = True

# Pivot the DataFrame, with date as the index and each type as a column
df_pivot = df_holidays__.pivot_table(index='date', columns='type', values='is_present', fill_value=False)
df_pivot.columns = ['national_' + col for col in df_pivot.columns]
# Display the result
df_pivot.reset_index(inplace=True)
for cols in df_pivot.columns:
   df_pivot[cols] = df_pivot[cols].replace({1.0: True})

  df_pivot[cols] = df_pivot[cols].replace({1.0: True})


**Not transferred days**

In [25]:
df_holidays__ = df_holidays[(df_holidays["locale"] == "National") & (df_holidays["transferred"] == True)].copy()

# Add a column with True to indicate the presence of each type
df_holidays__['is_present'] = True
df_holidays__['type'] = "Holiday"

# Pivot the DataFrame, with date as the index and each type as a column
df_pivot_bis = df_holidays__.pivot_table(index='date', columns='type', values='is_present', fill_value=False)
df_pivot_bis.columns = ['national_' + col for col in df_pivot_bis.columns]
# Display the result
df_pivot_bis.reset_index(inplace=True)
for cols in df_pivot_bis.columns:
   df_pivot_bis[cols] = df_pivot_bis[cols].replace({1.0: True})

df_pivot_bis['national_Additional'] = False
df_pivot_bis['national_Bridge'] = False
df_pivot_bis['national_Event'] = False
df_pivot_bis['national_Work Day'] = False

df_pivot = pd.concat([df_pivot, df_pivot_bis])
df_pivot.head()

  df_pivot_bis[cols] = df_pivot_bis[cols].replace({1.0: True})


Unnamed: 0,date,national_Additional,national_Bridge,national_Event,national_Holiday,national_Work Day
0,2012-08-10,False,False,False,True,False
1,2012-11-02,False,False,False,True,False
2,2012-11-03,False,False,False,True,False
3,2012-12-21,True,False,False,False,False
4,2012-12-22,True,False,False,False,False


In [26]:
df_pivot.columns

Index(['date', 'national_Additional', 'national_Bridge', 'national_Event',
       'national_Holiday', 'national_Work Day'],
      dtype='object')

In [27]:
df_transactions_ = df_transactions_.merge(df_pivot, how = 'left')

In [28]:
for cols in ['national_Additional', 'national_Bridge', 'national_Event',
       'national_Holiday', 'national_Work Day']:
   
   df_transactions_[cols] = df_transactions_[cols].fillna(False)

  df_transactions_[cols] = df_transactions_[cols].fillna(False)


In [29]:
df_transactions_.drop(index=df_transactions_.index[0], axis=0, inplace=True)
df_transactions_[df_transactions_.store_weight == df_transactions_.store_weight.max()]

Unnamed: 0,date,store_nbr,is_impacted_by_local_holiday,tot_transactions,transactions,store_weight,oil_impact,is_impacted_by_regional_holiday,national_Additional,national_Bridge,national_Event,national_Holiday,national_Work Day
34046,2015-01-01,25,False,2202,2202,1.0,0,False,False,False,False,True,False
71391,2017-01-01,25,False,1642,1642,1.0,1,False,False,False,False,True,False


Remove 1st row

In [30]:
df_train[df_train['onpromotion'] == 425]

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
2964402,2964402,2017-07-26,35,GROCERY I,2123.0,425


In [31]:
df_train.groupby(['date', "store_nbr"], as_index=False).agg({"sales" : "sum",
                                             "onpromotion" : "sum"}).rename(columns={"sales" : "tot_sales_per_store",
                                                                                     "onpromotion": "tot_promotion_per_store"})\
         .merge(df_train)

Unnamed: 0,date,store_nbr,tot_sales_per_store,tot_promotion_per_store,id,family,sales,onpromotion
0,2013-01-01,1,0.000,0,0,AUTOMOTIVE,0.000,0
1,2013-01-01,1,0.000,0,1,BABY CARE,0.000,0
2,2013-01-01,1,0.000,0,2,BEAUTY,0.000,0
3,2013-01-01,1,0.000,0,3,BEVERAGES,0.000,0
4,2013-01-01,1,0.000,0,4,BOOKS,0.000,0
...,...,...,...,...,...,...,...,...
3000883,2017-08-15,54,12666.858,204,3000751,POULTRY,59.619,0
3000884,2017-08-15,54,12666.858,204,3000752,PREPARED FOODS,94.000,0
3000885,2017-08-15,54,12666.858,204,3000753,PRODUCE,915.371,76
3000886,2017-08-15,54,12666.858,204,3000754,SCHOOL AND OFFICE SUPPLIES,0.000,0


In [50]:
#df_train.pivot(index=['date', "store_nbr"], columns="family", values="sales").reset_index()

# Final dataframe

In [35]:
df_train_preprocess = df_train.drop_duplicates().merge(df_transactions_, how="right").drop_duplicates()

In [36]:
df_train_preprocess.columns

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion',
       'is_impacted_by_local_holiday', 'tot_transactions', 'transactions',
       'store_weight', 'oil_impact', 'is_impacted_by_regional_holiday',
       'national_Additional', 'national_Bridge', 'national_Event',
       'national_Holiday', 'national_Work Day'],
      dtype='object')

In [46]:
df_train_preprocess.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,is_impacted_by_local_holiday,oil_impact,is_impacted_by_regional_holiday,national_Additional,national_Bridge,national_Event,national_Holiday,national_Work Day
0,1782,2013-01-02,1,AUTOMOTIVE,2.0,0,False,0,False,False,False,False,False,False
1,1783,2013-01-02,1,BABY CARE,0.0,0,False,0,False,False,False,False,False,False
2,1784,2013-01-02,1,BEAUTY,2.0,0,False,0,False,False,False,False,False,False
3,1785,2013-01-02,1,BEVERAGES,1091.0,0,False,0,False,False,False,False,False,False
4,1786,2013-01-02,1,BOOKS,0.0,0,False,0,False,False,False,False,False,False


In [95]:
df_train_preprocess = df_train_preprocess.map(lambda x : 1 if x==True else x)
df_train_preprocess = df_train_preprocess.map(lambda x : 0 if x==False else x)

In [45]:
df_train_preprocess.drop(['tot_transactions', 'transactions',
       'store_weight'], axis = 1, inplace=True)

In [47]:
df_train_preprocess.family.unique()

array(['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', 'BOOKS',
       'BREAD/BAKERY', 'CELEBRATION', 'CLEANING', 'DAIRY', 'DELI', 'EGGS',
       'FROZEN FOODS', 'GROCERY I', 'GROCERY II', 'HARDWARE',
       'HOME AND KITCHEN I', 'HOME AND KITCHEN II', 'HOME APPLIANCES',
       'HOME CARE', 'LADIESWEAR', 'LAWN AND GARDEN', 'LINGERIE',
       'LIQUOR,WINE,BEER', 'MAGAZINES', 'MEATS', 'PERSONAL CARE',
       'PET SUPPLIES', 'PLAYERS AND ELECTRONICS', 'POULTRY',
       'PREPARED FOODS', 'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES',
       'SEAFOOD'], dtype=object)

# 1st approach with XGBOOST

## test dataframe

In [101]:
local_holiday = df_holidays[(df_holidays['transferred'] == False) &
                             (df_holidays['type'] == "Holiday")].merge(df_stores[['store_nbr', 'city']].rename(columns={'city': 'locale_name'}).drop_duplicates())
local_holiday['is_impacted_by_local_holiday'] = True
local_holiday

Unnamed: 0,date,type,locale,locale_name,description,transferred,store_nbr,is_impacted_by_local_holiday
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False,52,True
1,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False,53,True
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False,37,True
3,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False,39,True
4,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False,42,True
...,...,...,...,...,...,...,...,...
299,2017-11-11,Holiday,Local,Latacunga,Independencia de Latacunga,False,13,True
300,2017-11-12,Holiday,Local,Ambato,Independencia de Ambato,False,23,True
301,2017-11-12,Holiday,Local,Ambato,Independencia de Ambato,False,50,True
302,2017-12-08,Holiday,Local,Loja,Fundacion de Loja,False,38,True


In [102]:
local_holiday_bis = df_holidays[(df_holidays['transferred'] == False) &
                             (df_holidays['type'] == "Transfer") & (df_holidays['locale'] == "locale")]\
                              .merge(df_stores[['store_nbr', 'city']].rename(columns={'city': 'locale_name'}).drop_duplicates())
local_holiday_bis['is_impacted_by_local_holiday'] = True
local_holiday_bis.tail()

local_holiday = pd.concat([local_holiday, local_holiday_bis])

In [103]:
regional_holiday = df_holidays[(df_holidays['transferred'] == False) &
                             (df_holidays['type'] == "Holiday") & (df_holidays['locale'] == 'Regional')].merge(df_stores[['store_nbr', 'state']].rename(columns={'state': 'locale_name'}).drop_duplicates())
regional_holiday['is_impacted_by_regional_holiday'] = True
regional_holiday.tail()

Unnamed: 0,date,type,locale,locale_name,description,transferred,store_nbr,is_impacted_by_regional_holiday
37,2017-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False,15,True
38,2017-11-06,Holiday,Regional,Santo Domingo de los Tsachilas,Provincializacion de Santo Domingo,False,5,True
39,2017-11-06,Holiday,Regional,Santo Domingo de los Tsachilas,Provincializacion de Santo Domingo,False,16,True
40,2017-11-06,Holiday,Regional,Santo Domingo de los Tsachilas,Provincializacion de Santo Domingo,False,21,True
41,2017-11-07,Holiday,Regional,Santa Elena,Provincializacion Santa Elena,False,25,True


In [104]:
_holidays_ = local_holiday[['date', 'store_nbr', 'is_impacted_by_local_holiday']].merge(regional_holiday[['date', 'store_nbr', 'is_impacted_by_regional_holiday']], how="outer")
_holidays_.fillna(False, inplace=True)
_holidays_

  _holidays_.fillna(False, inplace=True)


Unnamed: 0,date,store_nbr,is_impacted_by_local_holiday,is_impacted_by_regional_holiday
0,2012-03-02,52,True,False
1,2012-03-02,53,True,False
2,2012-04-01,12,False,True
3,2012-04-01,13,False,True
4,2012-04-12,37,True,False
...,...,...,...,...
341,2017-11-11,13,True,False
342,2017-11-12,23,True,False
343,2017-11-12,50,True,False
344,2017-12-08,38,True,False


In [105]:
df_holidays__ = df_holidays[(df_holidays["locale"] == "National") & (df_holidays["transferred"] == False) & 
                            (df_holidays["type"].isin(['Holiday', 'Additional', 'Bridge', 'Work Day', 'Event']))].copy()

# Add a column with True to indicate the presence of each type
df_holidays__['is_present'] = True

# Pivot the DataFrame, with date as the index and each type as a column
df_pivot = df_holidays__.pivot_table(index='date', columns='type', values='is_present', fill_value=False)
df_pivot.columns = ['national_' + col for col in df_pivot.columns]
# Display the result
df_pivot.reset_index(inplace=True)
for cols in df_pivot.columns:
   df_pivot[cols] = df_pivot[cols].replace({1.0: True})
   
df_holidays__ = df_holidays[(df_holidays["locale"] == "National") & (df_holidays["transferred"] == True)].copy()

# Add a column with True to indicate the presence of each type
df_holidays__['is_present'] = True
df_holidays__['type'] = "Holiday"

# Pivot the DataFrame, with date as the index and each type as a column
df_pivot_bis = df_holidays__.pivot_table(index='date', columns='type', values='is_present', fill_value=False)
df_pivot_bis.columns = ['national_' + col for col in df_pivot_bis.columns]
# Display the result
df_pivot_bis.reset_index(inplace=True)
for cols in df_pivot_bis.columns:
   df_pivot_bis[cols] = df_pivot_bis[cols].replace({1.0: True})

df_pivot_bis['national_Additional'] = False
df_pivot_bis['national_Bridge'] = False
df_pivot_bis['national_Event'] = False
df_pivot_bis['national_Work Day'] = False

df_pivot = pd.concat([df_pivot, df_pivot_bis])
df_pivot.head()

  df_pivot[cols] = df_pivot[cols].replace({1.0: True})
  df_pivot_bis[cols] = df_pivot_bis[cols].replace({1.0: True})


Unnamed: 0,date,national_Additional,national_Bridge,national_Event,national_Holiday,national_Work Day
0,2012-08-10,False,False,False,True,False
1,2012-11-02,False,False,False,True,False
2,2012-11-03,False,False,False,True,False
3,2012-12-21,True,False,False,False,False
4,2012-12-22,True,False,False,False,False


In [106]:
df_pivot.date.max()

'2017-12-26'

In [107]:
_holidays_ = _holidays_.merge(df_pivot, how = "left")
_holidays_.fillna(False, inplace=True)
_holidays_

  _holidays_.fillna(False, inplace=True)


Unnamed: 0,date,store_nbr,is_impacted_by_local_holiday,is_impacted_by_regional_holiday,national_Additional,national_Bridge,national_Event,national_Holiday,national_Work Day
0,2012-03-02,52,True,False,False,False,False,False,False
1,2012-03-02,53,True,False,False,False,False,False,False
2,2012-04-01,12,False,True,False,False,False,False,False
3,2012-04-01,13,False,True,False,False,False,False,False
4,2012-04-12,37,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
341,2017-11-11,13,True,False,False,False,False,False,False
342,2017-11-12,23,True,False,False,False,False,False,False
343,2017-11-12,50,True,False,False,False,False,False,False
344,2017-12-08,38,True,False,False,False,False,False,False


In [108]:
_holidays_.columns

Index(['date', 'store_nbr', 'is_impacted_by_local_holiday',
       'is_impacted_by_regional_holiday', 'national_Additional',
       'national_Bridge', 'national_Event', 'national_Holiday',
       'national_Work Day'],
      dtype='object')

In [109]:
df_test_preprocessed = df_test.merge(_holidays_, how="left")
df_test_preprocessed[['is_impacted_by_local_holiday',
                            'is_impacted_by_regional_holiday', 'national_Additional',
                            'national_Bridge', 'national_Event', 'national_Holiday',
                            'national_Work Day']] = df_test_preprocessed[['is_impacted_by_local_holiday',
                                                    'is_impacted_by_regional_holiday', 'national_Additional',
                                                    'national_Bridge', 'national_Event', 'national_Holiday',
                                                    'national_Work Day']].fillna(False)

  'national_Work Day']].fillna(False)


In [110]:
df_test_preprocessed['oil_impact'] = 1

In [111]:
df_test_preprocessed = df_test_preprocessed.map(lambda x : 1 if x==True else x)
df_test_preprocessed = df_test_preprocessed.map(lambda x : 0 if x==False else x)

In [112]:
col_order = list(df_train_preprocess.columns)
col_order.remove("sales")
df_test_preprocessed = df_test_preprocessed[col_order]
df_test_preprocessed.to_csv('./data/preprocessed/df_test_preprocessed.csv', index = False)

In [113]:
df_test_preprocessed.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_impacted_by_local_holiday,oil_impact,is_impacted_by_regional_holiday,national_Additional,national_Bridge,national_Event,national_Holiday,national_Work Day
0,3000888,2017-08-16,1,AUTOMOTIVE,0,0,1,0,0,0,0,0,0
1,3000889,2017-08-16,1,BABY CARE,0,0,1,0,0,0,0,0,0
2,3000890,2017-08-16,1,BEAUTY,2,0,1,0,0,0,0,0,0
3,3000891,2017-08-16,1,BEVERAGES,20,0,1,0,0,0,0,0,0
4,3000892,2017-08-16,1,BOOKS,0,0,1,0,0,0,0,0,0


In [114]:
df_train_preprocess.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,is_impacted_by_local_holiday,oil_impact,is_impacted_by_regional_holiday,national_Additional,national_Bridge,national_Event,national_Holiday,national_Work Day
0,1782,2013-01-02,1,AUTOMOTIVE,2.0,0,0,0,0,0,0,0,0,0
1,1783,2013-01-02,1,BABY CARE,0.0,0,0,0,0,0,0,0,0,0
2,1784,2013-01-02,1,BEAUTY,2.0,0,0,0,0,0,0,0,0,0
3,1785,2013-01-02,1,BEVERAGES,1091.0,0,0,0,0,0,0,0,0,0
4,1786,2013-01-02,1,BOOKS,0.0,0,0,0,0,0,0,0,0,0


## training xgboost

In [115]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import cross_val_score
import numpy as np

In [116]:

# Separate the features and target
X = df_train_preprocess.drop(columns=['sales'])
y = df_train_preprocess['sales']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define RMSLE as the evaluation metric
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

# Initialize the XGBoost regressor
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Set up the hyperparameter grid for Grid Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring=rmsle_scorer, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print best parameters and best score from Grid Search
print("Best parameters from Grid Search:", grid_search.best_params_)
print("Best RMSLE from Grid Search:", -grid_search.best_score_)

# Set up the hyperparameter distribution for Random Search
param_dist = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0.1, 0.5, 1.0, 1.5]
}

# Random Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, cv=3, scoring=rmsle_scorer, n_iter=50, n_jobs=-1, random_state=42, verbose=1)
random_search.fit(X_train, y_train)

# Print best parameters and best score from Random Search
print("Best parameters from Random Search:", random_search.best_params_)
print("Best RMSLE from Random Search:", -random_search.best_score_)

# Evaluate the best model from Random Search on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
test_rmsle = rmsle(y_test, y_pred)
print("Test RMSLE:", test_rmsle)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


ValueError: 
All the 729 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
729 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\sklearn.py", line 1081, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\sklearn.py", line 596, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
                    ^^^^^^^^^^^^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\sklearn.py", line 1003, in _create_dmatrix
    return QuantileDMatrix(
           ^^^^^^^^^^^^^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 1573, in __init__
    self._init(
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 1632, in _init
    it.reraise()
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 569, in reraise
    raise exc  # pylint: disable=raising-bad-type
    ^^^^^^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 550, in _handle_exception
    return fn()
           ^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 637, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
                                          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\data.py", line 1388, in next
    input_data(**self.kwargs)
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 617, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
                                                   ^^^^^^^^^^^^^^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\data.py", line 1431, in _proxy_transform
    df, feature_names, feature_types = _transform_pandas_df(
                                       ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\data.py", line 603, in _transform_pandas_df
    pandas_check_dtypes(data, enable_categorical)
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\data.py", line 569, in pandas_check_dtypes
    _invalid_dataframe_dtype(data)
  File "c:\Users\oussa\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\data.py", line 356, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:date: object, family: object
