In [388]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
from xgboost import XGBRegressor

import scipy as scp
import scipy.stats as stats

import optuna
from optuna.integration import OptunaSearchCV

from datetime import datetime, timedelta
import calendar

## Functions

In [389]:
def root_mean_squared_error(y_true, y_pred):
    """
    :param y_true: a list of true target values
    :param y_pred: a list of predicted target values
    :return: rmse
    """
    mse = ((y_true - y_pred) ** 2).mean()
    rmse = np.sqrt(mse)
    return rmse

def random_forest(x_train, x_test, y_train, params: dict = None):

    # Model #
    params ['criterion'] = 'squared_error'
    params ['random_state'] = 0

    RF_model = RandomForestRegressor(**params) 

    # Fit #
    RF_model.fit(x_train,y_train)

    # Predict #
    y_pred = RF_model.predict(x_test)

    return y_pred

def xgboost(x_train, x_test, y_train, params: dict = None):
        
    # Model #
    params['objective'] = 'reg:squarederror'

    XGB_model = XGBRegressor(**params)
    
    # Fit #
    XGB_model.fit(x_train, y_train)

    # Predict #
    y_pred = XGB_model.predict(x_test)

    return y_pred

def optuna_search(X_train: pd.DataFrame, y_train: pd.Series, estimator: str = 'RF'):

    if estimator not in ['RF', 'XGB']:
        raise ValueError("Estimator must be 'RF' or 'XGB'.")
    
    k_folds = 10
    n_trials = 5
    
    if estimator == 'RF':
        estimator = RandomForestRegressor(criterion='squared_error', random_state=0)
        param_distributions = {
            'n_estimators': optuna.distributions.IntDistribution(10, 250),
            'max_features': optuna.distributions.IntDistribution(1, len(INPUTS)),
            'max_depth': optuna.distributions.IntDistribution(1, 20),
            'min_samples_split': optuna.distributions.IntDistribution(2, 10),
            'min_samples_leaf':optuna.distributions.IntDistribution(1, 10),
            'min_impurity_decrease': optuna.distributions.FloatDistribution(0.001, 0.1)
            }
        rf_opt = OptunaSearchCV(estimator, param_distributions=param_distributions, n_trials = n_trials, cv=k_folds, verbose=False)
        rf_opt.fit(X_train, y_train)
        return rf_opt.best_params_, rf_opt.best_score_
    
    else:

        estimator = XGBRegressor(objective = 'reg:squarederror', random_state=0)
        param_distributions = {
            'n_estimators': optuna.distributions.IntDistribution(10, 250),
            'learning_rate': optuna.distributions.FloatDistribution(0.001, 1),
            'max_depth': optuna.distributions.IntDistribution(1, 5),
            'min_split_loss': optuna.distributions.FloatDistribution(0.001, 1)
        }
        xgb_opt = OptunaSearchCV(estimator, param_distributions=param_distributions, n_trials = n_trials, cv=k_folds, verbose=False)
        xgb_opt.fit(X_train, y_train)
        return xgb_opt.best_params_, xgb_opt.best_score_

## Preprocess

#### EDA

In [390]:
# Read data
df = pd.read_csv('C:/Users\/juanm/Escritorio/Juanmi/DataFest IKEA/Team 09. ImportBilly/1. Data/visits_train.csv')
df.head()

Unnamed: 0,location,date_visit,visits
0,Madrid,28-02-2023,67.32372
1,Madrid,27-02-2023,71.89443
2,Madrid,26-02-2023,87.80381
3,Madrid,25-02-2023,90.435243
4,Madrid,24-02-2023,76.281122


In [391]:
df['date_visit'] = pd.to_datetime(df['date_visit'], format='%d-%m-%Y')

In [392]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   location    1338 non-null   object        
 1   date_visit  1338 non-null   datetime64[ns]
 2   visits      1338 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 31.5+ KB


In [393]:
df.describe()

Unnamed: 0,visits
count,1338.0
mean,58.837301
std,16.243328
min,0.0
25%,50.429277
50%,59.213881
75%,67.605966
max,101.0


In [394]:
miss_data = pd.isnull(df).values.any()
print("Hay información perdida: ", miss_data, "\n")

sum_miss_data = pd.isnull(df).sum()
print("Hay {} valores perdidos".format(sum_miss_data))

Hay información perdida:  False 

Hay location      0
date_visit    0
visits        0
dtype: int64 valores perdidos


In [395]:
# Duplicates
df.duplicated().value_counts()

False    1338
dtype: int64

In [396]:
# Porcentaje de ceros
variables = ['location', 'date_visit', 'visits']
for i in variables:
    df_count=df.loc[df[i] == 0]
    x=(df_count[i].count()/df[i].count())*100
    x=round(x,2)
    print(f'The Number of zero values in column {i} is {x}')

The Number of zero values in column location is 0.0
The Number of zero values in column date_visit is 0.0
The Number of zero values in column visits is 2.62


In [397]:
# Vemos los días que no hay visitas
df[df.visits < 1]

Unnamed: 0,location,date_visit,visits
53,Madrid,2023-01-06,0.0
734,Barcelona,2022-12-25,0.0
783,Barcelona,2022-11-06,0.0
804,Barcelona,2022-10-16,0.0
818,Barcelona,2022-10-02,0.0
888,Barcelona,2022-07-24,0.0
895,Barcelona,2022-07-17,0.0
902,Barcelona,2022-07-10,0.0
909,Barcelona,2022-07-03,0.0
918,Barcelona,2022-06-24,0.0


# Forecast

In [398]:
df_mad = df[df['location']=='Madrid'].drop(['location'], axis=1).reset_index(drop=True)

In [399]:
df_bar = df[df['location']=='Barcelona'].drop(['location'], axis=1).reset_index(drop=True)

In [400]:
# Index
df_mad = df_mad.set_index('date_visit').sort_index(ascending=True)
df_bar = df_bar.set_index('date_visit').sort_index(ascending=True)

In [401]:
df_mad

Unnamed: 0_level_0,visits
date_visit,Unnamed: 1_level_1
2021-05-01,29.393306
2021-05-02,78.272126
2021-05-03,73.994576
2021-05-04,48.882127
2021-05-05,45.836089
...,...
2023-02-24,76.281122
2023-02-25,90.435243
2023-02-26,87.803810
2023-02-27,71.894430


In [402]:
# Crea un rango de fechas para marzo de 2023
start_date = datetime(2023, 3, 1)
end_date = datetime(2023, 4, 1) - timedelta(days=1)
date_range = pd.date_range(start_date, end_date)

# Convierte el rango de fechas en una lista de cadenas con formato 'yyyy-mm-dd'
df_marzo = pd.DataFrame([date.strftime('%Y-%m-%d') for date in date_range])
df_marzo = df_marzo.set_index(0)

In [403]:
df_mad = pd.concat([df_mad, df_marzo], axis=1)
df_bar = pd.concat([df_bar, df_marzo], axis=1)

In [404]:
df_mad = df_mad.reset_index(drop=False, names=['date_visit'])
df_bar = df_bar.reset_index(drop=False, names=['date_visit'])

  df_mad = df_mad.reset_index(drop=False, names=['date_visit'])
  df_bar = df_bar.reset_index(drop=False, names=['date_visit'])


### Opt. Parámetros

In [405]:
# Lags
df_mad_aux = df_mad.copy()
df_bar_aux = df_bar.copy()
for i in range(1, 31):
    df_mad_aux[f"lag_{i}"] = df_mad["visits"].shift(i)
    df_bar_aux[f"lag_{i}"] = df_bar["visits"].shift(i)
df_mad_aux = df_mad_aux.dropna().set_index('date_visit')
df_bar_aux = df_bar_aux.dropna().set_index('date_visit')

In [406]:
INPUTS = df_bar_aux.columns.drop('visits')
OUTPUT = 'visits'

* Barcelona

In [407]:
x_train = df_bar_aux[INPUTS]
y_train = df_bar_aux[OUTPUT]

In [408]:
### Optuna General ###
rf_params_bar, rf_scorer = optuna_search(x_train, y_train, 'RF')
xgb_params_bar, xgb_scorer = optuna_search(x_train, y_train, 'XGB')

  rf_opt = OptunaSearchCV(estimator, param_distributions=param_distributions, n_trials = n_trials, cv=k_folds, verbose=False)
[32m[I 2023-04-16 11:20:21,234][0m A new study created in memory with name: no-name-1612ffd5-cc31-40a1-9f47-65bb69fd7859[0m
[32m[I 2023-04-16 11:20:26,724][0m Trial 0 finished with value: 0.49788755351854486 and parameters: {'n_estimators': 249, 'max_features': 15, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 8, 'min_impurity_decrease': 0.08474336364982674}. Best is trial 0 with value: 0.49788755351854486.[0m
[32m[I 2023-04-16 11:20:29,564][0m Trial 1 finished with value: 0.5035013048953262 and parameters: {'n_estimators': 57, 'max_features': 25, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 5, 'min_impurity_decrease': 0.003308679060639942}. Best is trial 1 with value: 0.5035013048953262.[0m
[32m[I 2023-04-16 11:20:34,942][0m Trial 2 finished with value: 0.5047601191622961 and parameters: {'n_estimators': 129, 'max_featur

* Madrid

In [409]:
x_train = df_mad_aux[INPUTS]
y_train = df_mad_aux[OUTPUT]

In [410]:
### Optuna General ###
rf_params_mad, rf_scorer = optuna_search(x_train, y_train, 'RF')
xgb_params_mad, xgb_scorer = optuna_search(x_train, y_train, 'XGB')

  rf_opt = OptunaSearchCV(estimator, param_distributions=param_distributions, n_trials = n_trials, cv=k_folds, verbose=False)
[32m[I 2023-04-16 11:20:41,077][0m A new study created in memory with name: no-name-6077912e-3e1d-4be3-9661-e711175176fe[0m
[32m[I 2023-04-16 11:20:42,030][0m Trial 0 finished with value: 0.42993529028034666 and parameters: {'n_estimators': 79, 'max_features': 5, 'max_depth': 3, 'min_samples_split': 10, 'min_samples_leaf': 3, 'min_impurity_decrease': 0.08261487147859495}. Best is trial 0 with value: 0.42993529028034666.[0m
[32m[I 2023-04-16 11:20:48,607][0m Trial 1 finished with value: 0.5222042884683206 and parameters: {'n_estimators': 233, 'max_features': 11, 'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.034855654826056064}. Best is trial 1 with value: 0.5222042884683206.[0m
[32m[I 2023-04-16 11:20:52,965][0m Trial 2 finished with value: 0.5597490699841059 and parameters: {'n_estimators': 90, 'max_features

### Barcelona

In [411]:
df_bar = df_bar.set_index('date_visit')

In [412]:
df_rf = df_bar.copy()
df_rf.index = pd.to_datetime(df_rf.index)
df_xgb = df_bar.copy()
df_xgb.index = pd.to_datetime(df_xgb.index)
for i in range(len(df_marzo)):
    date = df_marzo.index[i+1]
    df_aux_rf = df_rf.loc[:date]
    df_aux_xgb = df_xgb.loc[:date]
    # Lags
    for i in range(1, 31):
        df_aux_rf[f"lag_{i}"] = df_aux_rf["visits"].shift(i)
        df_aux_xgb[f"lag_{i}"] = df_aux_xgb["visits"].shift(i)
    df_aux_rf = df_aux_rf.dropna()
    df_aux_xgb = df_aux_xgb.dropna()
    x_train_rf = df_aux_rf.iloc[:-1][INPUTS]
    y_train_rf = df_aux_rf.iloc[:-1][OUTPUT]
    x_test_rf = df_aux_rf.loc[date][INPUTS]
    x_train_xgb = df_aux_xgb.iloc[:-1][INPUTS]
    y_train_xgb = df_aux_xgb.iloc[:-1][OUTPUT]
    x_test_xgb = df_aux_xgb.loc[date][INPUTS]
    print(x_train_rf.head())
    print(y_train_rf.head())
    print(x_test_rf.head())
    y_pred_rf = random_forest(x_train_rf, pd.Series(x_test_rf), y_train_rf, rf_params_bar)
    y_pred_xgb = xgboost(x_train_xgb, pd.Series(x_test_xgb), y_train_xgb, rf_params_bar)
    df_rf[df_rf['date_visit'] == date]['visits'] = y_pred_rf
    df_xgb[df_xgb['date_visit'] == date]['visits'] = y_pred_xgb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_aux_rf[f"lag_{i}"] = df_aux_rf["visits"].shift(i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_aux_xgb[f"lag_{i}"] = df_aux_xgb["visits"].shift(i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_aux_rf[f"lag_{i}"] = df_aux_rf["visits"].shift(i)
A value is trying to be set on a copy of a s

KeyError: '2023-03-02'

In [None]:
y_pred_rf = random_forest(x_train, pd.Series(x_test), y_train, rf_params)
y_pred_xgb = xgboost(x_train, pd.Series(x_test), y_train, xgb_params)

In [None]:
y_pred_xgb

### Madrid

In [None]:
y_pred_rf = random_forest(x_train, x_test, y_train, rf_params)
y_pred_xgb = xgboost(x_train, x_test, y_train, xgb_params)

In [None]:
y_pred_xgb