In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from colorama import Style, Fore
# Bright ANSI colour prefixes (black, red, blue) plus the reset code, built from colorama.
# NOTE(review): none of these four names is referenced in the visible cells — confirm before removing.
blk = Style.BRIGHT + Fore.BLACK
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
res = Style.RESET_ALL

import os
# Read the competition's train and test splits (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e19/train.csv',
        '/kaggle/input/playground-series-s3e19/test.csv',
    )
)
# test has no target column; add an all-NaN placeholder so it can be concatenated with train.
test['num_sold'] = np.nan

train.columns

In [None]:
# Collect the distinct values of every training column, keyed by column name.
analysis = train
uniques = {column: analysis[column].unique().tolist() for column in analysis.columns}

# Print the value lists of every column except date, num_sold and id.
for column, values in uniques.items():
    if column in ('date', 'num_sold', 'id'):
        continue
    print(values)

In [None]:
# Parse the date strings of both splits into datetime values (each frame is updated in place).
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

# Stack train on top of test; each part keeps its own row labels.
df = pd.concat([train, test], axis=0)


In [None]:
# Replace each categorical column with integer codes. One encoder object is refit
# for every column, so afterwards `le` only holds the classes of the last column.
le = LabelEncoder()
cols = ['country', 'store', 'product']
df = df.assign(**{name: le.fit_transform(df[name]) for name in cols})

In [None]:
import datetime as dt

# Expand the timestamp into calendar parts.
# Note: the 'week' column holds the day of the week (pandas `dayofweek`), not a week number.
df = df.assign(
    day=lambda frame: frame['date'].dt.day,
    week=lambda frame: frame['date'].dt.dayofweek,
    month=lambda frame: frame['date'].dt.month,
    year=lambda frame: frame['date'].dt.year,
)

In [None]:
# Preview the first rows of the combined train+test frame.
df.head()

In [None]:
def seasonality_features(df_temp, month_period=12, day_period=31):
    """Return a copy of ``df_temp`` with cyclical (sin/cos) encodings of month and day.

    Parameters
    ----------
    df_temp : pandas.DataFrame
        Must contain numeric ``month`` and ``day`` columns.
    month_period : int, default 12
        Length of the month cycle.
    day_period : int, default 31
        Length of the day-of-month cycle. The previous hard-coded value of 24
        made day 1 and day 25 (2 and 26, ...) map to the same point on the
        circle, so late-month days were indistinguishable from early ones.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``month_sin``, ``month_cos``, ``day_sin`` and
        ``day_cos`` appended; the input frame is not modified.
    """
    df_copy = df_temp.copy()  # work on a copy so the caller's frame stays untouched

    month_angle = 2 * np.pi * df_copy['month'] / month_period
    day_angle = 2 * np.pi * df_copy['day'] / day_period

    df_copy['month_sin'] = np.sin(month_angle)
    df_copy['month_cos'] = np.cos(month_angle)
    df_copy['day_sin'] = np.sin(day_angle)
    df_copy['day_cos'] = np.cos(day_angle)
    return df_copy

# Add the cyclical features, discard the raw timestamp, then exclude every row from 2020.
df_modified = seasonality_features(df).drop(columns='date')
df_modified = df_modified.loc[df_modified['year'] != 2020]


In [None]:
def set_frame_style(df, caption=""):
    """Return ``df.style`` with a 'Blues' background gradient and a styled caption.

    The caption text is ``caption``; it is rendered blue, 18px and bold.
    """
    caption_css = {
        'selector': 'caption',
        'props': [
            ('color', 'Blue'),
            ('font-size', '18px'),
            ('font-weight','bold'),
        ],
    }
    styler = df.style.background_gradient(cmap='Blues')
    styler = styler.set_caption(caption)
    return styler.set_table_styles([caption_css])


# Show describe() of the engineered frame as a styled, captioned table.
display(set_frame_style(df_modified.describe(),' Data : Summary Statistics'))

In [None]:
# Split the combined frame back into train and test after encoding:
# test rows are exactly those whose num_sold is still the NaN placeholder.
is_test_row = df_modified['num_sold'].isna()
df_train = df_modified.loc[~is_test_row]
df_test = df_modified.loc[is_test_row]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score


# Correlation of every numeric training column with the target.
corr = df_train.corr(numeric_only=True)
target_corr = corr['num_sold'].drop('num_sold')

# Sort correlation values in descending order
target_corr_sorted = target_corr.sort_values(ascending=False)

# Create a heatmap of the correlations with the target column
sns.set(font_scale=0.8)
sns.set_style("white")
sns.set_palette("PuBuGn_d")
sns.heatmap(target_corr_sorted.to_frame(), cmap="coolwarm", annot=True, fmt='.2f')
# The title names the actual target; the old text ('Total Cup Points') referred to a
# column that does not exist in this dataset.
plt.title('Correlation with num_sold')
plt.show()

In [None]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit



def objective(trial, df):
    """Optuna objective: mean SMAPE of a random forest over time-ordered folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth``,
        ``min_samples_split`` and ``min_samples_leaf``.
    df : pandas.DataFrame
        Labelled rows containing ``num_sold`` (target) and ``id``; every other
        column is used as a feature. Rows are split in their existing order.

    Returns
    -------
    float
        SMAPE averaged over the 5 ``TimeSeriesSplit`` folds (lower is better).
    """
    # NOTE(review): 'id' is excluded here, but train_rf_regressor fits its final
    # models with 'id' still among the features — the tuned score therefore
    # describes a slightly different feature set.
    X = df.drop(['num_sold', 'id'], axis=1)
    y = df['num_sold']

    # Number of time-ordered folds
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Sample the hyperparameters once per trial. They do not depend on the fold,
    # and repeating the same suggest call inside the loop only returned the same
    # values again.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'random_state': 0
    }

    fold_scores = []
    for train_index, val_index in tscv.split(X):
        # Fit on the earlier rows, score on the rows that follow them
        model = RandomForestRegressor(**params)
        model.fit(X.iloc[train_index], y.iloc[train_index])

        y_pred = model.predict(X.iloc[val_index])
        fold_scores.append(calculate_smape(y.iloc[val_index], y_pred))

    # Average the per-fold metric
    return sum(fold_scores) / len(fold_scores)



def calculate_smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Computes ``mean(2 * |pred - true| / (|pred| + |true|)) * 100``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of equal length (Series, arrays or lists).

    Returns
    -------
    float
        SMAPE in the range 0-200.

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    if actual.size == 0:
        raise ValueError("calculate_smape requires at least one observation")

    abs_error = 2.0 * np.abs(forecast - actual)
    scale = np.abs(forecast) + np.abs(actual)
    # A term where actual and forecast are both zero is a perfect prediction:
    # count it as 0 instead of letting 0/0 turn the whole score into NaN.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return float(np.mean(ratio) * 100)



In [None]:
def train_rf_regressor(df, n_trials=3, n_splits=5, random_state=0):
    """Tune a random forest with Optuna, then fit one model per time-series fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled rows; ``num_sold`` is the target and every other column is a
        feature for the final models.
    n_trials : int, default 3
        Number of Optuna trials to run.
    n_splits : int, default 5
        Number of ``TimeSeriesSplit`` folds, and therefore of returned models.
    random_state : int, default 0
        Seed for the final forests. ``study.best_params`` only contains the
        sampled hyperparameters, so without this the final models would be
        unseeded even though the tuning runs used ``random_state=0``.

    Returns
    -------
    list of RandomForestRegressor
        One fitted model per fold, each trained on that fold's training rows.
    """
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, df), n_trials=n_trials)

    # Best hyperparameters found by the study
    best_params = study.best_params

    # NOTE(review): unlike objective(), 'id' stays in the feature matrix here.
    # It is kept because the prediction cell passes the test frame with its
    # 'id' column; removing it needs a matching change there.
    X = df.drop('num_sold', axis=1)
    y = df['num_sold']

    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Fitted models, one per fold
    models = []
    for train_index, _ in tscv.split(X):
        # Only the training part of each fold is used; nothing is scored here
        model = RandomForestRegressor(**best_params, random_state=random_state)
        model.fit(X.iloc[train_index], y.iloc[train_index])
        models.append(model)

    return models


In [None]:
# Tune and fit on the labelled rows. Despite the singular name, best_model is the
# list of per-fold models returned by train_rf_regressor.
best_model = train_rf_regressor(df_train)

In [None]:
# Remove the NaN placeholder target so the remaining columns are the ones passed to predict().
# NOTE: df_test is rebound here, so running this cell a second time raises KeyError.
df_test = df_test.drop(['num_sold'], axis=1)

In [None]:
sample_submission = pd.read_csv('/kaggle/input/playground-series-s3e19/sample_submission.csv')

# One prediction vector per trained fold model
all_predictions = [model.predict(df_test) for model in best_model]

# Average the predictions of all models, element-wise
mean_predictions = np.mean(all_predictions, axis=0)

# Predictions are matched to submission rows purely by position, so the lengths must agree.
if len(mean_predictions) != len(sample_submission):
    raise ValueError(
        f"Got {len(mean_predictions)} predictions for "
        f"{len(sample_submission)} submission rows"
    )

# Replace the second column of sample_submission with the averaged predictions.
# Assigning by column label swaps the whole column, whereas `.iloc[:, 1] = ...`
# writes into the existing column and is deprecated when that column's dtype
# (e.g. integer placeholders) cannot hold the float predictions.
target_column = sample_submission.columns[1]
sample_submission[target_column] = mean_predictions

# Save the submission as CSV
sample_submission.to_csv('submission.csv', index=False)