In [1]:
# imports
# manipulação de dados
import pandas as pd
import numpy as np

# visualização de dados
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# configuração de warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Import the necessary modules
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.tree import DecisionTreeRegressor
# Create three individual models
model1 = RandomForestRegressor()
model2 = GradientBoostingRegressor()
model3 = DecisionTreeRegressor()

# Define the ensemble model
ensemble_model = VotingRegressor(estimators=[
    ('rf', model1), ('gb', model2), ('dt', model3)
])



In [5]:
# leitura da base
df = pd.read_csv('Walmart.csv')

In [6]:
# converte a coluna Date para o formato datetime
df['Date'] = pd.to_datetime(df.Date, format='%d-%m-%Y')

In [7]:
# ordena a base de dados pelas colunas de Data e Vendas
df = (
    df
     .sort_values(
        by=['Date', 'Weekly_Sales'],
        ascending=True)
     .reset_index(drop=True)
)

In [8]:
# cria colunas a partir da data
df['month'] = df.Date.dt.month
df['year'] = df.Date.dt.year

In [9]:
import datetime
# Função para converter data para integer
def ajustar_data( Data ):
    """
    Função para converter Data em Integer
    """
    data_formatada = int(Data.strftime("%d%m%Y"))
    
    return data_formatada

In [10]:
df['Date'] = df['Date'].apply( ajustar_data )

In [11]:
df.drop(columns='Fuel_Price', axis=1, inplace=True)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
y = df['Weekly_Sales']
X = df.drop(columns=['Weekly_Sales'], axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5, test_size=0.3)


In [17]:
# Train the ensemble model
ensemble_model.fit(X_train, y_train)




In [21]:
# Make predictions with the ensemble model
y_pred_test_model = ensemble_model.predict(X_test)
# previsão com os dados de treino
y_pred_train_model = ensemble_model.predict(X_train)


In [22]:
ensemble_model.estimators

[('rf', RandomForestRegressor()),
 ('gb', GradientBoostingRegressor()),
 ('dt', DecisionTreeRegressor())]

In [23]:
from numpy import sqrt

In [24]:
# define uma função para calcular o rmse
def rmse(y_true, y_pred):
    "Calcula o RMSE."
    return round(sqrt(mean_squared_error(y_true, y_pred)), 2)



In [25]:
# métrica nos dados de treino
rmse(y_train, y_pred_train_model)

# métrica nos dados de teste
rmse(y_test, y_pred_test_model)


122682.86

In [27]:

# métrica nos dados de treino
ensemble_model.score(X_train, y_train)


0.9863555059147205

In [28]:
# Score dados de teste
ensemble_model.score(X_test,y_test)

0.9542112985670792