In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import joblib

# Carregar os dados
df = pd.read_csv("../Data/Dataframelimpa_sem_latlong.csv")

# Pré-processamento dos Dados
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])

# Função para extrair características de data/hora
def extract_datetime_features(df):
    df['purchase_weekday'] = df['order_purchase_timestamp'].dt.weekday
    df['purchase_month'] = df['order_purchase_timestamp'].dt.month
    df['purchase_hour'] = df['order_purchase_timestamp'].dt.hour
    df['approval_delay'] = (df['order_approved_at'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # em horas
    return df

df = extract_datetime_features(df)

# Variáveis preditoras e alvo
X = df[['purchase_weekday', 'purchase_month', 'purchase_hour', 'approval_delay']]
y = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # Prever em horas

# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline de pré-processamento
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['approval_delay']),
        ('cat', OneHotEncoder(), ['purchase_weekday', 'purchase_month', 'purchase_hour'])
    ])

# Pipeline completo
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Treinar o modelo
model_pipeline.fit(X_train, y_train)

# Fazer previsões
y_pred = model_pipeline.predict(X_test)

# Avaliar o modelo
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse}')

# Salvar o modelo para uso futuro
joblib.dump(model_pipeline, 'modelo_previsao_data_entrega.pkl')

# Exemplo de uso do modelo salvo em novos dados
modelo_carregado = joblib.load('modelo_previsao_data_entrega.pkl')


MSE: 20220.21515022603


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from concurrent.futures import ThreadPoolExecutor
import joblib

# Carregar os dados
df = pd.read_csv("Data/Dataframelimpa_sem_latlong.csv")

df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])

# Função para extrair características de data/hora
def extract_datetime_features(df):
    df['purchase_weekday'] = df['order_purchase_timestamp'].dt.weekday
    df['purchase_month'] = df['order_purchase_timestamp'].dt.month
    df['purchase_hour'] = df['order_purchase_timestamp'].dt.hour
    df['approval_delay'] = (df['order_approved_at'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # em horas
    return df

df = extract_datetime_features(df)

# Variáveis preditoras e alvo
X = df[['purchase_weekday', 'purchase_month', 'purchase_hour', 'approval_delay']]
y = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # Prever em horas

# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline de pré-processamento
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['approval_delay']),
        ('cat', OneHotEncoder(), ['purchase_weekday', 'purchase_month', 'purchase_hour'])
    ])

# Pipeline completo
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# GridSearchCV para encontrar os melhores hiperparâmetros
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Função para treinar um modelo com um conjunto de parâmetros
def train_model(params):
    model_pipeline.set_params(**params)
    model_pipeline.fit(X_train, y_train)
    return model_pipeline

# Usar ThreadPoolExecutor para paralelizar GridSearchCV
with ThreadPoolExecutor() as executor:
    grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Fazer previsões
y_pred = best_model.predict(X_test)

# Avaliar o modelo
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse}')

# Salvar o modelo para uso futuro
joblib.dump(best_model, 'modelo_previsao_data_entrega.pkl')

# Exemplo de uso do modelo salvo em novos dados
modelo_carregado = joblib.load('modelo_previsao_data_entrega.pkl')


In [4]:
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])

df_novo = extract_datetime_features(df)

X_novo = df_novo[['purchase_weekday', 'purchase_month', 'purchase_hour', 'approval_delay']]
previsoes = modelo_carregado.predict(X_novo)

df_novo['previsao_data_entrega_horas'] = previsoes
df_novo['previsao_data_entrega'] = df_novo['order_purchase_timestamp'] + pd.to_timedelta(df_novo['previsao_data_entrega_horas'], unit='h')

print(df_novo)

                               order_id                       customer_id  \
0      00010242fe8c5a6d1ba2dd792cb16214  3ce436f183e68e07877b285a838db11a   
1      000229ec398224ef6ca0657da4fc703e  6489ae5e4333f3693df5ad4372dab6d3   
2      00048cc3ae777c65dbb7d2a0634bc1ea  816cbea969fe5b689b39cfc97a506742   
3      00054e8431b9d7675808bcb819fb4a32  32e2e6ab09e778d99bf2e0ecd4898718   
4      0005a1a1728c9d785b8e2b08b904576c  16150771dfd4776261284213b89c304e   
...                                 ...                               ...   
59347  fff8287bbae429a99bb7e8c21d151c41  6c1e92a209dbf868706caa831090941e   
59348  fff90cdcb3b2e6cfb397d05d562fd3fe  f6cc7b845fde9d4e71361fe6fcd7ef75   
59349  fffa82886406ccf10c7b4e35c4ff2788  a5201e1a6d71a8d21e869151bd5b4085   
59350  fffce4705a9662cd70adb13d4a31832d  29309aa813182aaddc9b259e31b870e6   
59351  fffe18544ffabc95dfada21779c9644f  b5e6afd5a41800fdf401e0272ca74655   

      order_status order_purchase_timestamp  Pedido em aprovação  \
0      