In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import joblib

# Carregar os dados
df = pd.read_csv("Data/Dataframelimpa_sem_latlong.csv")

# Pré-processamento dos Dados
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])

# Função para extrair características de data/hora
def extract_datetime_features(df):
    df['purchase_weekday'] = df['order_purchase_timestamp'].dt.weekday
    df['purchase_month'] = df['order_purchase_timestamp'].dt.month
    df['purchase_hour'] = df['order_purchase_timestamp'].dt.hour
    df['approval_delay'] = (df['order_approved_at'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # em horas
    return df

df = extract_datetime_features(df)

# Variáveis preditoras e alvo
X = df[['purchase_weekday', 'purchase_month', 'purchase_hour', 'approval_delay']]
y = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # Prever em horas

# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline de pré-processamento
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['approval_delay']),
        ('cat', OneHotEncoder(), ['purchase_weekday', 'purchase_month', 'purchase_hour'])
    ])

# Pipeline completo
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Treinar o modelo
model_pipeline.fit(X_train, y_train)

# Fazer previsões
y_pred = model_pipeline.predict(X_test)

# Avaliar o modelo
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse}')

# Salvar o modelo para uso futuro
joblib.dump(model_pipeline, 'modelo_previsao_data_entrega.pkl')

# Exemplo de uso do modelo salvo em novos dados
modelo_carregado = joblib.load('modelo_previsao_data_entrega.pkl')


MSE: 20220.21515022603


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
from concurrent.futures import ThreadPoolExecutor

# Carregar os dados
df = pd.read_csv("Data/Dataframelimpa_sem_latlong.csv")

# Pré-processamento dos Dados
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])

# Função para extrair características de data/hora
def extract_datetime_features(df):
    df['purchase_weekday'] = df['order_purchase_timestamp'].dt.weekday
    df['purchase_month'] = df['order_purchase_timestamp'].dt.month
    df['purchase_hour'] = df['order_purchase_timestamp'].dt.hour
    df['approval_delay'] = (df['order_approved_at'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # em horas
    return df

# Calcular o volume do produto
df['product_volume'] = df['product_length_cm'] * df['product_height_cm'] * df['product_width_cm']

df = extract_datetime_features(df)

# Garantir que não haja valores NaN nos dados
df = df.fillna(0)

# Variáveis preditoras e alvo
X = df[['purchase_weekday', 'purchase_month', 'purchase_hour', 'approval_delay', 'product_volume']]
y = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # Prever em horas

# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline de pré-processamento
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['approval_delay', 'product_volume']),
        ('cat', OneHotEncoder(), ['purchase_weekday', 'purchase_month', 'purchase_hour'])
    ])

# Pipeline completo
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Usar ThreadPoolExecutor para treino do modelo
def train_model(pipeline, X_train, y_train):
    pipeline.fit(X_train, y_train)

with ThreadPoolExecutor() as executor:
    future = executor.submit(train_model, model_pipeline, X_train, y_train)
    future.result()  # Esperar a conclusão do treino

# Fazer previsões
y_pred = model_pipeline.predict(X_test)

# Avaliar o modelo
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'MSE: {mse}')
print(f'MAE: {mae}')

# Salvar o modelo para uso futuro
joblib.dump(model_pipeline, 'modelo_previsao_data_entrega.pkl')

# Exemplo de uso do modelo salvo em novos dados
modelo_carregado = joblib.load('modelo_previsao_data_entrega.pkl')


MSE: 18521.300670029792
MAE: 105.63585213014834


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
from concurrent.futures import ThreadPoolExecutor

# Carregar os dados
df = pd.read_csv("Data/Dataframelimpa_sem_latlong.csv")

# Pré-processamento dos Dados
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])

# Função para extrair características de data/hora
def extract_datetime_features(df):
    df['purchase_weekday'] = df['order_purchase_timestamp'].dt.weekday
    df['purchase_month'] = df['order_purchase_timestamp'].dt.month
    df['purchase_hour'] = df['order_purchase_timestamp'].dt.hour
    df['approval_delay'] = (df['order_approved_at'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # em horas
    return df

# Calcular o volume do produto
df['product_volume'] = df['product_length_cm'] * df['product_height_cm'] * df['product_width_cm']

df = extract_datetime_features(df)

# Garantir que não haja valores NaN nos dados
df = df.fillna(0)

# Variáveis preditoras e alvo
X = df[['purchase_weekday', 'purchase_month', 'purchase_hour', 'approval_delay', 'product_volume', 'product_weight_g']]
y = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # Prever em horas

# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline de pré-processamento
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['approval_delay', 'product_volume', 'product_weight_g']),
        ('cat', OneHotEncoder(), ['purchase_weekday', 'purchase_month', 'purchase_hour'])
    ])

# Pipeline completo
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Usar ThreadPoolExecutor para treino do modelo
def train_model(pipeline, X_train, y_train):
    pipeline.fit(X_train, y_train)

with ThreadPoolExecutor() as executor:
    future = executor.submit(train_model, model_pipeline, X_train, y_train)
    future.result()  # Esperar a conclusão do treino

# Fazer previsões
y_pred = model_pipeline.predict(X_test)

# Avaliar o modelo
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'MSE: {mse}')
print(f'MAE: {mae}')

# Salvar o modelo para uso futuro
joblib.dump(model_pipeline, 'modelo_previsao_data_entrega.pkl')

# Exemplo de uso do modelo salvo em novos dados
modelo_carregado = joblib.load('modelo_previsao_data_entrega.pkl')


MSE: 18136.803027188955
MAE: 104.82506590254518


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
from concurrent.futures import ThreadPoolExecutor

# Carregar os dados
df = pd.read_csv("Data/Dataframelimpa_sem_latlong.csv")

# Pré-processamento dos Dados
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])

# Função para extrair características de data/hora
def extract_datetime_features(df):
    df['purchase_weekday'] = df['order_purchase_timestamp'].dt.weekday
    df['purchase_month'] = df['order_purchase_timestamp'].dt.month
    df['purchase_hour'] = df['order_purchase_timestamp'].dt.hour
    df['approval_delay'] = (df['order_approved_at'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # em horas
    return df

# Função para calcular a similaridade dos prefixos de CEP
def calculate_zip_similarity(df):
    def compare_zip_codes(row):
        seller_zip = str(row['seller_zip_code_prefix'])
        customer_zip = str(row['customer_zip_code_prefix'])
        similarity = sum(1 for s, c in zip(seller_zip, customer_zip) if s == c)
        return similarity

    df['zip_code_similarity'] = df.apply(compare_zip_codes, axis=1)
    return df

# Calcular o volume do produto
df['product_volume'] = df['product_length_cm'] * df['product_height_cm'] * df['product_width_cm']

df = extract_datetime_features(df)
df = calculate_zip_similarity(df)

# Garantir que não haja valores NaN nos dados
df = df.fillna(0)

# Variáveis preditoras e alvo
X = df[['purchase_weekday', 'purchase_month', 'purchase_hour', 'approval_delay', 'product_volume', 'product_weight_g', 'zip_code_similarity']]
y = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # Prever em horas

# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline de pré-processamento
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['approval_delay', 'product_volume', 'product_weight_g', 'zip_code_similarity']),
        ('cat', OneHotEncoder(), ['purchase_weekday', 'purchase_month', 'purchase_hour'])
    ])

# Pipeline completo
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Usar ThreadPoolExecutor para treino do modelo
def train_model(pipeline, X_train, y_train):
    pipeline.fit(X_train, y_train)

with ThreadPoolExecutor() as executor:
    future = executor.submit(train_model, model_pipeline, X_train, y_train)
    future.result()  # Esperar a conclusão do treino

# Fazer previsões
y_pred = model_pipeline.predict(X_test)

# Avaliar o modelo
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'MSE: {mse}')
print(f'MAE: {mae}')

# Salvar o modelo para uso futuro
joblib.dump(model_pipeline, 'modelo_previsao_data_entrega.pkl')

# Exemplo de uso do modelo salvo em novos dados
modelo_carregado = joblib.load('modelo_previsao_data_entrega.pkl')


MSE: 18007.406524008937
MAE: 104.4635009544379


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
from concurrent.futures import ThreadPoolExecutor

# Carregar os dados
df = pd.read_csv("Data/DataFrame_final_com_juncao.csv")

# Pré-processamento dos Dados
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])

# Função para extrair características de data/hora
def extract_datetime_features(df):
    df['purchase_weekday'] = df['order_purchase_timestamp'].dt.weekday
    df['purchase_month'] = df['order_purchase_timestamp'].dt.month
    df['purchase_hour'] = df['order_purchase_timestamp'].dt.hour
    df['approval_delay'] = (df['order_approved_at'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # em horas
    return df

# Função para calcular a similaridade dos prefixos de CEP
def calculate_zip_similarity(df):
    def compare_zip_codes(row):
        seller_zip = str(row['seller_zip_code_prefix'])
        customer_zip = str(row['customer_zip_code_prefix'])
        similarity = sum(1 for s, c in zip(seller_zip, customer_zip) if s == c)
        return similarity

    df['zip_code_similarity'] = df.apply(compare_zip_codes, axis=1)
    return df

# Calcular o volume do produto
df['product_volume'] = df['product_length_cm'] * df['product_height_cm'] * df['product_width_cm']

# Adicionar a variável de frete (freight_value)
df['freight_value'] = df['freight_value']

df = extract_datetime_features(df)
df = calculate_zip_similarity(df)

# Garantir que não haja valores NaN nos dados
df = df.fillna(0)

# Variáveis preditoras e alvo
X = df[['purchase_weekday', 'purchase_month', 'purchase_hour', 'approval_delay', 'product_volume', 'product_weight_g', 'zip_code_similarity', 'freight_value']]
y = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # Prever em horas

# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline de pré-processamento
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['approval_delay', 'product_volume', 'product_weight_g', 'zip_code_similarity', 'freight_value']),
        ('cat', OneHotEncoder(), ['purchase_weekday', 'purchase_month', 'purchase_hour'])
    ])

# Pipeline completo
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Usar ThreadPoolExecutor para treino do modelo
def train_model(pipeline, X_train, y_train):
    pipeline.fit(X_train, y_train)

with ThreadPoolExecutor() as executor:
    future = executor.submit(train_model, model_pipeline, X_train, y_train)
    future.result()  # Esperar a conclusão do treino

# Fazer previsões
y_pred = model_pipeline.predict(X_test)

# Avaliar o modelo
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'MSE: {mse}')
print(f'MAE: {mae}')

# Salvar o modelo para uso futuro
joblib.dump(model_pipeline, 'modelo_previsao_data_entrega.pkl')

# Exemplo de uso do modelo salvo em novos dados
modelo_carregado = joblib.load('modelo_previsao_data_entrega.pkl')


MSE: 12779.349752200991
MAE: 85.40084706472277


In [17]:
import pandas as pd

# Carregar a DataFrame principal e a DataFrame order_items
df_principal = pd.read_csv("Data/Dataframelimpa_sem_latlong.csv")
df_order_items = pd.read_csv("Data/order_items.csv")

# Corrigir zip codes
df_principal['seller_zip_code_prefix'] = df_principal['seller_zip_code_prefix'].astype(str).apply(lambda x: x.zfill(5) if len(x) == 4 else x)
df_principal['customer_zip_code_prefix'] = df_principal['customer_zip_code_prefix'].astype(str).apply(lambda x: x.zfill(5) if len(x) == 4 else x)

# Remover linhas duplicadas da DataFrame order_items com base na coluna order_id
df_order_items_clean = df_order_items.drop_duplicates(subset=['order_id'])

# Realizar o merge usando pandas
df_final = pd.merge(df_principal, df_order_items_clean, on='order_id', how='left')

# Verificar o tamanho da DataFrame final
print(f"Número de linhas na DataFrame final: {len(df_final)}")

df_final.to_csv("Data/DataFrame_final_com_juncao.csv", index=False)


Número de linhas na DataFrame final: 59352


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from concurrent.futures import ThreadPoolExecutor
import joblib

# Carregar os dados
df = pd.read_csv("Data/Dataframelimpa_sem_latlong.csv")

df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])

# Função para extrair características de data/hora
def extract_datetime_features(df):
    df['purchase_weekday'] = df['order_purchase_timestamp'].dt.weekday
    df['purchase_month'] = df['order_purchase_timestamp'].dt.month
    df['purchase_hour'] = df['order_purchase_timestamp'].dt.hour
    df['approval_delay'] = (df['order_approved_at'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # em horas
    return df

df = extract_datetime_features(df)

# Variáveis preditoras e alvo
X = df[['purchase_weekday', 'purchase_month', 'purchase_hour', 'approval_delay']]
y = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # Prever em horas

# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline de pré-processamento
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['approval_delay']),
        ('cat', OneHotEncoder(), ['purchase_weekday', 'purchase_month', 'purchase_hour'])
    ])

# Pipeline completo
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# GridSearchCV para encontrar os melhores hiperparâmetros
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Função para treinar um modelo com um conjunto de parâmetros
def train_model(params):
    model_pipeline.set_params(**params)
    model_pipeline.fit(X_train, y_train)
    return model_pipeline

# Usar ThreadPoolExecutor para paralelizar GridSearchCV
with ThreadPoolExecutor() as executor:
    grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Fazer previsões
y_pred = best_model.predict(X_test)

# Avaliar o modelo
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse}')

# Salvar o modelo para uso futuro
joblib.dump(best_model, 'modelo_previsao_data_entrega.pkl')

# Exemplo de uso do modelo salvo em novos dados
modelo_carregado = joblib.load('modelo_previsao_data_entrega.pkl')


MSE: 18843.643010133725


In [None]:
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])

df_novo = extract_datetime_features(df)

X_novo = df_novo[['purchase_weekday', 'purchase_month', 'purchase_hour', 'approval_delay']]
previsoes = modelo_carregado.predict(X_novo)

df_novo['previsao_data_entrega_horas'] = previsoes
df_novo['previsao_data_entrega'] = df_novo['order_purchase_timestamp'] + pd.to_timedelta(df_novo['previsao_data_entrega_horas'], unit='h')

print(df_novo)