In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
# Removendo linhas duplicadas

class RemoveDuplicates(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        print("Removendo duplicatas do DataFrame")
        linhas_inicio = X.shape[0]
        X_copy = X.copy()
        X_copy.drop_duplicates(inplace=True)
        linhas_fim = X_copy.shape[0]
        print(f"Tratamento de duplicatas concluído. Foram removidas {linhas_inicio - linhas_fim} linhas.")
        print()
        return X_copy

In [4]:
# Removendo linhas em que o pedido foi cancelado ou que possuem produtos sem dimensão

class RemoveInvalidRows(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns_to_check = ["order_approved_at", "product_height_cm"]
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        initial_rows = X.shape[0]
        X_copy = X.dropna(subset=self.columns_to_check)
        removed_rows = initial_rows - X_copy.shape[0]
        print(f"Removidas {removed_rows} linhas devido a valores nulos nas colunas: {', '.join(self.columns_to_check)}.")
        print()
        return X_copy


In [5]:
# Removendo colunas que não fornecem informações valiosas para o nosso modelo 
## ("'order_id.1', 'customer_id.1', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty")

# Removendo colunas de localização, pois possuem muitos dados faltantes, sendo que já temos as informações de zip code, suficientes para o nosso modelo
## ('customer_city', 'customer_state', 'seller_city', 'seller_state')

class CleanColumns(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        columns_to_remove = [
            'order_id.1', 'customer_id.1', 'product_name_lenght', 
            'product_description_lenght', 'product_photos_qty',
            'customer_city', 'customer_state', 'seller_city', 'seller_state'
        ]
        print("Removendo colunas específicas do DataFrame")
        X_copy = X.copy()
        X_copy.drop(columns=columns_to_remove, inplace=True)
        print(f"Colunas removidas: {columns_to_remove}")
        print()
        return X_copy

In [6]:
# Convertendo colunas de data para a formatação DateTime

class ConvertToDateTime(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = [
            "order_purchase_timestamp",
            "order_approved_at",
            "order_delivered_carrier_date",
            "order_delivered_customer_date"
        ]
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        for column in self.columns:
            if column in X_copy.columns:
                print(f"Convertendo coluna {column} para datetime")
                X_copy[column] = pd.to_datetime(X_copy[column], errors='coerce')
        print()
        return X_copy

In [7]:
# Criando novas colunas para analisar o intervalo de tempo dedicado a cada etapa do processo

class AddTimeAnalysisColumns(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()

        # Calcular tempo de pedido em aprovação
        PEDIDO_APROVACAO = X_copy["order_approved_at"] - X_copy["order_purchase_timestamp"]
        PA2 = [(elem.total_seconds() / (24 * 3600)) if pd.notnull(elem) else None for elem in PEDIDO_APROVACAO]
        PA2 = [round(elem, 2) if elem is not None else None for elem in PA2]

        # Calcular tempo de transporte do pedido
        PEDIDO_EM_TRANSPORTE = X_copy["order_delivered_carrier_date"] - X_copy["order_approved_at"]
        PT2 = [(elem.total_seconds() / (24 * 3600)) if pd.notnull(elem) else None for elem in PEDIDO_EM_TRANSPORTE]
        PT2 = [round(elem, 2) if elem is not None else None for elem in PT2]

        # Calcular diferença entre o momento da entrega final pela transportadora até a confirmação do comprador
        ENTREGA_CONFIRMACAO = X_copy["order_delivered_customer_date"] - X_copy["order_delivered_carrier_date"]
        EC2 = [(elem.total_seconds() / (24 * 3600)) if pd.notnull(elem) else None for elem in ENTREGA_CONFIRMACAO]
        EC2 = [round(elem, 2) if elem is not None else None for elem in EC2]

        # Inserir novas colunas no DataFrame
        X_copy.insert(4, "Pedido em aprovação", PA2)
        X_copy.insert(6, "Pedido em transporte", PT2)
        X_copy.insert(8, "Diferença entre entrega e confirmação", EC2)

        return X_copy

In [8]:
# Removendo os casos em que o intervalo de tempo de determinada etapa é negativo

class RemoveNegatives(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns_to_treat = ["Pedido em aprovação", "Pedido em transporte", "Diferença entre entrega e confirmação"]
        self.linhas_removidas = {}
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        for coluna in self.columns_to_treat:
            linhas_inicio = X_copy.shape[0]
            X_copy = X_copy[X_copy[coluna] >= 0]  
            self.linhas_removidas[coluna] = linhas_inicio - X_copy.shape[0]
        
        for coluna, num_linhas in self.linhas_removidas.items():
            print(f"Tratamento de números negativos concluído. Foram removidas {num_linhas} linhas devido à coluna '{coluna}'.")
        print()

        return X_copy

In [9]:
# Retirando outliers superiores

class OutliersTreatment(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns_to_treat = ["delivery_time", "Pedido em aprovação", "Pedido em transporte", "Diferença entre entrega e confirmação"]
        self.outliers_limits = {}
        self.removed_lines = {}
    
    def fit(self, X, y=None):
        for coluna in self.columns_to_treat:
            q1 = np.percentile(X[coluna], 25)
            q3 = np.percentile(X[coluna], 75)
            iqr = q3 - q1
            out_sup = q3 + iqr * 1.5
            out_inf = q1 - iqr * 1.5
            self.outliers_limits[coluna] = (out_inf, out_sup)
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        self.removed_lines = {}
        for coluna in self.columns_to_treat:
            out_inf, out_sup = self.outliers_limits[coluna]
            initial_rows = X_copy.shape[0]
            X_copy = X_copy[X_copy[coluna] < out_sup]
            removed = initial_rows - X_copy.shape[0]
            self.removed_lines[coluna] = removed
            print(f"Tratamento de outliers para coluna '{coluna}' concluído. Foram removidas {removed} linhas.")
        print()
        return X_copy


In [10]:
# Agrupando categorias de produtos 

class AgruparCategorias(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.categorias_a_agrupar = {
            'construcao_ferramentas_ferramentas': 'construcao_ferramentas',
            'construcao_ferramentas_construcao': 'construcao_ferramentas',
            'construcao_ferramentas_jardim': 'construcao_ferramentas',
            'construcao_ferramentas_iluminacao': 'construcao_ferramentas',
            'construcao_ferramentas_seguranca': 'construcao_ferramentas',
            'ferramentas_jardim': 'construcao_ferramentas',

            'moveis_sala': 'moveis',
            'moveis_quarto': 'moveis',
            'moveis_colchao_e_estofado': 'moveis',
            'moveis_cozinha_area_de_servico_jantar_e_jardim': 'moveis',
            'moveis_decoracao': 'moveis',
            'moveis_escritorio': 'moveis',

            'pc_gamer': 'pcs',

            'artes_e_artesanato': 'artes',

            'telefonia_fixa': 'telefonia',

            'alimentos': 'alimentos_bebidas',
            'bebidas': 'alimentos_bebidas',

            'cds_dvds_musicais': 'cds_dvds',
            'dvds_blu_ray': 'cds_dvds',

            'portateis_casa_forno_e_cafe': 'eletroportateis',

            'casa_conforto_2': 'casa_conforto',

            'eletrodomesticos_2': 'eletrodomesticos',

            'malas_acessorios': 'fashion',
            'fashion_bolsas_e_acessorios': 'fashion',
            'fashion_calcados': 'fashion',
            'fashion_underwear_e_moda_praia': 'fashion',
            'fashion_roupa_masculina': 'fashion',
            'fashion_esporte': 'fashion',
            'fashion_roupa_feminina': 'fashion',
            'fashion_roupa_infanto_juvenil': 'fashion',

            'eletronicos': 'informatica_acessorios',
            'tablets_impressao_imagem': 'informatica_acessorios',

            'la_cuisine': 'utilidades_domesticas',
            
            'fraldas_higiene': 'bebes'
        }
        self.num_modified = 0
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        modified_rows = X_copy['product_category_name'].replace(self.categorias_a_agrupar)
        self.num_modified = (X_copy['product_category_name'] != modified_rows).sum()
        X_copy['product_category_name'] = modified_rows
        print(f"Agrupamento de categorias concluído. Foram modificadas {self.num_modified} linhas.")
        return X_copy


In [11]:
pipeline_preprocessamento = Pipeline([
    ('Remover duplicatas', RemoveDuplicates()),
    ('Remover linhas nulas', RemoveInvalidRows()),
    ('Tirar colunas desnecessárias', CleanColumns()),
    ('Converter colunas para DateTime', ConvertToDateTime()),
    ('Criando colunas com intervalos de tempo', AddTimeAnalysisColumns()),
    ('Remover intervalos de tempo negativos', RemoveNegatives()),
    ('Remover outliers', OutliersTreatment()),
    ('Agrupar categorias de produtos', AgruparCategorias())
])

df = pd.read_csv("Data/DE&CO_14_06.csv")
#display(df)
frete= pd.read_csv("Data/order_items.csv")
#display(frete)
frete.drop_duplicates(inplace=True)
df=df.merge(frete,how="left",on="order_id")

dados_preprocessados = pipeline_preprocessamento.fit_transform(df)
display(df)


Removendo duplicatas do DataFrame
Tratamento de duplicatas concluído. Foram removidas 25184 linhas.

Removidas 35 linhas devido a valores nulos nas colunas: order_approved_at, product_height_cm.

Removendo colunas específicas do DataFrame
Colunas removidas: ['order_id.1', 'customer_id.1', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'customer_city', 'customer_state', 'seller_city', 'seller_state']

Convertendo coluna order_purchase_timestamp para datetime
Convertendo coluna order_approved_at para datetime
Convertendo coluna order_delivered_carrier_date para datetime
Convertendo coluna order_delivered_customer_date para datetime

Tratamento de números negativos concluído. Foram removidas 0 linhas devido à coluna 'Pedido em aprovação'.
Tratamento de números negativos concluído. Foram removidas 507 linhas devido à coluna 'Pedido em transporte'.
Tratamento de números negativos concluído. Foram removidas 115 linhas devido à coluna 'Diferença entre entrega e con

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,delivery_time,delivery_time_model,product_id_x,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,seller_id_x,seller_zip_code_prefix,seller_city,seller_state,order_id.1,payment_sequential,payment_type,payment_installments,payment_value,customer_id.1,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_item_id,product_id_y,seller_id_y,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,delivered,2017-09-13 08:59:02.000,2017-09-13 09:45:35.000,2017-09-19 18:34:16.000,2017-09-20 23:43:48.000,7.61,15.63,4244733e06e7ecb4970a6e2683c13e61,cool_stuff,58.0,598.0,4.0,650.0,28.0,9.0,14.0,48436dade18ac8b2bce089ec2a041202,27277,volta redonda,SP,00010242fe8c5a6d1ba2dd792cb16214,1,credit_card,2,72.19,3ce436f183e68e07877b285a838db11a,871766c5855e863f6eccc05f988b23cb,28013,campos dos goytacazes,RJ,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35.000,58.90,13.29
1,00018f77f2f0320c557190d7a144bdd3,f6dd3ec061db4e3987629fe6b26e5cce,delivered,2017-04-26 10:53:06.000,2017-04-26 11:05:13.000,2017-05-04 14:35:00.000,2017-05-12 16:04:24.000,16.22,18.55,e5f2d52b802189ee658865ca93d83a8f,pet_shop,56.0,239.0,2.0,30000.0,50.0,30.0,40.0,dd7ddc04e1b6c2c614352b383efe2d36,3471,sao paulo,SP,00018f77f2f0320c557190d7a144bdd3,1,credit_card,3,259.83,f6dd3ec061db4e3987629fe6b26e5cce,eb28e67c4c0b83846050ddfb8a35d051,15775,,,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13.000,239.90,19.93
2,000229ec398224ef6ca0657da4fc703e,6489ae5e4333f3693df5ad4372dab6d3,delivered,2018-01-14 14:33:31.000,2018-01-14 14:48:30.000,2018-01-16 12:36:48.000,2018-01-22 13:19:16.000,7.95,21.39,c777355d18b72b67abbeef9df44fd0fd,moveis_decoracao,59.0,695.0,2.0,3050.0,33.0,13.0,33.0,5b51032eddd242adc84c38acab88f23d,37564,borda da mata,MG,000229ec398224ef6ca0657da4fc703e,1,credit_card,5,216.87,6489ae5e4333f3693df5ad4372dab6d3,3818d81c6709e39d06b2738a8d3a2474,35661,,,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30.000,199.00,17.87
3,00042b26cf59d7ce69dfabb4e55b4fd9,58dbd0b2d70206bf40e62cd34e84d795,delivered,2017-02-04 13:57:51.000,2017-02-04 14:10:13.000,2017-02-16 09:46:09.000,2017-03-01 16:42:31.000,25.11,40.42,ac6c3623068f30de03045865e4e10089,ferramentas_jardim,59.0,409.0,1.0,3750.0,35.0,40.0,30.0,df560393f3a51e74553ab94004ba5c87,87900,loanda,PR,00042b26cf59d7ce69dfabb4e55b4fd9,1,credit_card,3,218.04,58dbd0b2d70206bf40e62cd34e84d795,64b576fb70d441e8f1b2d7d446e483c5,13226,VARZEA PAULISTA,SP,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51.000,199.90,18.14
4,00048cc3ae777c65dbb7d2a0634bc1ea,816cbea969fe5b689b39cfc97a506742,delivered,2017-05-15 21:42:34.000,2017-05-17 03:55:27.000,2017-05-17 11:05:55.000,2017-05-22 13:44:35.000,6.67,21.10,ef92defde845ab8450f9d70c526ef70f,utilidades_domesticas,36.0,558.0,1.0,450.0,24.0,8.0,15.0,6426d21aca402a131fc0a5d0960a3c90,14091,ribeirao preto,SP,00048cc3ae777c65dbb7d2a0634bc1ea,1,boleto,1,34.59,816cbea969fe5b689b39cfc97a506742,85c835d128beae5b4ce8602c491bf385,38017,,,1,ef92defde845ab8450f9d70c526ef70f,6426d21aca402a131fc0a5d0960a3c90,2017-05-23 03:55:27.000,21.90,12.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114191,fffb9224b6fc7c43ebb0904318b10b5f,4d3abb73ceb86353aeadbe698aa9d5cb,delivered,2017-10-27 16:51:00.000,2017-10-28 02:55:58.000,2017-11-10 19:31:52.000,2017-11-17 19:41:42.000,21.12,30.30,43423cdffde7fda63d0414ed38c11a73,relogios_presentes,41.0,1159.0,4.0,350.0,16.0,14.0,11.0,b1fc4f64df5a0e8b6913ab38803c57a9,24440,sao goncalo,RJ,fffb9224b6fc7c43ebb0904318b10b5f,1,boleto,1,356.76,4d3abb73ceb86353aeadbe698aa9d5cb,f736308cd9952b33b90b9fe94da9c8f5,56912,,,2,43423cdffde7fda63d0414ed38c11a73,b1fc4f64df5a0e8b6913ab38803c57a9,2017-11-03 02:55:58.000,55.00,34.19
114192,fffb9224b6fc7c43ebb0904318b10b5f,4d3abb73ceb86353aeadbe698aa9d5cb,delivered,2017-10-27 16:51:00.000,2017-10-28 02:55:58.000,2017-11-10 19:31:52.000,2017-11-17 19:41:42.000,21.12,30.30,43423cdffde7fda63d0414ed38c11a73,relogios_presentes,41.0,1159.0,4.0,350.0,16.0,14.0,11.0,b1fc4f64df5a0e8b6913ab38803c57a9,24440,sao goncalo,RJ,fffb9224b6fc7c43ebb0904318b10b5f,1,boleto,1,356.76,4d3abb73ceb86353aeadbe698aa9d5cb,f736308cd9952b33b90b9fe94da9c8f5,56912,,,3,43423cdffde7fda63d0414ed38c11a73,b1fc4f64df5a0e8b6913ab38803c57a9,2017-11-03 02:55:58.000,55.00,34.19
114193,fffb9224b6fc7c43ebb0904318b10b5f,4d3abb73ceb86353aeadbe698aa9d5cb,delivered,2017-10-27 16:51:00.000,2017-10-28 02:55:58.000,2017-11-10 19:31:52.000,2017-11-17 19:41:42.000,21.12,30.30,43423cdffde7fda63d0414ed38c11a73,relogios_presentes,41.0,1159.0,4.0,350.0,16.0,14.0,11.0,b1fc4f64df5a0e8b6913ab38803c57a9,24440,sao goncalo,RJ,fffb9224b6fc7c43ebb0904318b10b5f,1,boleto,1,356.76,4d3abb73ceb86353aeadbe698aa9d5cb,f736308cd9952b33b90b9fe94da9c8f5,56912,,,4,43423cdffde7fda63d0414ed38c11a73,b1fc4f64df5a0e8b6913ab38803c57a9,2017-11-03 02:55:58.000,55.00,34.19
114194,fffce4705a9662cd70adb13d4a31832d,29309aa813182aaddc9b259e31b870e6,delivered,2017-10-23 17:07:56.000,2017-10-24 17:14:25.000,2017-10-26 15:13:14.000,2017-10-28 12:22:22.000,4.80,17.29,72a30483855e2eafc67aee5dc2560482,esporte_lazer,43.0,869.0,1.0,967.0,21.0,24.0,19.0,c3cfdc648177fdbbbb35635a37472c53,80610,curitiba,PR,fffce4705a9662cd70adb13d4a31832d,1,credit_card,3,116.85,29309aa813182aaddc9b259e31b870e6,cd79b407828f02fdbba457111c38e4c4,4039,SAO PAULO,SP,1,72a30483855e2eafc67aee5dc2560482,c3cfdc648177fdbbbb35635a37472c53,2017-10-30 17:14:25.000,99.90,16.95


In [12]:
# Definindo o caminho para a pasta Data
current_dir = os.path.dirname(os.path.abspath('__file__'))
data_dir = os.path.join(current_dir, 'Data')


# Caminho completo para o arquivo CSV
csv_path = os.path.join(data_dir, 'Dataframelimpa_sem_latlong.csv')


# Exportando o DataFrame para um arquivo CSV
dados_preprocessados.to_csv(csv_path, index=False)