In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


### Entendendo as relações entre as tabelas:

<img src="https://i.imgur.com/HRhd2Y0.png" width="600" height="400">

Através do esquema do BD disponibilizado pelo Kaggle, pode-se entender as relações entre as tabelas.

In [43]:
# Folder (relative to the notebook) that holds the Olist CSV files.
# Kept in one place so the data location can be changed with a single edit.
DATA_DIR = 'olist'

# Load the four tables used in this notebook: orders, order items, customers and products.
olist_orders_dataset = pd.read_csv(os.path.join(DATA_DIR, 'olist_orders_dataset.csv'))
olist_order_items_dataset = pd.read_csv(os.path.join(DATA_DIR, 'olist_order_items_dataset.csv'))
olist_customers_dataset = pd.read_csv(os.path.join(DATA_DIR, 'olist_customers_dataset.csv'))
olist_products_dataset = pd.read_csv(os.path.join(DATA_DIR, 'olist_products_dataset.csv'))

In [60]:
# Count the rows of the products table that are exact duplicates of an earlier row.
olist_products_dataset.duplicated().sum()


0

In [63]:
def verify_duplicate_null(df: pd.DataFrame) -> tuple[bool, bool]:
    """Print a null/duplicate report for ``df`` and return both checks as flags.

    Two lines of diagnostics are written to stdout: the per-column count of
    null values, followed by the number of fully duplicated rows.

    Args:
        df: The DataFrame to inspect. It is not modified.

    Returns:
        A ``(has_nulls, has_duplicates)`` tuple of plain Python bools:
        ``has_nulls`` is True when at least one cell is null, and
        ``has_duplicates`` is True when at least one row repeats an earlier row.
    """
    # Each statistic is computed once and reused for both the printed report
    # and the returned flag, instead of rescanning the whole frame.
    null_counts = df.isnull().sum()
    print(f'Colunas com valores nulos:\n{null_counts}')

    duplicate_count = df.duplicated().sum()
    print(f'Número de linhas duplicados: {duplicate_count}')

    # bool() converts the NumPy comparison results into built-in bools.
    return bool(null_counts.sum() > 0), bool(duplicate_count > 0)


### Vamos entender a tabela `olist_orders_dataset`

In [64]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare: wrapping it in print() appended a stray "None" line.
olist_orders_dataset.info()
# Last expression of the cell: prints the null/duplicate report and displays
# the returned (has_nulls, has_duplicates) tuple.
verify_duplicate_null(olist_orders_dataset)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB
None
Colunas com valores nulos:
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
or

(True, False)

In [65]:
# Timestamp columns of the orders table; they are loaded from the CSV with
# dtype `object` and need to be parsed before any date arithmetic.
date_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]

# Parse every listed column in a single column-wise pass.
olist_orders_dataset[date_columns] = olist_orders_dataset[date_columns].apply(pd.to_datetime)

# Confirm that the five columns now report a datetime dtype.
olist_orders_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB


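Todos os valores nulos estão em colunas de data (`order_approved_at`, `order_delivered_carrier_date` e `order_delivered_customer_date`), e a dimensão temporal é essencial para a análise. Por isso, irei remover as linhas que possuam algum valor nulo — ou seja, os pedidos sem data de aprovação ou de entrega registrada.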

In [67]:
# Keep only the orders in which every column is populated: a row is discarded
# as soon as any of its values is missing. Index labels are left as they were.
olist_orders_dataset = olist_orders_dataset.dropna(axis='index', how='any')
olist_orders_dataset.info()


<class 'pandas.core.frame.DataFrame'>
Index: 96461 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       96461 non-null  object        
 1   customer_id                    96461 non-null  object        
 2   order_status                   96461 non-null  object        
 3   order_purchase_timestamp       96461 non-null  datetime64[ns]
 4   order_approved_at              96461 non-null  datetime64[ns]
 5   order_delivered_carrier_date   96461 non-null  datetime64[ns]
 6   order_delivered_customer_date  96461 non-null  datetime64[ns]
 7   order_estimated_delivery_date  96461 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.6+ MB


### Vamos agora verificar o `olist_order_items_dataset`

In [68]:
# DataFrame.info() prints its own report and returns None, so it is called
# bare: wrapping it in print() appended a stray "None" line to the output.
olist_order_items_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_date  112650 non-null  object 
 5   price                112650 non-null  float64
 6   freight_value        112650 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB
None


In [69]:
# Print per-column null counts and the duplicated-row count for the order-items
# table; the cell displays the returned (has_nulls, has_duplicates) tuple.
verify_duplicate_null(olist_order_items_dataset)

Colunas com valores nulos:
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64
Número de linhas duplicados: 0


(False, False)

In [70]:
# Parse the shipping deadline column into datetimes; info() above reports it as
# dtype `object`, i.e. it was loaded as plain strings.
olist_order_items_dataset['shipping_limit_date'] = pd.to_datetime(olist_order_items_dataset['shipping_limit_date'])

### Verificação da tabela `olist_products_dataset`

In [71]:
# Show the products table schema, then print its per-column null counts and
# duplicated-row count; the cell displays the returned (has_nulls, has_duplicates) tuple.
olist_products_dataset.info()
verify_duplicate_null(olist_products_dataset)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB
Colunas com valores nulos:
product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g

(True, False)

In [74]:
# Display every product row that has at least one missing value.
# NOTE(review): the original cell ended with an unfinished remark
# ("Considering that ..."); the intended conclusion still needs to be written.
olist_products_dataset.loc[olist_products_dataset.isna().any(axis='columns')]


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
105,a41e356c76fab66334f36de622ecbd3a,,,,,650.0,17.0,14.0,12.0
128,d8dee61c2034d6d075997acef1870e9b,,,,,300.0,16.0,7.0,20.0
145,56139431d72cd51f19eb9f7dae4d1617,,,,,200.0,20.0,20.0,20.0
154,46b48281eb6d663ced748f324108c733,,,,,18500.0,41.0,30.0,41.0
197,5fb61f482620cb672f5e586bb132eae9,,,,,300.0,35.0,7.0,12.0
...,...,...,...,...,...,...,...,...,...
32515,b0a0c5dd78e644373b199380612c350a,,,,,1800.0,30.0,20.0,70.0
32589,10dbe0fbaa2c505123c17fdc34a63c56,,,,,800.0,30.0,10.0,23.0
32616,bd2ada37b58ae94cc838b9c0569fecd8,,,,,200.0,21.0,8.0,16.0
32772,fa51e914046aab32764c41356b9d4ea4,,,,,1300.0,45.0,16.0,45.0


In [75]:
# List the column names of the products table.
olist_products_dataset.columns

Index(['product_id', 'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'],
      dtype='object')