# Data Loading, Cleaning & Merging

### Importazione Librerie

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests

### Caricamento Dati

In [2]:
CSV_files = {
    "customers":           r"dataset/olist_customers_dataset.csv",
    "geolocation_dataset": r"dataset/olist_geolocation_dataset.csv",
    "order_items":         r"dataset/olist_order_items_dataset.csv",
    "order_payments":      r"dataset/olist_order_payments_dataset.csv",
    "order_review":        r"dataset/olist_order_reviews_dataset.csv",
    "order_dataset":       r"dataset/olist_orders_dataset.csv",
    "list_product":        r"dataset/olist_products_dataset.csv",
    "list_seller":         r"dataset/olist_sellers_dataset.csv",
    "product_category":    r"dataset/product_category_name_translation.csv"
}

dataframes = {name: pd.read_csv(path) for name, path in CSV_files.items()}

df_customers           = dataframes["customers"]
df_orders_items        = dataframes["order_items"]
df_order_payments      = dataframes["order_payments"]
df_order_review        = dataframes["order_review"]
df_order_dataset       = dataframes["order_dataset"]
df_list_product        = dataframes["list_product"]
df_list_seller         = dataframes["list_seller"]
df_product_category    = dataframes["product_category"]
df_geolocation_dataset = dataframes["geolocation_dataset"]

### Analisi Preliminare

In [3]:
# Stampa un riepilogo di ogni csv con la sua shape e i missing values.
def quick_overview(df, name):
    print(f"{name}")
    print(f"Shape: {df.shape}")
    nulls = df.isna().sum()
    nulls = nulls[nulls > 0]
    if len(nulls) > 0:
        print(f"Valori nulli:\n{nulls.to_string()}\n")
    else:
        print("Nessun valore nullo\n")

for name, df in dataframes.items():
    quick_overview(df, name)

customers
Shape: (99441, 5)
Nessun valore nullo

geolocation_dataset
Shape: (1000163, 5)
Nessun valore nullo

order_items
Shape: (112650, 7)
Nessun valore nullo

order_payments
Shape: (103886, 5)
Nessun valore nullo

order_review
Shape: (99224, 7)
Valori nulli:
review_comment_title      87656
review_comment_message    58247

order_dataset
Shape: (99441, 8)
Valori nulli:
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965

list_product
Shape: (32951, 9)
Valori nulli:
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2

list_seller
Shape: (3095, 4)
Nessun valore nullo

product_category
Shape: (71, 2)
Nessun valore nullo



### PULIZIA: df_list_product

In [4]:
# 1) Droppiamo le colonne (peso, misure)
# 2) Droppiamo i 610 prodotti senza categoria (< 2% del totale):
#    verranno esclusi anche dagli altri df.
# 3) Uniamo la traduzione inglese della categoria.

# 1
df_list_product = df_list_product.drop(
    columns=['product_name_lenght', 'product_width_cm',
             'product_height_cm', 'product_length_cm', 'product_weight_g']
)

# 2
# Salviamo gli id dei prodotti senza categoria per filtrare order_items dopo
product_id_nan = df_list_product[df_list_product['product_category_name'].isna()]
product_id_to_delete = product_id_nan['product_id'].to_numpy()
df_list_product = df_list_product.dropna(subset=['product_category_name'])

# 3
df_list_product = (
    df_list_product
    .merge(df_product_category, on='product_category_name', how='left')
    .drop(columns=['product_category_name'])
)

df_list_product.info()
df_list_product.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32341 entries, 0 to 32340
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   product_id                     32341 non-null  object 
 1   product_description_lenght     32341 non-null  float64
 2   product_photos_qty             32341 non-null  float64
 3   product_category_name_english  32328 non-null  object 
dtypes: float64(2), object(2)
memory usage: 1010.8+ KB


Unnamed: 0,product_id,product_description_lenght,product_photos_qty,product_category_name_english
11101,c3f7eb8e0fbdf18288a87f5f2f1c0501,1446.0,1.0,housewares
1516,e251a68000f75257cf9ef16bcea51976,207.0,1.0,stationery


### PULIZIA: df_orders_items

In [5]:
# 1) Escludiamo gli items con product_id in product_id_to_delete
# 2) Convertiamo shipping_limit_date in datetime

# 1
df_orders_items = (
    df_orders_items[~df_orders_items['product_id'].isin(product_id_to_delete)]
    .copy()
    .reset_index(drop=True)
)

# 2
df_orders_items['shipping_limit_date'] = pd.to_datetime(
    df_orders_items['shipping_limit_date']
)

df_orders_items.info()
df_orders_items.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111047 entries, 0 to 111046
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             111047 non-null  object        
 1   order_item_id        111047 non-null  int64         
 2   product_id           111047 non-null  object        
 3   seller_id            111047 non-null  object        
 4   shipping_limit_date  111047 non-null  datetime64[ns]
 5   price                111047 non-null  float64       
 6   freight_value        111047 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 5.9+ MB


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
1885,045524a5ae6c874b746523a6fd79f4f9,1,a9fb80187bf64d19dd2804f25e83c5e6,f615fe7efbef0f4f08fd3086bc7a3e60,2017-09-08 18:30:21,112.9,13.13
81134,bafee0b85090ed1545e50c2b5ae871e9,1,8473795696f49e9e79fba701f129828c,5f2684dab12e59f83bef73ae57724e45,2017-09-18 22:25:13,99.9,16.95


### PULIZIA: df_order_dataset

In [6]:
# 1) convertire tutte le colonne data in datetime

# DA RIVEDERE INSIEME SE POSSONO ESSERE UTILI
# 2) delivery_delay_days : differenza tra consegna effettiva e stimata
#    (positivo = in ritardo, negativo = in anticipo)
#    actual_delivery_days: giorni totali dall'acquisto alla consegna

# 1
date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date'
]
for col in date_cols:
    df_order_dataset[col] = pd.to_datetime(df_order_dataset[col])

# 2
df_order_dataset['delivery_delay_days'] = (
    df_order_dataset['order_delivered_customer_date'] -
    df_order_dataset['order_estimated_delivery_date']
).dt.days
df_order_dataset['actual_delivery_days'] = (
    df_order_dataset['order_delivered_customer_date'] -
    df_order_dataset['order_purchase_timestamp']
).dt.days

df_order_dataset.info()
df_order_dataset.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
 8   delivery_delay_days            96476 non-null  float64       
 9   actual_delivery_days           96476 non-null  float64       
dtypes: datetime64[ns](5), float64(2), object(3)
memory usage: 7.6+ MB


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,delivery_delay_days,actual_delivery_days
42336,cf3b788513af1a4ad6085b96b4954656,98982d16ca0af3d3427aa60f68da06fc,delivered,2017-07-11 23:44:38,2017-07-12 00:03:39,2017-07-12 19:59:52,2017-07-17 14:12:44,2017-07-31,-14.0,5.0
47834,fdf262b020267b9e928c136d05e4dca8,da9c83d94321bbd07861ab46ba121cfc,delivered,2017-10-25 16:54:12,2017-10-25 17:35:39,2017-10-26 14:17:32,2017-10-31 18:08:06,2017-11-17,-17.0,6.0


### PULIZIA: df_order_review

In [7]:
# 1) convertire tutte le colonne data in datetime

# DA RIVEDERE INSIEME
# Alcuni ordini hanno più di una review.
# Teniamo solo l'ultima review per ordine (la più recente).
# Possiamo rimuovere le colonne relative ai commenti o vi interessano?

# 1
df_order_review['review_creation_date'] = pd.to_datetime(
    df_order_review['review_creation_date']
)

df_order_review.sample(2)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
25140,437934b3f4e53e8b5aa77f5b5b0eb1b9,f7d88630da18acd4a11e2ade9eb1bb52,3,bom,Até antes desta compra foi correta,2018-05-11,2018-05-12 14:02:16
69303,e5539ea46bbda2003bac00ae42dafdb4,968ba56969b76f24ea8b93d8a5bb768a,3,,,2018-07-25,2018-07-25 09:06:25


### PULIZIA: df_order_payments

In [8]:
df_order_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [9]:
len(df_order_payments['order_id'])

103886

In [10]:
len(df_order_payments['order_id'].unique())

99440

In [11]:
# DA RIVEDERE INSIEME
# Per un ordine ci sono diverse righe di pagamenti
# Aggreghiamo per order_id sommando il valore totale
# Non credo che ci servirà il tipo di pagamento (payment_type)
# nemmeno li numero di rate del pagamento (payment_installments)
# per ora le ho lasciate

df_order_payments = (
    df_order_payments
    .groupby('order_id', as_index=False)
    .agg(
        total_payment_value  = ('payment_value', 'sum'),
        payment_installments = ('payment_installments', 'max'),
        payment_type         = ('payment_type', 'first')
    )
)

df_order_payments.sample(2)

Unnamed: 0,order_id,total_payment_value,payment_installments,payment_type
61660,9f88232659bf1587e06ac5ab1aa2d62a,108.96,3,credit_card
64042,a598d09b856449e0bf8798180ce4a2a2,55.36,1,boleto


5) PULIZA PRELIMINARE DATAFRAME geolocation E RAGGRUPAMENTO PER ZIP CODE

In [None]:
df_geolocation_clean.sample(3)  

NameError: name 'df_geolocation_clean' is not defined

In [14]:
brazil= {
    "lat_min": -34.0, "lat_max": 5.0,
    "lng_min": -75.0, "lng_max": -28.0
}
df_geolocation_clean = df_geolocation_dataset[df_geolocation_dataset["geolocation_lat"] >= brazil["lat_min"]]
df_geolocation_clean = df_geolocation_clean[df_geolocation_clean["geolocation_lat"] <= brazil["lat_max"]]
df_geolocation_clean = df_geolocation_clean[df_geolocation_clean["geolocation_lng"] >= brazil["lng_min"]]
df_geolocation_clean = df_geolocation_clean[df_geolocation_clean["geolocation_lng"] <= brazil["lng_max"]]
df_geolocation_dataset=df_geolocation_clean     #limitato i valori di lat e lng in brazil
df_geolocation_dataset=df_geolocation_dataset.groupby('geolocation_zip_code_prefix').aggregate({
    'geolocation_lat': 'mean', 'geolocation_lng': 'mean', 'geolocation_city': 'first', 'geolocation_state': 'first'})  #raggruppato per zip code
df_geolocation_dataset.info()
#filtrate le coordinate dentro il Brasile il nome dello stato è spesso scritto in maniera diversa

<class 'pandas.core.frame.DataFrame'>
Index: 19011 entries, 1001 to 99990
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   geolocation_lat    19011 non-null  float64
 1   geolocation_lng    19011 non-null  float64
 2   geolocation_city   19011 non-null  object 
 3   geolocation_state  19011 non-null  object 
dtypes: float64(2), object(2)
memory usage: 742.6+ KB


In [None]:
df_geolocation_dataset.sample(3)

Unnamed: 0_level_0,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
geolocation_zip_code_prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
38048,-19.770179,-47.944846,uberaba,MG
74765,-16.64837,-49.220738,goiania,GO
9691,-23.669856,-46.589829,sao bernardo do campo,SP


In [15]:
df_geolocation_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000132 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000132 non-null  int64  
 1   geolocation_lat              1000132 non-null  float64
 2   geolocation_lng              1000132 non-null  float64
 3   geolocation_city             1000132 non-null  object 
 4   geolocation_state            1000132 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 45.8+ MB


In [None]:
df_list_seller=df_list_seller.rename(columns={'seller_zip_code_prefix':'geolocation_zip_code_prefix'})

In [None]:
df_list_seller.sample(3)

Unnamed: 0,seller_id,geolocation_zip_code_prefix,seller_city,seller_state
2132,443d880f15cbd3572885e1d44bf2c478,17506,marilia,SP
102,422be4cc81a457fdb46f47edeb968ae5,14940,ibitinga,SP
1412,6179a28a13a726c29b3bf54c070dccab,37048,varginha,MG


In [None]:
df_customers=df_customers.rename(columns={'customer_zip_code_prefix':'geolocation_zip_code_prefix'})

In [None]:
df_customers.sample(3)

Unnamed: 0,customer_id,customer_unique_id,geolocation_zip_code_prefix,customer_city,customer_state
57857,d608d88b8e4dfd4114634d9759b2f63b,8672605892b455cfa79b6f69ca37d153,36800,carangola,MG
27961,d9a4e05b8c763933f3a6c9640d64a75d,7fb793be0a754449f7f3dd2e09d695ff,28613,nova friburgo,RJ
34714,16c0be52bf7bf9069c6fa0d850c660ac,88b88849a236b3f007be551d5c10daf8,3638,sao paulo,SP


6. PULIZIA DATAFRAME df_order_review

In [17]:
#Le colonne "review_comment_title" e "review_comment_message" non sono utili per l'analisi e contengono valori nulli
## df_clean_review è il dataset pulito senza le due colonne
df_clean_review = df_order_review.drop(columns= ['review_comment_title', 'review_comment_message'], axis = 1)

In [16]:
df_order_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_comment_title     11568 non-null  object        
 4   review_comment_message   40977 non-null  object        
 5   review_creation_date     99224 non-null  datetime64[ns]
 6   review_answer_timestamp  99224 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 5.3+ MB


In [18]:
df_clean_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_creation_date     99224 non-null  datetime64[ns]
 4   review_answer_timestamp  99224 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 3.8+ MB
