# Data Loading, Cleaning & Merging

### Importazione Librerie

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests

### Caricamento Dati

In [29]:
CSV_files = {
    "customers":           r"dataset/olist_customers_dataset.csv",
    "geolocation_dataset": r"dataset/olist_geolocation_dataset.csv",
    "order_items":         r"dataset/olist_order_items_dataset.csv",
    "order_payments":      r"dataset/olist_order_payments_dataset.csv",
    "order_review":        r"dataset/olist_order_reviews_dataset.csv",
    "order_dataset":       r"dataset/olist_orders_dataset.csv",
    "list_product":        r"dataset/olist_products_dataset.csv",
    "list_seller":         r"dataset/olist_sellers_dataset.csv",
    "product_category":    r"dataset/product_category_name_translation.csv"
}

dataframes = {name: pd.read_csv(path) for name, path in CSV_files.items()}

df_customers           = dataframes["customers"]
df_orders_items        = dataframes["order_items"]
df_order_payments      = dataframes["order_payments"]
df_order_review        = dataframes["order_review"]
df_order_dataset       = dataframes["order_dataset"]
df_list_product        = dataframes["list_product"]
df_list_seller         = dataframes["list_seller"]
df_product_category    = dataframes["product_category"]
df_geolocation_dataset = dataframes["geolocation_dataset"]

### Analisi Preliminare

In [30]:
# Stampa un riepilogo di ogni csv con la sua shape e i missing values.
def quick_overview(df, name):
    print(f"{name}")
    print(f"Shape: {df.shape}")
    nulls = df.isna().sum()
    nulls = nulls[nulls > 0]
    if len(nulls) > 0:
        print(f"Valori nulli:\n{nulls.to_string()}\n")
    else:
        print("Nessun valore nullo\n")

for name, df in dataframes.items():
    quick_overview(df, name)

customers
Shape: (99441, 5)
Nessun valore nullo

geolocation_dataset
Shape: (1000163, 5)
Nessun valore nullo

order_items
Shape: (112650, 7)
Nessun valore nullo

order_payments
Shape: (103886, 5)
Nessun valore nullo

order_review
Shape: (99224, 7)
Valori nulli:
review_comment_title      87656
review_comment_message    58247

order_dataset
Shape: (99441, 8)
Valori nulli:
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965

list_product
Shape: (32951, 9)
Valori nulli:
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2

list_seller
Shape: (3095, 4)
Nessun valore nullo

product_category
Shape: (71, 2)
Nessun valore nullo



### PULIZIA: df_list_product

In [31]:
# 1) Droppiamo le colonne (peso, misure)
# 2) Droppiamo i 610 prodotti senza categoria (< 2% del totale):
#    verranno esclusi anche dagli altri df.
# 3) Uniamo la traduzione inglese della categoria.

# 1
df_list_product = df_list_product.drop(
    columns=['product_name_lenght', 'product_width_cm',
             'product_height_cm', 'product_length_cm', 'product_weight_g']
)

# 2
# Salviamo gli id dei prodotti senza categoria per filtrare order_items dopo
product_id_nan = df_list_product[df_list_product['product_category_name'].isna()]
product_id_to_delete = product_id_nan['product_id'].to_numpy()
df_list_product = df_list_product.dropna(subset=['product_category_name'])

# 3
df_list_product = (
    df_list_product
    .merge(df_product_category, on='product_category_name', how='left')
    .drop(columns=['product_category_name'])
)

df_list_product.info()
df_list_product.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32341 entries, 0 to 32340
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   product_id                     32341 non-null  object 
 1   product_description_lenght     32341 non-null  float64
 2   product_photos_qty             32341 non-null  float64
 3   product_category_name_english  32328 non-null  object 
dtypes: float64(2), object(2)
memory usage: 1010.8+ KB


Unnamed: 0,product_id,product_description_lenght,product_photos_qty,product_category_name_english
30553,e91042dbe3df0b9887d7044968286f57,2283.0,6.0,costruction_tools_tools
26783,2a49ec12c4a8acae6944c72a19359a5e,269.0,2.0,auto


In [None]:
df_list_product

### PULIZIA: df_orders_items

In [32]:
# 1) Escludiamo gli items con product_id in product_id_to_delete
# 2) Convertiamo shipping_limit_date in datetime

# 1
df_orders_items = (
    df_orders_items[~df_orders_items['product_id'].isin(product_id_to_delete)]
    .copy()
    .reset_index(drop=True)
)

# 2
df_orders_items['shipping_limit_date'] = pd.to_datetime(
    df_orders_items['shipping_limit_date']
)

df_orders_items.info()
df_orders_items.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111047 entries, 0 to 111046
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             111047 non-null  object        
 1   order_item_id        111047 non-null  int64         
 2   product_id           111047 non-null  object        
 3   seller_id            111047 non-null  object        
 4   shipping_limit_date  111047 non-null  datetime64[ns]
 5   price                111047 non-null  float64       
 6   freight_value        111047 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 5.9+ MB


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
40895,5e8125228086e366702d4aab930260d9,1,389d119b48cf3043d311335e499d9c6b,1f50f920176fa81dab994f9023523100,2017-12-04 02:56:05,49.0,17.67
39612,5b7a9a3c182d3c773dfd622600c5d0a8,1,581da9e6a6eda905b74a303dd719dd12,abe42c5d03695b4257b5c6cbf4e6784e,2018-06-17 23:35:35,359.0,17.39


### PULIZIA: df_order_dataset

In [33]:
# 1) convertire tutte le colonne data in datetime

# DA RIVEDERE INSIEME SE POSSONO ESSERE UTILI
# 2) delivery_delay_days : differenza tra consegna effettiva e stimata
#    (positivo = in ritardo, negativo = in anticipo)
#    actual_delivery_days: giorni totali dall'acquisto alla consegna

# 1
date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date'
]
for col in date_cols:
    df_order_dataset[col] = pd.to_datetime(df_order_dataset[col])

# 2
df_order_dataset['delivery_delay_days'] = (
    df_order_dataset['order_delivered_customer_date'] -
    df_order_dataset['order_estimated_delivery_date']
).dt.days
df_order_dataset['actual_delivery_days'] = (
    df_order_dataset['order_delivered_customer_date'] -
    df_order_dataset['order_purchase_timestamp']
).dt.days

df_order_dataset.info()
df_order_dataset.sample(2)

KeyboardInterrupt: 

### PULIZIA: df_order_review

In [None]:
# 1) convertire tutte le colonne data in datetime

# DA RIVEDERE INSIEME
# Alcuni ordini hanno più di una review.
# Teniamo solo l'ultima review per ordine (la più recente).
# Possiamo rimuovere le colonne relative ai commenti o vi interessano?

# 1
df_order_review['review_creation_date'] = pd.to_datetime(
    df_order_review['review_creation_date']
)

df_order_review.sample(2)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
59296,62b2c89cf60779146decd6abe4bc6d02,7459e9507588bf704829087454fd707b,5,,,2017-09-26,2017-09-28 00:12:26
12065,dacd2e5e526d498a99465c40d18e62c6,47c1158d35b8d9f0453d5f2ddbbcb3b0,5,,,2018-08-01,2018-08-03 18:18:13


### PULIZIA: df_order_payments

In [None]:
df_order_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [None]:
len(df_order_payments['order_id'])

103886

In [None]:
len(df_order_payments['order_id'].unique())

99440

In [None]:
# DA RIVEDERE INSIEME
# Per un ordine ci sono diverse righe di pagamenti
# Aggreghiamo per order_id sommando il valore totale
# Non credo che ci servirà il tipo di pagamento (payment_type)
# nemmeno li numero di rate del pagamento (payment_installments)
# per ora le ho lasciate

df_order_payments = (
    df_order_payments
    .groupby('order_id', as_index=False)
    .agg(
        total_payment_value  = ('payment_value', 'sum'),
        payment_installments = ('payment_installments', 'max'),
        payment_type         = ('payment_type', 'first')
    )
)

df_order_payments.sample(2)

Unnamed: 0,order_id,total_payment_value,payment_installments,payment_type
52408,8764a51bae349203d355fdb26ab44ba3,55.09,2,credit_card
58354,97258b92dad9851d26d0a0eed03c22da,35.68,1,boleto


5) PULIZA PRELIMINARE DATAFRAME geolocation E RAGGRUPAMENTO PER ZIP CODE

In [None]:
df_geolocation_clean.sample(3)  

NameError: name 'df_geolocation_clean' is not defined

In [None]:
brazil= {
    "lat_min": -34.0, "lat_max": 5.0,
    "lng_min": -75.0, "lng_max": -28.0
}
df_geolocation_clean = df_geolocation_dataset[df_geolocation_dataset["geolocation_lat"] >= brazil["lat_min"]]
df_geolocation_clean = df_geolocation_clean[df_geolocation_clean["geolocation_lat"] <= brazil["lat_max"]]
df_geolocation_clean = df_geolocation_clean[df_geolocation_clean["geolocation_lng"] >= brazil["lng_min"]]
df_geolocation_clean = df_geolocation_clean[df_geolocation_clean["geolocation_lng"] <= brazil["lng_max"]]
df_geolocation_dataset=df_geolocation_clean     #limitato i valori di lat e lng in brazil
df_geolocation_dataset=df_geolocation_dataset.groupby('geolocation_zip_code_prefix').aggregate({
    'geolocation_lat': 'mean', 'geolocation_lng': 'mean', 'geolocation_city': 'first', 'geolocation_state': 'first'})  #raggruppato per zip code
df_geolocation_dataset.info()
#filtrate le coordinate dentro il Brasile il nome dello stato è spesso scritto in maniera diversa

<class 'pandas.core.frame.DataFrame'>
Index: 19011 entries, 1001 to 99990
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   geolocation_lat    19011 non-null  float64
 1   geolocation_lng    19011 non-null  float64
 2   geolocation_city   19011 non-null  object 
 3   geolocation_state  19011 non-null  object 
dtypes: float64(2), object(2)
memory usage: 742.6+ KB


In [34]:
df_geolocation_dataset.sample(3)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
534890,29255,-20.435736,-40.769864,marechal floriano,ES
585658,35162,-19.466931,-42.56052,ipatinga,MG
806776,75706,-18.164892,-47.877195,catalao,GO


In [35]:
df_geolocation_clean.info()

NameError: name 'df_geolocation_clean' is not defined

In [36]:
df_list_seller=df_list_seller.rename(columns={'seller_zip_code_prefix':'geolocation_zip_code_prefix'})

In [37]:
df_list_seller.sample(3)

Unnamed: 0,seller_id,geolocation_zip_code_prefix,seller_city,seller_state
2927,bc39d8938f90a3a2b98193723ed59774,7791,cajamar,SP
545,1a8e2d9c38b84a9702ac7922924b0573,89245,araquari,SC
1912,cc63f0dd2acba93ffed4fe9f8e0321fa,15025,sao jose do rio preto,SP


In [38]:
df_customers=df_customers.rename(columns={'customer_zip_code_prefix':'geolocation_zip_code_prefix'})

In [39]:
df_customers.sample(3)

Unnamed: 0,customer_id,customer_unique_id,geolocation_zip_code_prefix,customer_city,customer_state
54860,ee6efaac6f06a51ec3fb8f07a5970fa0,39d6725d99d64de3b46d0e9ba56d3cde,72725,brasilia,DF
95444,f75a9fad9c2ef07542ad3eb5c7f1c5f0,23166e3098740c2240fe5f187b5de215,7124,guarulhos,SP
72119,ec189a5b668cb11749a9f1b4e8f19759,cdcdd214bea2510e96523a6e58cfbe9f,24110,niteroi,RJ


6. PULIZIA DATAFRAME df_order_review

In [40]:
#Le colonne "review_comment_title" e "review_comment_message" non sono utili per l'analisi e contengono valori nulli
## df_clean_review è il dataset pulito senza le due colonne
df_clean_review = df_order_review.drop(columns= ['review_comment_title', 'review_comment_message'], axis = 1)

In [41]:
df_order_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_comment_title     11568 non-null  object
 4   review_comment_message   40977 non-null  object
 5   review_creation_date     99224 non-null  object
 6   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB


In [42]:
df_clean_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_creation_date     99224 non-null  object
 4   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


7. Test Ipotesi: I prodotti che presentano un maggior numero di metodi di pagamento hanno un tasso di vendita maggiore?

In [52]:
df_order_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [53]:
df_orders_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111047 entries, 0 to 111046
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             111047 non-null  object        
 1   order_item_id        111047 non-null  int64         
 2   product_id           111047 non-null  object        
 3   seller_id            111047 non-null  object        
 4   shipping_limit_date  111047 non-null  datetime64[ns]
 5   price                111047 non-null  float64       
 6   freight_value        111047 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 5.9+ MB


In [63]:
df_list_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32341 entries, 0 to 32340
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   product_id                     32341 non-null  object 
 1   product_description_lenght     32341 non-null  float64
 2   product_photos_qty             32341 non-null  float64
 3   product_category_name_english  32328 non-null  object 
dtypes: float64(2), object(2)
memory usage: 1010.8+ KB


In [64]:
m_order_dataset_items = df_order_dataset.merge(df_orders_items[['order_id', 'product_id',]], how = 'left', on = 'order_id')
m_order_items_payments = df_orders_items.merge(df_order_payments[['order_id','payment_type']], how = 'left', on = 'order_id')
m_order_items_products = df_orders_items.merge(df_list_product[['product_id', 'product_description_lenght', 'product_category_name_english']], how = 'left', on = 'product_id')
m_order_items_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115906 entries, 0 to 115905
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             115906 non-null  object        
 1   order_item_id        115906 non-null  int64         
 2   product_id           115906 non-null  object        
 3   seller_id            115906 non-null  object        
 4   shipping_limit_date  115906 non-null  datetime64[ns]
 5   price                115906 non-null  float64       
 6   freight_value        115906 non-null  float64       
 7   payment_type         115903 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 7.1+ MB


In [65]:
m_order_dataset_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113211 entries, 0 to 113210
Data columns (total 9 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       113211 non-null  object        
 1   customer_id                    113211 non-null  object        
 2   order_status                   113211 non-null  object        
 3   order_purchase_timestamp       113211 non-null  datetime64[ns]
 4   order_approved_at              113050 non-null  datetime64[ns]
 5   order_delivered_carrier_date   111249 non-null  datetime64[ns]
 6   order_delivered_customer_date  109991 non-null  datetime64[ns]
 7   order_estimated_delivery_date  113211 non-null  object        
 8   product_id                     111047 non-null  object        
dtypes: datetime64[ns](4), object(5)
memory usage: 7.8+ MB


In [66]:
m_order_items_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111047 entries, 0 to 111046
Data columns (total 9 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       111047 non-null  object        
 1   order_item_id                  111047 non-null  int64         
 2   product_id                     111047 non-null  object        
 3   seller_id                      111047 non-null  object        
 4   shipping_limit_date            111047 non-null  datetime64[ns]
 5   price                          111047 non-null  float64       
 6   freight_value                  111047 non-null  float64       
 7   product_description_lenght     111047 non-null  float64       
 8   product_category_name_english  111023 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 7.6+ MB


In [59]:
m_order_items_payments.head()

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,payment_type
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,credit_card
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93,credit_card
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87,credit_card
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79,credit_card
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14,credit_card


In [67]:
df_list_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32341 entries, 0 to 32340
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   product_id                     32341 non-null  object 
 1   product_description_lenght     32341 non-null  float64
 2   product_photos_qty             32341 non-null  float64
 3   product_category_name_english  32328 non-null  object 
dtypes: float64(2), object(2)
memory usage: 1010.8+ KB
