# Data Loading, Cleaning & Merging

### Importazione Librerie

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests

### Caricamento Dati

In [13]:
# Dataset key -> CSV path (paths are relative, so they resolve against the working directory).
CSV_files = dict(
    customers=r"dataset/olist_customers_dataset.csv",
    geolocation_dataset=r"dataset/olist_geolocation_dataset.csv",
    order_items=r"dataset/olist_order_items_dataset.csv",
    order_payments=r"dataset/olist_order_payments_dataset.csv",
    order_review=r"dataset/olist_order_reviews_dataset.csv",
    order_dataset=r"dataset/olist_orders_dataset.csv",
    list_product=r"dataset/olist_products_dataset.csv",
    list_seller=r"dataset/olist_sellers_dataset.csv",
    product_category=r"dataset/product_category_name_translation.csv",
)

# Read every CSV once; the resulting dict keeps the key order of CSV_files.
dataframes = dict(zip(CSV_files, map(pd.read_csv, CSV_files.values())))

# Short aliases for the individual tables, listed in the same order as CSV_files.
df_customers = dataframes["customers"]
df_geolocation_dataset = dataframes["geolocation_dataset"]
df_orders_items = dataframes["order_items"]
df_order_payments = dataframes["order_payments"]
df_order_review = dataframes["order_review"]
df_order_dataset = dataframes["order_dataset"]
df_list_product = dataframes["list_product"]
df_list_seller = dataframes["list_seller"]
df_product_category = dataframes["product_category"]

### Analisi Preliminare

In [18]:
def quick_overview(df, name):
    """Print a quick summary of a DataFrame.

    Prints the given name, the frame's shape and, for every column that has
    at least one missing value, the number of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    name : str
        Label printed as the first line of the summary.
    """
    print(f"{name}")
    print(f"Shape: {df.shape}")
    missing = df.isna().sum()
    # Keep only the columns that actually contain missing values.
    missing = missing.loc[missing.gt(0)]
    if missing.empty:
        print("Nessun valore nullo\n")
        return
    print(f"Valori nulli:\n{missing.to_string()}\n")

# Print the overview of every loaded dataset, in the order they were loaded.
for name in dataframes:
    df = dataframes[name]
    quick_overview(df, name)

customers
Shape: (99441, 5)
Nessun valore nullo

geolocation_dataset
Shape: (1000163, 5)
Nessun valore nullo

order_items
Shape: (112650, 7)
Nessun valore nullo

order_payments
Shape: (103886, 5)
Nessun valore nullo

order_review
Shape: (99224, 7)
Valori nulli:
review_comment_title      87656
review_comment_message    58247

order_dataset
Shape: (99441, 8)
Valori nulli:
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965

list_product
Shape: (32951, 9)
Valori nulli:
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2

list_seller
Shape: (3095, 4)
Nessun valore nullo

product_category
Shape: (71, 2)
Nessun valore nullo



3. PULIZIA PRELIMINARE DEI DATAFRAME (list_product e merge dello stesso con category name)

In [166]:
# Inspect dtypes and non-null counts: 610 records have no product category.
df_list_product.info()

<class 'pandas.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  str    
 1   product_category_name       32341 non-null  str    
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), str(2)
memory usage: 2.3 MB


In [167]:
# Remove the size-related columns (name length, width, height, length, weight).
# Photo quantity and description length are kept on purpose: they may be useful
# for a later analysis (e.g. do best sellers have more photos / longer descriptions?).
df_list_product = df_list_product.drop(
    columns=[
        'product_name_lenght',
        'product_width_cm',
        'product_height_cm',
        'product_length_cm',
        'product_weight_g',
    ]
)


In [168]:
# Rows of the product list that have no category.
product_id_nan = df_list_product.loc[df_list_product['product_category_name'].isna()]
# Array of the product ids without a category, used as an exclusion criterion
# on the other DataFrames.
product_id_to_delete = product_id_nan['product_id'].to_numpy()

In [169]:
# Drop the products without a category. The subset is restricted to
# product_category_name so that the rows removed here are exactly the ones whose
# ids were collected in product_id_to_delete; an unrestricted dropna() would also
# remove products with any other missing value, which the exclusion list built
# from the category column would not know about.
df_list_product = df_list_product.dropna(subset=['product_category_name'])
# Expected: 32951 - 610 = 32341 products left (no product id is lost beyond the 610).
df_list_product.info()

<class 'pandas.DataFrame'>
Index: 32341 entries, 0 to 32950
Data columns (total 4 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32341 non-null  str    
 1   product_category_name       32341 non-null  str    
 2   product_description_lenght  32341 non-null  float64
 3   product_photos_qty          32341 non-null  float64
dtypes: float64(2), str(2)
memory usage: 1.2 MB


In [170]:
# Add the English name of the product category.
# A left join keeps every product: the default inner join silently drops any
# product whose category has no row in the translation table, while its order
# items would still be kept elsewhere. When no translation exists, fall back to
# the original category name so the English column never contains nulls.
df_list_product = (
    df_list_product
    .merge(df_product_category, on='product_category_name', how='left')
    .assign(
        product_category_name_english=lambda d: d['product_category_name_english']
        .fillna(d['product_category_name'])
    )
)

In [171]:
# The original category column is no longer needed once the English name is attached.
df_list_product = df_list_product.drop(columns='product_category_name')

In [172]:
# Product list after cleaning, with the English product-category column added.
df_list_product.sample(2)

Unnamed: 0,product_id,product_description_lenght,product_photos_qty,product_category_name_english
2323,f5d9f3f39171645ad59c424570874afb,967.0,1.0,health_beauty
25892,ad4b5def91ac7c575dbdf65b5be311f4,665.0,1.0,computers_accessories


4) PULIZIA PRELIMINARE DATAFRAME df_orders_items e conversione di shipping_limit_date in datetime

In [173]:
# Order items restricted to products that have a category: every row whose
# product_id is in product_id_to_delete is excluded.
df_orders_items_clean = df_orders_items.loc[
    ~df_orders_items['product_id'].isin(product_id_to_delete)
]
df_orders_items_clean.info()

<class 'pandas.DataFrame'>
Index: 111047 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             111047 non-null  str    
 1   order_item_id        111047 non-null  int64  
 2   product_id           111047 non-null  str    
 3   seller_id            111047 non-null  str    
 4   shipping_limit_date  111047 non-null  str    
 5   price                111047 non-null  float64
 6   freight_value        111047 non-null  float64
dtypes: float64(2), int64(1), str(4)
memory usage: 6.8 MB


In [174]:
# Convert the shipping_limit_date column from text to datetime.
# assign() returns a new frame with the column replaced in its original position.
df_orders_items_clean = df_orders_items_clean.assign(
    shipping_limit_date=pd.to_datetime(df_orders_items_clean['shipping_limit_date'])
)

In [182]:
# Preview two random rows of the cleaned order items.
df_orders_items_clean.sample(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
36712,535981e0e0cb431bfd3a85a4279f03d9,1,aa0d48704d551d1247be98ca06dfa990,d4e12e7884759a14fa0f5f896c791cae,2017-10-19 13:49:24,49.8,14.1
31595,47aa694d87f6ae4995bd02d6e8d9132d,1,b17a6664cea6ac6c23ff388f5a67c973,7040e82f899a04d1b434b795a43b4617,2018-04-03 23:15:12,24.9,8.29


5) PULIZIA PRELIMINARE DATAFRAME geolocation E RAGGRUPPAMENTO PER ZIP CODE

In [175]:
# Preview the geolocation table before filtering and grouping.
# df_geolocation_clean is only created in the next cell, so referencing it here
# raises NameError when the notebook is run top to bottom on a fresh kernel.
df_geolocation_dataset.sample(3)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
43570,3022,-23.524591,-46.605241,sao paulo,SP
733939,58807,-6.762441,-38.234228,sousa,PB
340126,14701,-20.956453,-48.482396,bebedouro,SP


In [176]:
# Bounding box used to keep only the coordinates that fall inside Brazil.
brazil = {
    "lat_min": -34.0,
    "lat_max": 5.0,
    "lng_min": -75.0,
    "lng_max": -28.0,
}

# Keep the rows whose latitude AND longitude are within the box (bounds included).
df_geolocation_clean = df_geolocation_dataset[
    df_geolocation_dataset["geolocation_lat"].between(brazil["lat_min"], brazil["lat_max"])
    & df_geolocation_dataset["geolocation_lng"].between(brazil["lng_min"], brazil["lng_max"])
]

# One row per zip-code prefix: mean coordinates, first city and state seen.
# The original note warns that the state name is often written in different ways.
df_geolocation_dataset = df_geolocation_clean.groupby('geolocation_zip_code_prefix').agg({
    'geolocation_lat': 'mean',
    'geolocation_lng': 'mean',
    'geolocation_city': 'first',
    'geolocation_state': 'first',
})
df_geolocation_dataset.info()

<class 'pandas.DataFrame'>
Index: 19011 entries, 1001 to 99990
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   geolocation_lat    19011 non-null  float64
 1   geolocation_lng    19011 non-null  float64
 2   geolocation_city   19011 non-null  str    
 3   geolocation_state  19011 non-null  str    
dtypes: float64(2), str(2)
memory usage: 742.6 KB


In [177]:
# Preview three random rows of the per-zip-code geolocation table.
df_geolocation_dataset.sample(3)

Unnamed: 0_level_0,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
geolocation_zip_code_prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
38048,-19.770179,-47.944846,uberaba,MG
74765,-16.64837,-49.220738,goiania,GO
9691,-23.669856,-46.589829,sao bernardo do campo,SP


In [178]:
# Give the seller zip column the same name as the geolocation zip-code key.
df_list_seller = df_list_seller.rename(
    {'seller_zip_code_prefix': 'geolocation_zip_code_prefix'}, axis='columns'
)

In [215]:
# Preview three random sellers; the zip column now carries the geolocation_zip_code_prefix name.
df_list_seller.sample(3)

Unnamed: 0,seller_id,geolocation_zip_code_prefix,seller_city,seller_state
2132,443d880f15cbd3572885e1d44bf2c478,17506,marilia,SP
102,422be4cc81a457fdb46f47edeb968ae5,14940,ibitinga,SP
1412,6179a28a13a726c29b3bf54c070dccab,37048,varginha,MG


In [None]:
# Give the customer zip column the same name as the geolocation zip-code key.
df_customers = df_customers.rename(
    {'customer_zip_code_prefix': 'geolocation_zip_code_prefix'}, axis='columns'
)

In [222]:
# Preview three random customers; the zip column now carries the geolocation_zip_code_prefix name.
df_customers.sample(3)

Unnamed: 0,customer_id,customer_unique_id,geolocation_zip_code_prefix,customer_city,customer_state
57857,d608d88b8e4dfd4114634d9759b2f63b,8672605892b455cfa79b6f69ca37d153,36800,carangola,MG
27961,d9a4e05b8c763933f3a6c9640d64a75d,7fb793be0a754449f7f3dd2e09d695ff,28613,nova friburgo,RJ
34714,16c0be52bf7bf9069c6fa0d850c660ac,88b88849a236b3f007be551d5c10daf8,3638,sao paulo,SP
