# Data Loading, Cleaning & Export

### Importazione Librerie

In [45]:
import numpy as np
import pandas as pd
import requests

### Caricamento Dati

In [46]:
# Logical table name -> relative path of the CSV file it is loaded from.
CSV_files = dict(
    customers="dataset/olist_customers_dataset.csv",
    geolocation_dataset="dataset/olist_geolocation_dataset.csv",
    order_items="dataset/olist_order_items_dataset.csv",
    order_payments="dataset/olist_order_payments_dataset.csv",
    order_review="dataset/olist_order_reviews_dataset.csv",
    order_dataset="dataset/olist_orders_dataset.csv",
    list_product="dataset/olist_products_dataset.csv",
    list_seller="dataset/olist_sellers_dataset.csv",
    product_category="dataset/product_category_name_translation.csv",
)

# Read every file once; the resulting dict shares its keys with CSV_files.
dataframes = dict(zip(CSV_files, map(pd.read_csv, CSV_files.values())))

# Bind each raw table to its own variable for the cleaning cells below.
(
    df_customers,
    df_orders_items,
    df_order_payments,
    df_order_review,
    df_order_dataset,
    df_list_product,
    df_list_seller,
    df_product_category,
    df_geolocation_dataset,
) = (
    dataframes[table]
    for table in (
        "customers",
        "order_items",
        "order_payments",
        "order_review",
        "order_dataset",
        "list_product",
        "list_seller",
        "product_category",
        "geolocation_dataset",
    )
)

### Analisi Preliminare

In [47]:
# Prints a short summary of a table: its name, its shape and its missing values.
# (Duplicate rows are not checked here.)
def quick_overview(df, name):
    """Print the name, the shape and the per-column null counts of a DataFrame.

    Args:
        df: the DataFrame to summarise.
        name: label printed as the first line of the summary.

    Only columns that contain at least one missing value are listed; when no
    column has any, a single "Nessun valore nullo" line is printed instead.
    Nothing is returned.
    """
    print(f"{name}")
    print(f"Shape: {df.shape}")

    null_counts = df.isna().sum()
    columns_with_nulls = null_counts[null_counts > 0]

    # Guard clause: nothing to list when every column is fully populated.
    if columns_with_nulls.empty:
        print("Nessun valore nullo\n")
        return

    print(f"Valori nulli:\n{columns_with_nulls.to_string()}\n")

# Print the overview of every table held in `dataframes`, one after the other.
for name, df in dataframes.items():
    quick_overview(df, name)

customers
Shape: (99441, 5)
Nessun valore nullo

geolocation_dataset
Shape: (1000163, 5)
Nessun valore nullo

order_items
Shape: (112650, 7)
Nessun valore nullo

order_payments
Shape: (103886, 5)
Nessun valore nullo

order_review
Shape: (99224, 7)
Valori nulli:
review_comment_title      87656
review_comment_message    58247

order_dataset
Shape: (99441, 8)
Valori nulli:
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965

list_product
Shape: (32951, 9)
Valori nulli:
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2

list_seller
Shape: (3095, 4)
Nessun valore nullo

product_category
Shape: (71, 2)
Nessun valore nullo



### PULIZIA: df_list_product

In [48]:
# 1) Drop the columns we do not use (name length, weight and dimensions).
# 2) Drop the products without a category (610 rows, < 2% of the total);
#    the same products are excluded from the other frames later on.
# 3) Attach the English translation of the category.
#
# The cell always starts from the raw table stored in `dataframes`, not from the
# current value of `df_list_product`: this makes it safe to re-run (the previous
# version raised a KeyError on the second run because the columns were already
# gone).

# 1
df_list_product = dataframes["list_product"].drop(
    columns=['product_name_lenght', 'product_width_cm',
             'product_height_cm', 'product_length_cm', 'product_weight_g']
)

# 2
# Keep the ids of the products without a category: order_items is filtered
# with them in a later cell.
product_id_nan = df_list_product[df_list_product['product_category_name'].isna()]
product_id_to_delete = product_id_nan['product_id'].to_numpy()
df_list_product = df_list_product.dropna(subset=['product_category_name'])

# 3
# Left join: categories missing from the translation table keep a null
# English name and are handled by the following cells.
df_list_product = (
    df_list_product
    .merge(df_product_category, on='product_category_name', how='left')
)

df_list_product.info()
df_list_product.sample(2)

<class 'pandas.DataFrame'>
RangeIndex: 32341 entries, 0 to 32340
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   product_id                     32341 non-null  str    
 1   product_category_name          32341 non-null  str    
 2   product_description_lenght     32341 non-null  float64
 3   product_photos_qty             32341 non-null  float64
 4   product_category_name_english  32328 non-null  str    
dtypes: float64(2), str(3)
memory usage: 1.2 MB


Unnamed: 0,product_id,product_category_name,product_description_lenght,product_photos_qty,product_category_name_english
11173,f1136b963c743b647b6c0b3d86effe08,moveis_sala,256.0,1.0,furniture_living_room
28957,78ae9099d0b8c74be6cb8deb9bd84f8a,bebes,325.0,1.0,baby


In [49]:
# The Portuguese categories that have no English translation are translated
# with Selenium in the next cell; here we only collect them.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Boolean mask of the products whose category found no match in df_product_category.
traduzione_mancante = df_list_product['product_category_name_english'].isna()

# Distinct Portuguese category names that still need a translation.
nomi_mancanti = df_list_product.loc[traduzione_mancante, 'product_category_name'].unique()
print(f"Categorie da tradurre ({len(nomi_mancanti)}): \n{nomi_mancanti}\n")


Categorie da tradurre (2): 
<StringArray>
['pc_gamer', 'portateis_cozinha_e_preparadores_de_alimentos']
Length: 2, dtype: str



In [50]:
# Opens Google Translate in the browser and returns the Portuguese -> English
# translation of the given text, formatted in snake_case.
def traduci_categoria(testo):
    """Translate `testo` from Portuguese to English through the Google Translate page.

    Uses the module-level Selenium `driver`, which must already be started.

    Args:
        testo: the text to translate (here: a Portuguese category name).

    Returns:
        The translated text, lower-cased and with spaces replaced by underscores.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if the element that
            holds the translation is not found on the page.
    """
    # Local imports keep this cell self-contained.
    from urllib.parse import quote
    from selenium.common.exceptions import WebDriverException

    # Build the Google Translate URL: source language pt, target language en.
    # The text is percent-encoded so that spaces, '&', '#', ... cannot break
    # the query string.
    url = f"https://translate.google.com/?sl=pt&tl=en&text={quote(testo)}&op=translate"
    driver.get(url)

    # Wait 2 seconds for the page to load and the translation to appear.
    time.sleep(2)

    # Accept the cookies if the popup is shown. This stays best-effort, but only
    # Selenium errors (popup missing, button not clickable) are ignored:
    # KeyboardInterrupt and programming errors are no longer swallowed.
    try:
        accept_button = driver.find_element(By.XPATH, "//button[.//span[text()='Accept all']]")
        accept_button.click()
        time.sleep(2)
    except WebDriverException:
        pass  # No usable popup: carry on normally.

    # Element that contains the translated text.
    # NOTE(review): the `jsname` value is internal to the Google page and may
    # change without notice — confirm it still matches.
    risultato = driver.find_element(By.XPATH, "//span[@jsname='W297wb']")

    # Lower-case the text and replace spaces with underscores.
    traduzione = risultato.text.lower().replace(' ', '_')
    return traduzione

# Start Firefox.
# NB: everyone please check that the browser opens and Google Translate is shown.
driver = webdriver.Firefox()

# Collected translations: {portuguese_name: english_translation}
traduzioni_manuali = {}

# Translate every category that has no translation yet. The browser is closed
# in `finally`, so it does not stay open if a translation raises.
try:
    for nome in nomi_mancanti:
        traduzione = traduci_categoria(nome)
        traduzioni_manuali[nome] = traduzione
        print(f"{nome}  →  {traduzione}")
        time.sleep(1)  # short pause between two requests
finally:
    driver.quit()

# Update the DataFrame: write each translation into
# 'product_category_name_english' for all products of that category.
# NOTE(review): the names are sent with their underscores, and the recorded run
# returned a partly untranslated value for the second category — review the
# results before relying on them.
for nome, traduzione in traduzioni_manuali.items():
    df_list_product.loc[
        df_list_product['product_category_name'] == nome,
        'product_category_name_english'
    ] = traduzione

df_list_product.info()

pc_gamer  →  gaming_pc
portateis_cozinha_e_preparadores_de_alimentos  →  portateis_kitchen_e_food_preparators
<class 'pandas.DataFrame'>
RangeIndex: 32341 entries, 0 to 32340
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   product_id                     32341 non-null  str    
 1   product_category_name          32341 non-null  str    
 2   product_description_lenght     32341 non-null  float64
 3   product_photos_qty             32341 non-null  float64
 4   product_category_name_english  32341 non-null  str    
dtypes: float64(2), str(3)
memory usage: 1.2 MB


### PULIZIA: df_orders_items

In [51]:
import os

# The exchangerate-api key is read from the environment instead of being
# committed in the notebook. Set EXCHANGERATE_API_KEY before running this cell.
# NOTE(review): the key that used to be hard-coded here should be considered
# leaked and rotated.
_api_key = os.environ.get("EXCHANGERATE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the EXCHANGERATE_API_KEY environment variable to call exchangerate-api.com"
    )

# Latest conversion rates with EUR as the base currency.
url = f'https://v6.exchangerate-api.com/v6/{_api_key}/latest/EUR'
oae = requests.get(url, timeout=10)  # never hang forever on a stalled connection
oae.raise_for_status()               # fail here, not later on a missing JSON key
print(oae.status_code)  # API request status

200


In [52]:
# Exchange rate named EUR_BRL: the BRL entry of the EUR-based rates returned by
# the API. Amounts in reais are divided by it to obtain euros.
# NOTE(review): this is the rate at the time the cell runs, so converted amounts
# change between runs — confirm this is acceptable for the analysis.
conversion_rates = oae.json()['conversion_rates']
EUR_BRL = conversion_rates['BRL']
EUR_BRL  # show the conversion rate

6.1312

In [53]:
# 1) Exclude the items whose product_id is in product_id_to_delete
# 2) Convert shipping_limit_date to datetime
# 3) Convert price and freight_value to euro and drop the columns in reais
#
# The chain starts from the raw table stored in `dataframes`, so the cell can be
# re-run safely (the previous version raised KeyError: 'price' on a second run).
df_orders_items = (
    dataframes["order_items"]
    # 1
    .loc[lambda items: ~items['product_id'].isin(product_id_to_delete)]
    .reset_index(drop=True)
    .assign(
        # 2
        shipping_limit_date=lambda items: pd.to_datetime(items['shipping_limit_date']),
        # 3 - amounts in euro, rounded to cents
        eur_price=lambda items: (items['price'] / EUR_BRL).round(2),
        eur_freight_value=lambda items: (items['freight_value'] / EUR_BRL).round(2),
    )
    # remove the columns expressed in Brazilian reais
    .drop(columns=['price', 'freight_value'])
)

df_orders_items.info()
df_orders_items.sample(2)

<class 'pandas.DataFrame'>
RangeIndex: 111047 entries, 0 to 111046
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             111047 non-null  str           
 1   order_item_id        111047 non-null  int64         
 2   product_id           111047 non-null  str           
 3   seller_id            111047 non-null  str           
 4   shipping_limit_date  111047 non-null  datetime64[us]
 5   eur_price            111047 non-null  float64       
 6   eur_freight_value    111047 non-null  float64       
dtypes: datetime64[us](1), float64(2), int64(1), str(3)
memory usage: 5.9 MB


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,eur_price,eur_freight_value
101556,e9f0c4fe73c293e664edaa5e8a9b04eb,1,ca38c039b717207dec906f4fa9a21db7,59fb871bf6f4522a87ba567b42dafecf,2017-03-23 22:05:51,30.99,3.49
103361,ee33d838de5dc1568b5b31d505ccdafb,1,1c4de4dd53a801716c8a3a2cf51f35fa,6560211a19b47992c3666cc44a7e94c0,2018-06-11 00:52:23,30.83,3.13


### PULIZIA: df_order_dataset

In [54]:
# 1) Convert the date columns to datetime
#
# 2) delivery_delay_days : difference between actual and estimated delivery
#    (positive = late, negative = early)
#    actual_delivery_days: total days from purchase to delivery
#    Both keep only the whole-day component of the time difference and are
#    null for orders without a customer delivery date.

# 1
date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date'
]
for col in date_cols:
    df_order_dataset[col] = df_order_dataset[col].pipe(pd.to_datetime)

# 2
delivered_at = df_order_dataset['order_delivered_customer_date']
delay = delivered_at - df_order_dataset['order_estimated_delivery_date']
elapsed = delivered_at - df_order_dataset['order_purchase_timestamp']
df_order_dataset['delivery_delay_days'] = delay.dt.days
df_order_dataset['actual_delivery_days'] = elapsed.dt.days

df_order_dataset.info()
df_order_dataset.sample(2)

<class 'pandas.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  str           
 1   customer_id                    99441 non-null  str           
 2   order_status                   99441 non-null  str           
 3   order_purchase_timestamp       99441 non-null  datetime64[us]
 4   order_approved_at              99281 non-null  datetime64[us]
 5   order_delivered_carrier_date   97658 non-null  datetime64[us]
 6   order_delivered_customer_date  96476 non-null  datetime64[us]
 7   order_estimated_delivery_date  99441 non-null  datetime64[us]
 8   delivery_delay_days            96476 non-null  float64       
 9   actual_delivery_days           96476 non-null  float64       
dtypes: datetime64[us](5), float64(2), str(3)
memory usage: 7.6 MB


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,delivery_delay_days,actual_delivery_days
15865,a396a3ced410ba6d02ebb907e5c34530,fcc915c201f58b20e92faf7fad1d9c35,delivered,2017-12-01 08:04:37,2017-12-02 02:51:15,2017-12-05 01:25:46,2017-12-28 17:04:01,2017-12-28,0.0,27.0
78203,55a0842ea8508da4c9ef096cbac8bfef,d2898df8d7ab23976854d61a684ef6bd,delivered,2018-07-31 19:31:09,2018-07-31 20:23:52,2018-08-01 16:30:00,2018-08-06 23:28:54,2018-08-14,-8.0,6.0


### PULIZIA: df_order_review

In [55]:
# 1) Drop the free-text columns (review_comment_title, review_comment_message)
# 2) Convert both date columns to datetime. review_answer_timestamp is converted
#    too: a later cell takes its minimum per order, which should be a date
#    comparison and not a string comparison.
#
# The chain starts from the raw table stored in `dataframes`, so the cell can be
# re-run safely (dropping the columns twice used to raise a KeyError).
df_order_review = (
    dataframes["order_review"]
    # 1
    .drop(columns=['review_comment_title', 'review_comment_message'])
    # 2
    .assign(
        review_creation_date=lambda reviews: pd.to_datetime(reviews['review_creation_date']),
        review_answer_timestamp=lambda reviews: pd.to_datetime(reviews['review_answer_timestamp']),
    )
)

df_order_review.sample(2)

Unnamed: 0,review_id,order_id,review_score,review_creation_date,review_answer_timestamp
20438,7c3601d26ab8b32b9d1c04ec72e9b424,d2c4b43f2986e614c4f281927c47d228,3,2018-05-23,2018-05-24 10:02:04
70923,5df1865626cb737cfdddc58a184664be,343c79502b5f394194fbf53ba4a561c1,5,2018-07-27,2018-07-27 19:16:41


### PULIZIA: df_order_payments

In [56]:
# An order can have several payment rows: aggregate them per order_id.
#   total_payment_value  -> sum of the payment values
#   payment_installments -> highest number of installments among the rows
#   payment_type         -> type of the first row of the order
#
# The aggregation starts from the raw table stored in `dataframes`, so the cell
# can be re-run safely (the previous version raised KeyError: 'payment_value'
# on a second run, because the column no longer exists after aggregating).
df_order_payments = (
    dataframes["order_payments"]
    .groupby('order_id', as_index=False)
    .agg(
        total_payment_value=('payment_value', 'sum'),
        payment_installments=('payment_installments', 'max'),
        payment_type=('payment_type', 'first'),
    )
)

df_order_payments.sample(2)

Unnamed: 0,order_id,total_payment_value,payment_installments,payment_type
15877,2922d9dd6672241cb658a40bf507e4b1,111.98,6,credit_card
19887,3363886715280209c8fb0bba5cc4ab95,227.27,5,credit_card


In [57]:
# Convert the order total from reais to euro (rounded to cents) and drop the
# original column.
df_order_payments = (
    df_order_payments
    .assign(
        eur_total_payment_value=lambda payments: (
            payments['total_payment_value'] / EUR_BRL
        ).round(2)
    )
    .drop(columns=['total_payment_value'])
)

In [58]:
# Show two random rows to check the converted payments table.
df_order_payments.sample(2)

Unnamed: 0,order_id,payment_installments,payment_type,eur_total_payment_value
31264,5042f28675b7eedd6c772f3c0df075f1,1,credit_card,4.3
91749,ec108f8c96f1037449cfb5e118d9a008,1,credit_card,6.46


### PULIZIA: df_geolocation

In [59]:
# Bounding box used to keep only the coordinates that fall inside Brazil.
brazil = {
    "lat_min": -34.0, "lat_max": 5.0,
    "lng_min": -75.0, "lng_max": -28.0
}

# Always start from the raw table stored in `dataframes`: this cell rebinds
# `df_geolocation_dataset` to the aggregated table below, so reading from that
# name would make a second run filter the already-aggregated data.
geolocation_raw = dataframes["geolocation_dataset"]

# Drop the coordinates outside the bounding box (bounds included).
inside_brazil = (
    geolocation_raw["geolocation_lat"].between(brazil["lat_min"], brazil["lat_max"]) &
    geolocation_raw["geolocation_lng"].between(brazil["lng_min"], brazil["lng_max"])
)
df_geolocation_clean = geolocation_raw[inside_brazil].copy()

# One row per zip code prefix: mean lat/lng, first value for city and state.
df_geolocation_dataset = (
    df_geolocation_clean
    .groupby('geolocation_zip_code_prefix', as_index=False)
    .agg(
        geolocation_lat   = ('geolocation_lat',   'mean'),
        geolocation_lng   = ('geolocation_lng',   'mean'),
        geolocation_city  = ('geolocation_city',  'first'),
        geolocation_state = ('geolocation_state', 'first')
    )
)

df_geolocation_dataset.info()
df_geolocation_dataset.sample(2)

<class 'pandas.DataFrame'>
RangeIndex: 19011 entries, 0 to 19010
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   geolocation_zip_code_prefix  19011 non-null  int64  
 1   geolocation_lat              19011 non-null  float64
 2   geolocation_lng              19011 non-null  float64
 3   geolocation_city             19011 non-null  str    
 4   geolocation_state            19011 non-null  str    
dtypes: float64(2), int64(1), str(2)
memory usage: 742.7 KB


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
3019,5854,-23.660886,-46.762107,sao paulo,SP
4503,11701,-24.008923,-46.419125,praia grande,SP


### PULIZIA: df_list_seller

In [60]:
# Give the seller zip column the name used by the geolocation table, so the two
# tables can be joined on it.
seller_zip_rename = dict(seller_zip_code_prefix='geolocation_zip_code_prefix')
df_list_seller = df_list_seller.rename(columns=seller_zip_rename)
df_list_seller.sample(3)

Unnamed: 0,seller_id,geolocation_zip_code_prefix,seller_city,seller_state
1797,1d0997ff06b524ce9289ffd75114ecd3,52020,recife,PE
1387,9b00cad94ef3078faf6fba2e792c158f,39442,janauba,MG
844,6c17baf138731a4aaacc3210adf2037b,14400,franca,SP


### PULIZIA: df_customers

In [61]:
# Give the customer zip column the name used by the geolocation table, so the
# two tables can be joined on it.
customer_zip_rename = dict(customer_zip_code_prefix='geolocation_zip_code_prefix')
df_customers = df_customers.rename(columns=customer_zip_rename)
df_customers.sample(3)

Unnamed: 0,customer_id,customer_unique_id,geolocation_zip_code_prefix,customer_city,customer_state
73924,8fcebad47cb22e7a7576c02a882a66ed,1dc64a765cc16cb5c9962dcb25491751,7193,guarulhos,SP
98432,766d8c95b0b9445ca63f3cfae64fa55f,004493e0b0a37317d25665d340c0677d,21235,rio de janeiro,RJ
38478,bf988543b99218d77ffea19b8953d511,3d3162c248f84fc1ef4636b83bf097fe,38800,sao gotardo,MG


### CREAZIONE DI DATAFRAME GLOBAL

#### Dataframe global (aggregazione per ordini)

In [62]:
# Build the order-level frame `df_global`: every source table is first reduced
# to one row per order_id and then left-joined onto the orders table.

# Placeholders for sellers/customers whose zip prefix has no geolocation entry.
geo_placeholders = {
    'geolocation_lat': 0,
    'geolocation_lng': 0,
    'geolocation_city': 'missing',
    'geolocation_state': 'missing',
}

# 1. Sellers + geolocation.
#    The join uses the per-zip aggregated table (df_geolocation_dataset, one row
#    per zip prefix) and not the row-level df_geolocation_clean: the latter has
#    several rows per prefix and duplicated every seller once per matching row,
#    which in turn inflated the per-order seller aggregation below.
#    validate='many_to_one' makes pandas raise if the right side ever stops
#    being unique on the key.
df_seller_geo = df_list_seller.merge(
    df_geolocation_dataset,
    on='geolocation_zip_code_prefix',
    how='left',
    validate='many_to_one'
).fillna(geo_placeholders)

# 2. Customers + geolocation (previously built from df_list_seller by mistake).
df_customer_geo = df_customers.merge(
    df_geolocation_dataset,
    on='geolocation_zip_code_prefix',
    how='left',
    validate='many_to_one'
).fillna(geo_placeholders)

# 3. order_items aggregated per order_id
agg_orders_items = (
    df_orders_items
    .groupby('order_id', as_index=False)
    .agg(
        total_items=('order_item_id', 'count'),              # number of items sold
        total_price=('eur_price', 'sum'),                    # sum of the prices
        freight_total=('eur_freight_value', 'sum'),          # sum of the shipping costs
        freight_avg=('eur_freight_value', 'mean'),           # mean shipping cost
        first_shipping_limit=('shipping_limit_date', 'min')  # earliest shipping limit
    )
)

# 4. order_payments aggregated per order_id
agg_order_payments = (
    df_order_payments
    .groupby('order_id', as_index=False)
    .agg(
        total_payment_value=('eur_total_payment_value', 'sum'),
        payment_installments=('payment_installments', 'max'),
        payment_type=('payment_type', 'first')
    )
)

# 5. order_review aggregated per order_id
#    NOTE(review): 'firs_review_date' is misspelled; the name is kept as is
#    because other cells may already rely on it.
agg_order_review = (
    df_order_review
    .groupby('order_id', as_index=False)
    .agg(
        avg_review=('review_score', 'mean'),
        num_reviews=('review_score', 'count'),
        firs_review_date=('review_creation_date', 'min'),
        first_review_answer=('review_answer_timestamp', 'min')
    )
)

# 6. Products aggregated per order_id.
#    num_products counts the distinct products of the order: a plain row count
#    would only repeat total_items.
agg_list_products = (
    df_orders_items
    .merge(df_list_product, on='product_id', how='left')
    .groupby('order_id', as_index=False)
    .agg(
        num_products=('product_id', 'nunique'),
        num_categories=('product_category_name_english', 'nunique')
    )
)

# 7. Sellers aggregated per order_id.
#    num_sellers counts the distinct sellers of the order (a row count would
#    count items, not sellers); zip/city/state are those of the first item.
agg_seller = (
    df_orders_items
    .merge(df_seller_geo, on='seller_id', how='left')
    .groupby('order_id', as_index=False)
    .agg(
        num_sellers=('seller_id', 'nunique'),
        geolocation_zip_code_prefix=('geolocation_zip_code_prefix', 'first'),
        seller_city=('seller_city', 'first'),
        seller_state=('seller_state', 'first')
    )
)

# Final merge: one row per order.
# df_customers and agg_seller both carry 'geolocation_zip_code_prefix', so in
# df_global the customer one gets the '_x' suffix and the seller one '_y'.
df_global = (
    df_order_dataset
    .merge(df_customers, on='customer_id', how='left')
    .merge(agg_orders_items, on='order_id', how='left')
    .merge(agg_order_review, on='order_id', how='left')
    .merge(agg_order_payments, on='order_id', how='left')
    .merge(agg_list_products, on='order_id', how='left')
    .merge(agg_seller, on='order_id', how='left')
)
                     



In [63]:
# Inspect columns, non-null counts and dtypes of the merged order-level frame.
df_global.info()

<class 'pandas.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  str           
 1   customer_id                    99441 non-null  str           
 2   order_status                   99441 non-null  str           
 3   order_purchase_timestamp       99441 non-null  datetime64[us]
 4   order_approved_at              99281 non-null  datetime64[us]
 5   order_delivered_carrier_date   97658 non-null  datetime64[us]
 6   order_delivered_customer_date  96476 non-null  datetime64[us]
 7   order_estimated_delivery_date  99441 non-null  datetime64[us]
 8   delivery_delay_days            96476 non-null  float64       
 9   actual_delivery_days           96476 non-null  float64       
 10  customer_unique_id             99441 non-null  str           
 11  geolocation_zip_code_prefi

In [64]:
# DATA CLEANING

# Orders without a review: put 0 in place of the null value.
df_global['avg_review'] = df_global['avg_review'].fillna(0)
df_global['num_reviews'] = df_global['num_reviews'].fillna(0)
# Boolean feature telling a "0 = no review" apart from an actual score
# (num_reviews is a count, so it is > 0 whenever a review exists).
df_global['has_review'] = df_global['num_reviews'] > 0

# Check of the nulls in the item/seller columns: rows with at least one null,
# and whether all of these columns are null together in those rows.
cols_items_seller = [
    'total_items', 'total_price', 'freight_total', 'freight_avg',
    'num_products', 'num_categories', 'num_sellers',
    'seller_city', 'seller_state', 'geolocation_zip_code_prefix_y'
]
rows_with_nulls = df_global[df_global[cols_items_seller].isna().any(axis=1)]
same_nulls = rows_with_nulls[cols_items_seller].isna().all(axis=1)

# Replace the null string values with "unknown".
cat_cols = ['seller_city', 'seller_state']
df_global[cat_cols] = df_global[cat_cols].fillna('unknown')
# A null total_items compares as False, so orders without items get False.
df_global['has_items'] = df_global['total_items'] > 0

# Replace the nulls in the numeric payment columns.
df_global['total_payment_value'] = df_global['total_payment_value'].fillna(0)
df_global['payment_installments'] = df_global['payment_installments'].fillna(0)
# Replace the null payment method with 'unknown'.
df_global['payment_type'] = df_global['payment_type'].fillna('unknown')
# Boolean feature: does the order have a payment record?
# It is derived from the presence of the order in the payments aggregation and
# not from `total_payment_value > 0`: after the fillna above, a value of 0 can
# mean either "no payment record" or "payments that sum to 0", and only the
# former must be flagged False. This also stays correct when the cell is re-run.
df_global['has_payment'] = df_global['order_id'].isin(agg_order_payments['order_id'])

In [65]:
# Inspect the frame again after the cleaning step (three boolean flags were added).
df_global.info()

<class 'pandas.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 35 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  str           
 1   customer_id                    99441 non-null  str           
 2   order_status                   99441 non-null  str           
 3   order_purchase_timestamp       99441 non-null  datetime64[us]
 4   order_approved_at              99281 non-null  datetime64[us]
 5   order_delivered_carrier_date   97658 non-null  datetime64[us]
 6   order_delivered_customer_date  96476 non-null  datetime64[us]
 7   order_estimated_delivery_date  99441 non-null  datetime64[us]
 8   delivery_delay_days            96476 non-null  float64       
 9   actual_delivery_days           96476 non-null  float64       
 10  customer_unique_id             99441 non-null  str           
 11  geolocation_zip_code_prefi

### EXPORT: tutti i dataframe puliti nella cartella output

In [66]:
import os

# Make sure the output folder exists (no error if it is already there).
os.makedirs("output", exist_ok=True)

# Cleaned tables to write, keyed by the name of the CSV file to produce.
# NOTE(review): df_global is not part of this export — confirm this is intended.
dataframes_to_export = dict(
    list_product=df_list_product,
    orders_items=df_orders_items,
    order_dataset=df_order_dataset,
    order_review=df_order_review,
    order_payments=df_order_payments,
    geolocation_dataset=df_geolocation_dataset,
    list_seller=df_list_seller,
    customers=df_customers,
)

# One CSV per table, without the index column.
for name, df in dataframes_to_export.items():
    df.to_csv(f"output/{name}.csv", index=False)