In [2]:
import pandas as pd
import pyarrow.parquet as pq
import os

In [3]:
# Dossiers source et destination
data_dir = "../data/processed/"  # Dossier contenant les fichiers Parquet bruts
cleaned_dir = "../data/cleaned/"  # Dossier où stocker les fichiers nettoyés
os.makedirs(cleaned_dir, exist_ok=True)  # Créer le dossier de destination s'il n'existe pas

# Liste des fichiers Parquet à traiter
parquet_files = [f for f in os.listdir(data_dir) if f.endswith(".parquet")]

def clean_data(df):
    """Effectue le nettoyage et l'optimisation des données."""
    # Suppression des doublons
    df = df.drop_duplicates()
    
    # Gestion des valeurs manquantes
    df["category_code"] = df["category_code"].fillna("unknown")
    df["brand"] = df["brand"].fillna("unknown")
    df["user_session"] = df["user_session"].fillna("unknown")
    
    # Optimisation des types de données
    df["price"] = df["price"].astype("float32")
    df["product_id"] = df["product_id"].astype("int32")
    df["category_id"] = df["category_id"].astype("int32")
    df["user_id"] = df["user_id"].astype("int32")
    
    return df

# Nettoyage des fichiers Parquet
for file in parquet_files:
    parquet_path = os.path.join(data_dir, file)
    cleaned_path = os.path.join(cleaned_dir, file)
    
    print(f"🔹 Nettoyage de {file}...")
    
    # Lire en batch avec PyArrow ParquetFile
    parquet_file = pq.ParquetFile(parquet_path)
    batch_size = 500_000  # Lire 500k lignes à la fois
    
    chunk_list = []
    for batch in parquet_file.iter_batches(batch_size=batch_size):
        chunk = batch.to_pandas()
        chunk_cleaned = clean_data(chunk)
        chunk_list.append(chunk_cleaned)
    
    df_cleaned = pd.concat(chunk_list)
    df_cleaned.to_parquet(cleaned_path, index=False)
    
    print(f"✅ {file} nettoyé et sauvegardé dans {cleaned_path}")

print("🎯 Tous les fichiers ont été nettoyés et stockés dans data/cleaned/ !")

🔹 Nettoyage de 2019-Dec.parquet...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["category_code"] = df["category_code"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["brand"] = df["brand"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user_session"] = df["user_session"].fillna("unknown")
A value is trying to be set on a copy of a 

✅ 2019-Dec.parquet nettoyé et sauvegardé dans ../data/cleaned/2019-Dec.parquet
🔹 Nettoyage de 2019-Nov.parquet...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["category_code"] = df["category_code"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["brand"] = df["brand"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user_session"] = df["user_session"].fillna("unknown")
A value is trying to be set on a copy of a 

✅ 2019-Nov.parquet nettoyé et sauvegardé dans ../data/cleaned/2019-Nov.parquet
🔹 Nettoyage de 2019-Oct.parquet...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["category_code"] = df["category_code"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["brand"] = df["brand"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user_session"] = df["user_session"].fillna("unknown")
A value is trying to be set on a copy of a 

✅ 2019-Oct.parquet nettoyé et sauvegardé dans ../data/cleaned/2019-Oct.parquet
🔹 Nettoyage de 2020-Apr.parquet...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["category_code"] = df["category_code"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["brand"] = df["brand"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user_session"] = df["user_session"].fillna("unknown")
A value is trying to be set on a copy of a 

✅ 2020-Apr.parquet nettoyé et sauvegardé dans ../data/cleaned/2020-Apr.parquet
🔹 Nettoyage de 2020-Feb.parquet...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["category_code"] = df["category_code"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["brand"] = df["brand"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user_session"] = df["user_session"].fillna("unknown")
A value is trying to be set on a copy of a 

✅ 2020-Feb.parquet nettoyé et sauvegardé dans ../data/cleaned/2020-Feb.parquet
🔹 Nettoyage de 2020-Jan.parquet...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["category_code"] = df["category_code"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["brand"] = df["brand"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user_session"] = df["user_session"].fillna("unknown")
A value is trying to be set on a copy of a 

✅ 2020-Jan.parquet nettoyé et sauvegardé dans ../data/cleaned/2020-Jan.parquet
🔹 Nettoyage de 2020-Mar.parquet...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["category_code"] = df["category_code"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["brand"] = df["brand"].fillna("unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user_session"] = df["user_session"].fillna("unknown")
A value is trying to be set on a copy of a 

✅ 2020-Mar.parquet nettoyé et sauvegardé dans ../data/cleaned/2020-Mar.parquet
🎯 Tous les fichiers ont été nettoyés et stockés dans data/cleaned/ !
