In [3]:
import pandas as pd
import pyarrow.parquet as pq

In [4]:
def clean_data(df):
    """Effectue le nettoyage et l'optimisation des données."""
    df = df.copy()  # Éviter SettingWithCopyWarning

    # Suppression des doublons
    df.drop_duplicates(inplace=True)

    # Gestion des valeurs manquantes
    for col in ["category_code", "brand", "user_session"]:
        df[col] = df[col].fillna("unknown")

    # Optimisation des types de données
    type_mapping = {
        "price": "float32",
        "product_id": "int32",
        "category_id": "int32",
        "user_id": "int32"
    }

    for col, dtype in type_mapping.items():
        df[col] = df[col].astype(dtype)

    return df

def process_parquet_file(parquet_path, cleaned_path, batch_size=500_000):
    """Charge, nettoie et enregistre un fichier Parquet en mode batch pour éviter les problèmes de mémoire."""
    parquet_file = pq.ParquetFile(parquet_path)
    chunk_list = []

    for batch in parquet_file.iter_batches(batch_size=batch_size):
        chunk = batch.to_pandas()
        chunk_cleaned = clean_data(chunk)
        chunk_list.append(chunk_cleaned)

    df_cleaned = pd.concat(chunk_list, ignore_index=True)
    df_cleaned.to_parquet(cleaned_path, index=False)

In [1]:
import pandas as pd

# Charger un fichier Parquet
df = pd.read_parquet("../data/cleaned/2019-Dec.parquet")

# Afficher les 5 premières lignes
print(df.head())


                event_time event_type  product_id  category_id  \
0  2019-12-01 00:00:00 UTC       view     1005105   1451229556   
1  2019-12-01 00:00:00 UTC       view    22700068     16777546   
2  2019-12-01 00:00:01 UTC       view     2402273    553648671   
3  2019-12-01 00:00:02 UTC   purchase    26400248    -50331391   
4  2019-12-01 00:00:02 UTC       view    20100164   1283457772   

                   category_code    brand        price    user_id  \
0       construction.tools.light    apple  1302.479980  556695836   
1                        unknown    force   102.959999  577702456   
2   appliances.personal.massager    bosch   313.519989  539453785   
3  computers.peripherals.printer  unknown   132.309998  535135317   
4               apparel.trousers     nika   101.680000  517987650   

                           user_session  
0  ca5eefc5-11f9-450c-91ed-380285a0bc80  
1  de33debe-c7bf-44e8-8a12-3bf8421f842a  
2  5ee185a7-0689-4a33-923d-ba0130929a76  
3  61792a26-672f-4e6

In [6]:
process_parquet_file("../data/processed/2019-Dec.parquet", "../data/cleaned/2019-Dec.parquet", batch_size=500_000)