In [1]:
import pandas as pd 
import json
import os
import shutil
import dask.dataframe as dd
from dask.distributed import Client

In [None]:
# Start a Dask distributed client with a per-worker memory cap and a custom
# scratch directory.
# NOTE(review): '/path/to/directory' looks like a placeholder - confirm it
# points at a real, writable location before running.
client = Client(
    memory_limit='2GB',
    memory_target_fraction=0.6,
    local_directory='/path/to/directory',
)

2025-03-18 18:15:27,745 - distributed.scheduler - ERROR - Couldn't gather keys: {('repartitiontofewer-441198df06082af0cf55ab41d6e6ca66', 0): 'waiting'}
2025-03-18 18:15:48,298 - distributed.scheduler - ERROR - Couldn't gather keys: {('repartitiontofewer-441198df06082af0cf55ab41d6e6ca66', 0): 'waiting'}
2025-03-18 18:16:13,341 - distributed.scheduler - ERROR - Couldn't gather keys: {('repartitiontofewer-441198df06082af0cf55ab41d6e6ca66', 0): 'waiting'}
2025-03-18 18:16:47,770 - distributed.scheduler - ERROR - Couldn't gather keys: {('repartitiontofewer-441198df06082af0cf55ab41d6e6ca66', 0): 'waiting'}
2025-03-18 18:17:24,637 - distributed.scheduler - ERROR - Couldn't gather keys: {('repartitiontofewer-441198df06082af0cf55ab41d6e6ca66', 0): 'waiting'}
2025-03-18 18:17:52,168 - distributed.scheduler - ERROR - Couldn't gather keys: {('repartitiontofewer-441198df06082af0cf55ab41d6e6ca66', 0): 'waiting'}
2025-03-18 18:18:21,930 - distributed.scheduler - ERROR - Couldn't gather keys: {('repar

# Chargement des data sets

In [None]:
# Location of the input parquet dataset, relative to the notebook.
parquet_file = "../data/final_output.parquet"

# Lazily open the dataset as a Dask DataFrame.
data = dd.read_parquet(
    parquet_file,
    blocksize='15MB',
)

def get_first_n_percent(partition, rows_to_keep):
    """Return the leading ``rows_to_keep`` rows of ``partition``.

    Despite the name, no percentage is involved: ``rows_to_keep`` is passed
    straight to ``head`` as a row count.

    Parameters
    ----------
    partition : DataFrame-like
        Object exposing a ``head`` method.
    rows_to_keep : int
        Number of rows forwarded to ``head``.

    Returns
    -------
    Whatever ``partition.head`` returns for that row count.
    """
    leading_rows = partition.head(n=rows_to_keep)
    return leading_rows
# Keep only the leading rows of every partition (at most 20000 each).
data = data.map_partitions(
    get_first_n_percent,
    rows_to_keep=20000,
)

print("Nombre de partitions: {}".format(data.npartitions))

Nombre de partitions: 207


# Traitement des données

In [4]:
def add_action_reverse(df):
    """Add an ``action_reverse`` column counting rows backwards within each session.

    For every row, ``action_reverse`` is the number of rows with the same
    ``session`` value that come after it in ``df``: the last row of a session
    gets 0, the one before it 1, and so on.

    The column is written onto ``df`` itself and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``session`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the ``action_reverse`` column added (or overwritten).
    """
    # cumcount(ascending=False) numbers each group's rows from len(group) - 1
    # down to 0 and keeps the frame's own index, so every value stays attached
    # to its row. Do not go through apply(...).reset_index(drop=True): that
    # relabels the result 0..n-1 before the index-aligned assignment, which
    # misplaces values (or yields NaN) whenever the index is not a default
    # RangeIndex or the sessions are not sorted and contiguous.
    df["action_reverse"] = df.groupby("session").cumcount(ascending=False)
    return df

def add_session_length(df):
    """Add a ``session_length`` column holding each row's session size.

    Every row receives the number of rows in ``df`` that share its
    ``session`` value. The column is written onto ``df`` itself and the same
    object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``session`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the ``session_length`` column added (or overwritten).
    """
    sessions_grouped = df.groupby('session')['session']
    rows_per_session = sessions_grouped.transform('count')
    df['session_length'] = rows_per_session
    return df

def add_log_recency_score(df):
    """Add a ``log_recency_score`` column derived from the position in the session.

    A linear ramp is built from ``session_length`` and ``action_reverse``:
    it equals 0.1 when ``action_reverse == session_length - 1`` and 1 when
    ``action_reverse == 0``. The score is ``2 ** ramp - 1``. Rows where the
    computation yields NaN (e.g. ``session_length == 1``, where the slope
    divides by zero) are given a score of 1.

    The column is written onto ``df`` itself and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``session_length`` and ``action_reverse`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the ``log_recency_score`` column added (or overwritten).
    """
    length = df['session_length']
    # Slope of the ramp; becomes inf for one-row sessions (division by zero).
    slope = (1 - 0.1) / (length - 1)
    steps_from_start = length - df['action_reverse'] - 1
    ramp = 0.1 + slope * steps_from_start
    score = (2 ** ramp) - 1
    # inf * 0 gives NaN for one-row sessions; those rows get the top score.
    df['log_recency_score'] = score.fillna(1)
    return df

def add_type_weighted_log_recency_score(df):
    """Add ``type_weighted_log_recency_score``: the recency score divided by a per-type weight.

    The divisor is looked up from the integer ``type`` code: 0 -> 1, 1 -> 6,
    2 -> 3. Codes outside that table map to NaN, so their result is NaN.

    The column is written onto ``df`` itself and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``type`` and ``log_recency_score`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the ``type_weighted_log_recency_score`` column added
        (or overwritten).
    """
    weight_by_type = {0: 1, 1: 6, 2: 3}
    divisor = df['type'].map(weight_by_type)
    # NOTE(review): the score is divided by the weight, not multiplied -
    # confirm this is the intended direction.
    df['type_weighted_log_recency_score'] = df['log_recency_score'] / divisor
    return df

def apply_pipeline(df, pipeline):
    """Run ``df`` through each callable of ``pipeline`` in order.

    Each step receives the previous step's return value; the last return
    value is the result.

    Parameters
    ----------
    df : pandas.DataFrame or dask.dataframe.DataFrame
        Frame to transform.
    pipeline : iterable of callables
        Each callable takes a frame and returns a frame.

    Returns
    -------
    The value returned by the final step (``df`` itself when ``pipeline``
    is empty).

    Raises
    ------
    TypeError
        If ``df`` is neither a pandas nor a Dask DataFrame.
    """
    if not isinstance(df, (dd.DataFrame, pd.DataFrame)):
        raise TypeError("Input doit être un DataFrame Pandas ou Dask DataFrame")

    result = df
    for step in pipeline:
        result = step(result)
    return result
    
def process_partition(partition):
    """Encode ``type``, run the feature pipeline and return the columns in a fixed order.

    Steps:
      1. Map the ``type`` labels ('clicks', 'carts', 'orders') to the int8
         codes 0, 1, 2.
      2. Apply the module-level ``pipeline`` through ``apply_pipeline``.
      3. Check that every expected column exists and return them in a fixed
         order.

    The input ``partition`` is left untouched: the encoding is written to a
    new frame, so calling this function again on the same partition gives the
    same result.

    Parameters
    ----------
    partition : pandas.DataFrame
        Must contain ``session``, ``aid``, ``ts`` and a ``type`` column
        holding the labels listed above.

    Returns
    -------
    pandas.DataFrame
        Frame restricted to the expected columns, in their fixed order.

    Raises
    ------
    ValueError
        If ``type`` contains a value outside the known labels (including
        missing values), or if a column is missing after the pipeline ran.
    """
    type_mapping = {
        'clicks': 0,
        'carts': 1,
        'orders': 2
    }
    encoded_type = partition['type'].map(type_mapping)

    # Unknown labels map to NaN; report them explicitly rather than letting
    # the int8 cast below fail with an unrelated-looking conversion error.
    unknown_rows = encoded_type.isna()
    if unknown_rows.any():
        unknown_values = partition.loc[unknown_rows, 'type'].unique().tolist()
        raise ValueError(f"Valeurs de 'type' inconnues: {unknown_values}")

    # assign() returns a new frame, so neither the encoding nor the columns
    # added by the pipeline steps leak back into the caller's partition.
    partition = partition.assign(type=encoded_type.astype('int8'))
    partition = apply_pipeline(partition, pipeline)

    expected_columns = ['session', 'action_reverse', 'session_length', 'log_recency_score',
                        'type_weighted_log_recency_score', 'aid', 'ts', 'type']
    missing_columns = [col for col in expected_columns if col not in partition.columns]
    if missing_columns:
        raise ValueError(f"Colonnes manquantes après transformation: {missing_columns}")
    return partition[expected_columns]

In [5]:
# Feature steps, applied in this order. The order matters:
# add_log_recency_score reads the columns written by the two steps before it,
# and add_type_weighted_log_recency_score reads log_recency_score.
pipeline = [
    add_action_reverse,
    add_session_length,
    add_log_recency_score,
    add_type_weighted_log_recency_score,
]

In [6]:
# Output schema (column -> dtype) handed to map_partitions as ``meta``.
# Column order matches the order returned by process_partition.
# NOTE(review): pandas group counts and float arithmetic normally produce
# int64 / float64 - confirm the int32 / float32 entries below match what
# process_partition actually returns.
meta = {
    'session': 'int32',
    'action_reverse': 'int32',
    'session_length': 'int32',
    'log_recency_score': 'float32',
    'type_weighted_log_recency_score': 'float32',
    'aid': 'int32',
    # ts holds 13-digit values (e.g. 1660859207339) that do not fit in int32.
    'ts': 'int64',
    'type': 'int8'
}

In [7]:
# Apply process_partition to every partition; ``meta`` declares the
# resulting columns and dtypes.
df_processed = data.map_partitions(
    process_partition,
    meta=meta,
)

In [None]:
# Write the processed frame to disk. The previous target, ``output``, is not
# defined anywhere in this notebook, so a fresh Restart & Run All stopped here
# with a NameError.
# NOTE(review): if an intermediate step (e.g. a repartition) was meant to
# produce ``output``, re-add it explicitly before this write.
df_processed.to_parquet('dataframe.parquet', engine='pyarrow')

In [11]:
# Re-open the parquet dataset written above as a Dask DataFrame.
data_2 = dd.read_parquet(
    'dataframe.parquet',
    blocksize='15MB',
)

In [15]:
# Display the last rows of the re-loaded dataset as a sanity check.
data_2.tail()

Unnamed: 0,session,action_reverse,session_length,log_recency_score,type_weighted_log_recency_score,aid,ts,type
19995,3747760,4,119,0.95815,0.95815,29735,1660859207339,0
19996,3747760,3,119,0.96853,0.161422,29735,1660859222698,1
19997,3747760,2,119,0.978965,0.978965,29735,1661187445585,0
19998,3747760,1,119,0.989454,0.164909,29735,1661187500843,1
19999,3747760,0,119,1.0,1.0,29735,1661187583074,0
