In [1]:
import pandas as pd
import json
import os
import shutil
import dask.dataframe as dd
from dask.distributed import Client,default_client
import joblib

In [2]:
# Arrêter les anciennes sessions et planificateurs
try:
    current_client = default_client()
    current_client.close()  # Ferme toutes les connexions actives
except ValueError:
    pass  # Si aucune session n'est active, ignore cette erreur

# Supprimer tous les fichiers temporaires et anciens clients
import shutil
import os

# Supprimez les fichiers temporaires générés par Dask (comme les fichiers .dask-worker-space)
dask_temp_dir = os.path.join(os.getcwd(), 'data/dask-scratch-space')
if os.path.exists(dask_temp_dir):
    shutil.rmtree(dask_temp_dir)  # Supprimer le dossier de l'espace de travail Dask



In [3]:
client = Client(memory_limit='2GB', memory_target_fraction=0.6, local_directory='data')

# Chargement des data sets

In [22]:
parquet_file = "../data/final_output.parquet"
data = dd.read_parquet(parquet_file, blocksize='15MB')

def get_first_n_percent(partition, rows_to_keep):
    return partition.head(rows_to_keep)
data = data.map_partitions(get_first_n_percent, rows_to_keep=200000)

print(f"Nombre de partitions: {data.npartitions}")

Nombre de partitions: 207


In [None]:
import pyarrow.parquet as pq

# Charger le fichier Parquet avec pyarrow pour inspecter sa structure
parquet_file_obj = pq.ParquetFile(parquet_file)
print(f"Nombre de lignes dans le fichier Parquet: {parquet_file_obj.metadata.num_rows}")
print(f"Nombre de colonnes dans le fichier Parquet: {parquet_file_obj.metadata.num_columns}")
print(f"Partitionnement du fichier : {parquet_file_obj.num_row_groups}")


Nombre de lignes dans le fichier Parquet: 216716096
Nombre de colonnes dans le fichier Parquet: 4
Partitionnement du fichier : 207


In [2]:
test_file = "../data/test.jsonl"
with open(test_file, "r", encoding="utf-8") as f:
    df = [json.loads(line) for line in f]
flattened_data = []
for record in df:
    session_id = record["session"]
    for event in record["events"]:
        event["session"] = session_id
        flattened_data.append(event)

panda_df = pd.DataFrame(flattened_data)
test_df = dd.from_pandas(panda_df, npartitions=300)

In [None]:
test_file = "./data/test.jsonl"
with open(test_file, "r", encoding="utf-8") as f:
    df = [json.loads(line) for line in f]
flattened_data = []
for record in df:
    session_id = record["session"]
    for event in record["events"]:
        event["session"] = session_id
        flattened_data.append(event)

panda_df = pd.DataFrame(flattened_data)

In [None]:
panda_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6928123 entries, 0 to 6928122
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   aid      int64 
 1   ts       int64 
 2   type     object
 3   session  int64 
dtypes: int64(3), object(1)
memory usage: 211.4+ MB


In [None]:
parquet_file_test = "./data/test_output.parquet"
data_test = dd.read_parquet(parquet_file, blocksize='15MB')

def get_first_n_percent(partition, rows_to_keep):
    return partition.head(rows_to_keep)
data_test = data_test.map_partitions(get_first_n_percent, rows_to_keep=20000)

print(f"Nombre de partitions: {data_test.npartitions}")

Nombre de partitions: 1


In [None]:
import pyarrow.parquet as pq

# Charger le fichier Parquet avec pyarrow pour inspecter sa structure
parquet_file_obj = pq.ParquetFile(parquet_file_test)
print(f"Nombre de lignes dans le fichier Parquet: {parquet_file_obj.metadata.num_rows}")
print(f"Nombre de colonnes dans le fichier Parquet: {parquet_file_obj.metadata.num_columns}")
print(f"Partitionnement du fichier : {parquet_file_obj.num_row_groups}")


Nombre de lignes dans le fichier Parquet: 6928123
Nombre de colonnes dans le fichier Parquet: 4
Partitionnement du fichier : 7


In [None]:
data_test.head(10)

Unnamed: 0,aid,ts,type,session
0,59625,1661724000278,clicks,12899779
1,1142000,1661724000378,clicks,12899780
2,582732,1661724058352,clicks,12899780
3,973453,1661724109199,clicks,12899780
4,736515,1661724136868,clicks,12899780
5,1142000,1661724155248,clicks,12899780
6,141736,1661724000559,clicks,12899781
7,199008,1661724022851,clicks,12899781
8,57315,1661724170835,clicks,12899781
9,194067,1661724246188,clicks,12899781


In [26]:
parquet_file = "./test_labels.parquet"
labels = dd.read_parquet(parquet_file)

In [28]:
len(labels)

2212692

# Traitement des données

In [3]:
def add_action_reverse(df):
    df["action_reverse"] = df.groupby("session").cumcount()
    df["action_reverse"] = df.groupby("session")["action_reverse"].apply(lambda x: x.max() - x ).reset_index(drop=True) #, meta=('action_reverse', 'int32'))
    return df

def add_session_length(df):
    df['session_length'] = df.groupby('session')['session'].transform('count') #, meta=('session_length', 'int32'))
    return df

def add_log_recency_score(df):
    linear_interpolation = 0.1 + ((1 - 0.1) / (df['session_length'] - 1)) * (df['session_length'] - df['action_reverse'] - 1)
    df['log_recency_score'] = (2 ** linear_interpolation) - 1
    df['log_recency_score'] = df['log_recency_score'].fillna(1)
    return df

def add_type_weighted_log_recency_score(df):
    type_weights = {0: 1, 1: 6, 2: 3}
    df['type_weighted_log_recency_score'] = df['log_recency_score'] / df['type'].map(type_weights)
    return df

def apply_pipeline(df, pipeline):
    if isinstance(df, (dd.DataFrame, pd.DataFrame)):
        for f in pipeline:
            df = f(df)
        return df
    else:
        raise TypeError("Input doit être un DataFrame Pandas ou Dask DataFrame")

def process_partition(partition):
    type_mapping = {
        'clicks': 0,
        'carts': 1,
        'orders': 2
    }
    partition['type'] = partition['type'].map(type_mapping).astype('int8')
    partition['ts'] = partition['ts'].astype('int64')

    partition = apply_pipeline(partition, pipeline)
    partition = partition.fillna(0)
    partition = partition.astype({
        'session': 'int32',
        'action_reverse': 'int32',
        'session_length': 'int32',
        'log_recency_score': 'float32',
        'type_weighted_log_recency_score': 'float32',
        'aid': 'int32',
        'ts': 'int64',
        'type': 'int8'
    })

    expected_columns = ['session', 'action_reverse', 'session_length', 'log_recency_score',
                        'type_weighted_log_recency_score', 'aid', 'ts', 'type']

    missing_columns = [col for col in expected_columns if col not in partition.columns]
    if missing_columns:
        raise ValueError(f"Colonnes manquantes après transformation: {missing_columns}")

    return partition[expected_columns]

In [4]:
pipeline = [add_action_reverse, add_session_length, add_log_recency_score, add_type_weighted_log_recency_score]

In [5]:
meta = {
    'session': 'int32',
    'action_reverse': 'int32',
    'session_length': 'int32',
    'log_recency_score': 'float32',
    'type_weighted_log_recency_score': 'float32',
    'aid': 'int32',
    'ts': 'int64',
    'type': 'int8'
}

In [23]:
df_processed = data.map_partitions(process_partition, meta=meta)

In [6]:
test_processed = test_df.map_partitions(process_partition, meta=meta)

In [7]:
test_processed.to_parquet('final_testset.parquet', engine='pyarrow')

In [24]:
df_processed.to_parquet('dataframe_4.parquet', engine='pyarrow')

In [25]:
train = dd.read_parquet('dataframe_4.parquet', blocksize='15MB')
len(train)

41400000

In [None]:
train = dd.read_parquet('dataframe_1.parquet', blocksize='15MB')

In [8]:
test = dd.read_parquet('final_testset.parquet', blocksize='15MB')
test.head()

Unnamed: 0,session,action_reverse,session_length,log_recency_score,type_weighted_log_recency_score,aid,ts,type
0,12899779,0,1,1.0,1.0,59625,1661724000278,0
1,12899780,4,5,0.071773,0.071773,1142000,1661724000378,0
2,12899780,3,5,0.252664,0.252664,582732,1661724058352,0
3,12899780,2,5,0.464086,0.464086,973453,1661724109199,0
4,12899780,1,5,0.71119,0.71119,736515,1661724136868,0


In [None]:
train.tail(20)

Unnamed: 0,session,action_reverse,session_length,log_recency_score,type_weighted_log_recency_score,aid,ts,type
19980,3747760,19,119,0.808865,0.808865,29735,1660758692329,0
19981,3747760,18,119,0.818453,0.136409,29735,1660758699999,1
19982,3747760,17,119,0.828092,0.828092,29735,1660758746118,0
19983,3747760,16,119,0.837783,0.837783,310546,1660758780780,0
19984,3747760,15,119,0.847524,0.141254,310546,1660758839293,1
19985,3747760,14,119,0.857317,0.857317,823143,1660759391052,0
19986,3747760,13,119,0.867162,0.144527,823143,1660759398244,1
19987,3747760,12,119,0.87706,0.87706,823143,1660759421569,0
19988,3747760,11,119,0.88701,0.88701,310546,1660759479681,0
19989,3747760,10,119,0.897012,0.897012,493104,1660859048661,0


In [None]:
df_test_processed = data_test.map_partitions(process_partition, meta=meta)

In [None]:
df_test_processed.head()

Unnamed: 0,session,action_reverse,session_length,log_recency_score,type_weighted_log_recency_score,aid,ts,type
0,5899776,3,4,0.071773,0.071773,1489275,1660039772288,0
1,5899776,2,4,0.319508,0.319508,1826552,1660043110728,0
2,5899776,1,4,0.624505,0.624505,1632206,1660048043858,0
3,5899776,0,4,1.0,1.0,1531634,1660048104470,0
4,5899777,1,2,0.071773,0.071773,1086210,1660039772327,0


In [None]:
print(df_test_processed.dtypes)

session                              int32
action_reverse                       int32
session_length                       int32
log_recency_score                  float32
type_weighted_log_recency_score    float32
aid                                  int32
ts                                   int32
type                                  int8
dtype: object


In [None]:
import pyarrow as pa

schema = pa.schema([
    ('session',pa.int32()),
    ('action_reverse', pa.int32()),
    ('session_length',pa.int32()),
    ('log_recency_score',pa.float32()),
    ('type_weighted_log_recency_score',pa.float32()),
    ('aid',pa.int32()),
    ('ts',pa.int32()),
    ('type',pa.int8())
])

In [None]:
df_test_processed = df_test_processed.astype({
    'session': 'int32',
    'action_reverse': 'int32',
    'session_length': 'int32',
    'log_recency_score': 'float32',
    'type_weighted_log_recency_score': 'float32',
    'aid': 'int32',
    'ts': 'int32',
    'type': 'int8'
})


In [None]:
df_test_processed.to_parquet('dataframe_test.parquet',engine='pyarrow',schema=schema)

In [None]:
test = dd.read_parquet('dataframe_test.parquet', blocksize='15MB')

# Ground truth

In [29]:
type2id = {"clicks": 0, "carts": 1, "orders": 2}

df_train_labels = labels.explode('ground_truth')

df_train_labels['aid'] = df_train_labels['ground_truth']
df_train_labels['type'] = df_train_labels['type'].map(type2id)
df_train_labels = df_train_labels[['session', 'type', 'aid']]

df_train_labels['session'] = df_train_labels['session'].astype('int32')
df_train_labels['type'] = df_train_labels['type'].astype('uint8')
df_train_labels['aid'] = df_train_labels['aid'].astype('int32')

df_train_labels['gt'] = 1

df_train = train.merge(df_train_labels, on=['session', 'type', 'aid'], how='left')

df_train['gt'] = df_train['gt'].fillna(0).astype('uint8')

#train.to_parquet('train_processed.parquet', write_index=False)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('type', 'float64'))

+------------------+------------+-------------+
| Merge columns    | left dtype | right dtype |
+------------------+------------+-------------+
| ('type', 'type') | int8       | uint8       |
+------------------+------------+-------------+
Cast dtypes explicitly to avoid unexpected results.


In [30]:
df_train.head()

Unnamed: 0,session,action_reverse,session_length,log_recency_score,type_weighted_log_recency_score,aid,ts,type,gt
0,5899776,3,4,0.071773,0.071773,1489275,1660039772288,0,0
1,5899776,2,4,0.319508,0.319508,1826552,1660043110728,0,0
2,5899776,1,4,0.624505,0.624505,1632206,1660048043858,0,0
3,5899776,0,4,1.0,1.0,1531634,1660048104470,0,0
4,5899777,1,2,0.071773,0.071773,1086210,1660039772327,0,0


In [31]:
def get_session_lengths(df):
    return df.groupby('session')['session'].count().compute().to_numpy()

In [None]:
session_lengths_train = get_session_lengths(df_train)

In [None]:
session_lengths_train

array([10, 39, 34, ...,  3, 12, 50], shape=(239072,))

# Model training

In [None]:
from lightgbm.sklearn import LGBMRanker

In [None]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=20,
    importance_type='gain',
)

In [None]:
feature_cols = ['aid', 'type', 'action_reverse', 'session_length', 'log_recency_score', 'type_weighted_log_recency_score']
target = 'gt'

In [None]:
df_train_pd = df_train[feature_cols].compute()
target_pd = df_train[target].compute()

In [None]:
ranker = ranker.fit(
    df_train_pd,
    target_pd,
    group=session_lengths_train,
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.336858 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1278
[LightGBM] [Info] Number of data points in the train set: 4140000, number of used features: 6


In [None]:
joblib.dump(ranker, "lightgbm_ranker.pkl")

['lightgbm_ranker.pkl']

In [None]:
ranker = joblib.load("lightgbm_ranker.pkl")