In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
dataset_dir = "../dataset/"

## Dataset

**Articles :**

In [2]:
articles_df = pd.read_csv(dataset_dir + 'articles_metadata.csv')
print("DF shape :", articles_df.shape)
print("Number of unique articles : ", articles_df['article_id'].nunique())
print("Number of unique publishers : ", articles_df['publisher_id'].nunique())
print("Number of unique article categories : ", articles_df['category_id'].nunique())

print("\n===============================\nArticles metadata DataFrame:")
print(f"Features: {articles_df.columns}")
articles_df.head(1)

articles_df = articles_df.drop(columns=['publisher_id'])
articles_df.head(1)

DF shape : (364047, 5)
Number of unique articles :  364047
Number of unique publishers :  1
Number of unique article categories :  461

Articles metadata DataFrame:
Features: Index(['article_id', 'category_id', 'created_at_ts', 'publisher_id',
       'words_count'],
      dtype='object')


Unnamed: 0,article_id,category_id,created_at_ts,words_count
0,0,0,1513144419000,168


**Interactions utilisateurs :**

In [3]:
clicks_dir = os.path.join(dataset_dir, "clicks")
files = [file for file in os.listdir(clicks_dir)
         if file.startswith('clicks_hour_') and file.endswith('.csv')]
dfs = [pd.read_csv(os.path.join(clicks_dir, f)) for f in files]
clicks_df = pd.concat(dfs, ignore_index=True)
print("Shape: ", clicks_df.shape)
print("===============================\nClicks DataFrame:")
print(f"Features: {clicks_df.columns}")
clicks_df.head(1)

Shape:  (2988181, 12)
Clicks DataFrame:
Features: Index(['user_id', 'session_id', 'session_start', 'session_size',
       'click_article_id', 'click_timestamp', 'click_environment',
       'click_deviceGroup', 'click_os', 'click_country', 'click_region',
       'click_referrer_type'],
      dtype='object')


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2


### Features articles

In [None]:
def get_recency_weight(days_count):
    return 1 / (1 + days_count)

def build_article_features(articles_df, clicks_df):
    df = articles_df.copy()
    clicks = clicks_df.copy()

    # --- Temporal features ---
    # Convert timestamp to datetime
    clicks['click_time'] = pd.to_datetime(clicks['click_timestamp'], unit='ms', errors='coerce')
    df['created_at_time'] = pd.to_datetime(df['created_at_ts'], unit='ms', errors='coerce')

    # Age in days relative to dataset max timestamp (freshness)
    max_time = pd.concat([df['created_at_time'], clicks["click_time"]]).max()
    df['article_age_days'] = (max_time - df['created_at_time']).dt.days
    clicks['click_age_days'] = (max_time - clicks['click_time']).dt.days
    
    # Log-transform to reduce skew
    df['log_words_count'] = np.log1p(df['words_count'])
    df = df.drop(columns=['words_count'])

    # --- Popularity features (from clicks) ---
    article_clicks = clicks.groupby('click_article_id').agg(
        clicks=('user_id', 'count'),
        unique_users=('user_id', 'nunique'),
        avg_session_size=('session_size', 'mean'),
        last_click_days_ago=('click_age_days', 'min')
    ).reset_index()

    # Merge click stats
    df = df.merge(article_clicks, how='left', left_on='article_id', right_on='click_article_id')
    df.drop(columns=['created_at_ts', 'created_at_time', 'click_article_id',], inplace=True)

    # Fill missing popularity for cold-start articles
    for col in ['clicks', 'unique_users', 'avg_session_size']:
        df[col] = df[col].fillna(0)
    df.loc[df['last_click_days_ago'].isna(), 'last_click_days_ago'] = df.loc[df['last_click_days_ago'].isna(), 'article_age_days']

    df['article_age_weight'] = get_recency_weight(df['article_age_days'] )
    df['last_click_age_weight'] = get_recency_weight(df['last_click_days_ago'] )
    return df.drop(columns=['article_age_days', 'last_click_days_ago'])

  df[col] = df[col].fillna(0)


Unnamed: 0,article_id,category_id,log_words_count,clicks,unique_users,avg_session_size,article_age_weight,last_click_age_weight
0,0,0,5.129899,0.0,0.0,0.0,0.010989,0.010989
1,1,1,5.247024,0.0,0.0,0.0,0.000747,0.000747
2,2,1,5.525453,0.0,0.0,0.0,0.000769,0.000769
3,3,1,5.442418,1.0,1.0,2.0,0.000768,0.006452
4,4,1,5.09375,0.0,0.0,0.0,0.000759,0.000759


### Features utilisateurs & interactions

In [None]:
def build_interactions_features(clicks_df):
    df = clicks_df.copy()
    df['click_time'] = pd.to_datetime(df['click_timestamp'], unit='ms')

    max_time = df['click_time'].max()
    df['click_age_days'] = (max_time - df['click_time']).dt.days
    df["recency_weight"] = get_recency_weight(df['click_age_days'])

    user_item_interactions = (
        df.drop(columns=['click_timestamp']).groupby(['user_id', 'click_article_id'])
        .agg(click_count=('click_article_id', 'count'),
             last_click_days_ago=('click_age_days', 'min'),
             recency_weight=('recency_weight', 'sum'))
        .reset_index()
    )
    # Final interaction weight (tune as needed)
    user_item_interactions["interaction_weight"] = (
        user_item_interactions["click_count"] + user_item_interactions["recency_weight"]
    )
    return user_item_interactions


def build_user_features(clicks_df, articles_df):
    df = clicks_df.copy()
    df['click_time'] = pd.to_datetime(df['click_timestamp'], unit='ms')
    interactions = build_interactions_features(df)

    df = df.merge(interactions, how='left', on=['user_id', 'click_article_id'])

    # Aggréger les interactions des utilisateurs
    user_features = (
        df.merge(articles_df, how='left', left_on='click_article_id', right_on='article_id')
        .drop(columns='article_id')
        .groupby("user_id").agg(
            total_clicks=("click_article_id", "count"), # nombre de clicks
            total_sessions=("session_id", "nunique"), # nombre de sessions uniques
            avg_session_size=("session_size", "mean"), # duree moyenne de la session
            unique_articles=("click_article_id", "nunique"), # nombre d'articles distincts consultés
            unique_categories=("category_id", "nunique"), # nombre de catégories disctinctes consultées
            last_click_recency_weight=("recency_weight", "min") # date de la dernière intéraction
    ).reset_index())

    # Heures habituelles de consultation du site
    df["hour"] = df["click_time"].dt.hour
    user_pref_hour = (
        df.groupby("user_id")["hour"]
        .apply(lambda x: x.mode().iloc[0] if not x.mode().empty else -1)
        .reset_index(name="preferred_hour")
    )
    user_features = user_features.merge(user_pref_hour, on="user_id", how="left")

    # Appareils les plus frequemment utilisés
    def most_common(series):
        return series.mode().iloc[0] if not series.mode().empty else -1

    user_device = df.groupby("user_id")["click_deviceGroup"].agg(most_common).reset_index(name="main_device")
    user_os = df.groupby("user_id")["click_os"].agg(most_common).reset_index(name="main_os")

    # Merge everything
    user_features = (
        user_features
        .merge(user_device, on="user_id", how="left")
        .merge(user_os, on="user_id", how="left")
    )

    user_interactions_features = user_features.merge(user_item_interactions, how='left', on='user_id')
    return user_interactions_features

In [None]:
article_features = build_article_features(articles_df, clicks_df)
article_features.head()

In [18]:
user_interactions_features = build_user_features(clicks_df, articles_df)
user_interactions_features.head()

NameError: name 'user_item_interactions' is not defined

In [184]:
features = user_interactions_features.merge(article_features, how='left', left_on='click_article_id', right_on='article_id').drop(columns=['article_id'])
id_cols = ['user_id', 'click_article_id']

num_cols = [
    'log_words_count', 'clicks', 'unique_users',
    'article_age_weight', 'last_click_age_weight', 'total_clicks',
    'total_sessions', 'avg_session_size_x', 'avg_session_size_y', 'unique_articles',
    'unique_categories', 'last_click_recency_weight',
    'click_count', 'recency_weight', 'interaction_weight'
]

cat_cols = ['preferred_hour', 'main_device', 'main_os', 'category_id']

scaler = MinMaxScaler()
num_cols_scaled_df = pd.DataFrame(scaler.fit_transform(features[num_cols]), columns=num_cols)

oh_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
oh_encoded_features = oh_encoder.fit_transform(features[cat_cols])
oh_encoded_features_df = pd.DataFrame(oh_encoded_features, columns=oh_encoder.get_feature_names_out(cat_cols))

features = pd.concat([features[id_cols].reset_index(drop=True),
                     num_cols_scaled_df.reset_index(drop=True),
                     oh_encoded_features_df.reset_index(drop=True)], axis=1)

print("Shape:", features.shape)
print("Features:", features.columns)
features.head(1)

Shape: (2950710, 369)
Features: Index(['user_id', 'click_article_id', 'log_words_count', 'clicks',
       'unique_users', 'article_age_weight', 'last_click_age_weight',
       'total_clicks', 'total_sessions', 'avg_session_size_x',
       ...
       'category_id_448', 'category_id_449', 'category_id_450',
       'category_id_451', 'category_id_453', 'category_id_454',
       'category_id_455', 'category_id_456', 'category_id_458',
       'category_id_460'],
      dtype='object', length=369)


Unnamed: 0,user_id,click_article_id,log_words_count,clicks,unique_users,article_age_weight,last_click_age_weight,total_clicks,total_sessions,avg_session_size_x,...,category_id_448,category_id_449,category_id_450,category_id_451,category_id_453,category_id_454,category_id_455,category_id_456,category_id_458,category_id_460
0,0,68866,0.615875,0.040283,0.043639,0.717412,0.03367,0.004878,0.014563,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
