# Chargement des librairies

In [1]:
import os
import sys

import logging
import random
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

!pip install plotly
import plotly.express as px
import plotly.io as pio

from pandas.api.types import is_numeric_dtype
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import StandardScaler

!pip install surprise
from surprise import (
    NMF,
    SVD,
    BaselineOnly,
    CoClustering,
    Dataset,
    KNNBaseline,
    KNNBasic,
    KNNWithMeans,
    KNNWithZScore,
    NormalPredictor,
    Reader,
    SlopeOne,
    SVDpp,
)
from surprise.model_selection import (
    GridSearchCV,
    cross_validate,
    train_test_split,
)
from tqdm import tqdm

pio.renderers.default = "iframe"

# Plotly as Pandas plotting backend
pd.options.plotting.backend = "plotly"

# Surprise ratings reader
reader = Reader(rating_scale=(0, 1))

Collecting plotly
  Downloading plotly-5.11.0-py2.py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tenacity>=6.2.0
  Downloading tenacity-8.1.0-py3-none-any.whl (23 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.11.0 tenacity-8.1.0
[0mCollecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-linux_x86_64.whl size=3194051 sha256=e305e4b1b6a2d349a85dfbf

# Fonctions utilitaires

In [2]:
def reduce_dataframe_memory_usage(
    df: pd.DataFrame,
    high_precision: bool = False,
) -> pd.DataFrame:
    """
    Iterate through all the columns of a dataframe and modify the data type to
    reduce memory usage.
    Args:
        df (pd.DataFrame): dataframe to reduce memory usage.
        high_precision (bool): If True, use 64-bit floats instead of 32-bit
    Returns:
        pd.DataFrame: dataframe with reduced memory usage.
    """
    start_mem = round(df.memory_usage().sum() / 1024**2, 2)
    logging.info("Memory usage of dataframe is %d MB", start_mem)

    # Iterate through columns
    for col in df.columns:
        if df[col].dtype == "object":
            # "object" dtype
            if df[col].nunique() < max(100, df.shape[0] / 100):
                # If number of unique values is less than max(100, 1%)
                df[col] = df[col].astype("category")
            else:
                # If number of unique values is greater than max(100, 1%)
                df[col] = df[col].astype("string")

        elif str(df[col].dtype)[:3] == "int":
            # "int" dtype
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                df[col] = df[col].astype("UInt8")
            elif c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype("Int8")
            elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                df[col] = df[col].astype("UInt16")
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype("Int16")
            elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                df[col] = df[col].astype("UInt32")
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype("Int32")
            elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                df[col] = df[col].astype("UInt64")
            else:
                df[col] = df[col].astype("Int64")

        elif str(df[col].dtype)[:5] == "float":
            # "float" dtype
            c_min = df[col].min()
            c_max = df[col].max()
            if (
                not high_precision
                and c_min > np.finfo(np.float32).min
                and c_max < np.finfo(np.float32).max
            ):
                df[col] = df[col].astype("float32")
            else:
                df[col] = df[col].astype("float64")

    end_mem = round(df.memory_usage().sum() / 1024**2, 2)
    logging.info("Memory usage after optimization is %d MB", end_mem)
    if start_mem > 0:
        logging.info(
            "Decreased by %d %%", round(100 * (start_mem - end_mem) / start_mem)
        )

    return df

# Chargement des fichiers

In [3]:
DATA_PATH = "/storage/P9"

CACHE = dict()

NUM_EMBEDDINGS = 250

articles = pd.concat(
    [
        pd.read_csv(
            Path(DATA_PATH, "articles_metadata.csv"),
            parse_dates=["created_at_ts"],
            date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
            dtype={
                "article_id": "category",
                "category_id": "category",
                "publisher_id": "category",
                "words_count": "int",
            },
        ),
        pd.DataFrame(
            pd.read_pickle(Path(DATA_PATH, "articles_embeddings.pickle")),
            columns=["embedding_" + str(i) for i in range(NUM_EMBEDDINGS)],
        ),
    ],
    axis=1,
)

articles = reduce_dataframe_memory_usage(
    articles.astype({"created_at_ts": "datetime64[ns]"})
)

articles_sample = articles.sample(frac=0.01, random_state=42)

articles.describe(include="all", datetime_is_numeric=True)

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
count,364047.0,364047.0,364047,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,...,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0
unique,364047.0,461.0,,1.0,,,,,,,...,,,,,,,,,,
top,0.0,281.0,,0.0,,,,,,,...,,,,,,,,,,
freq,1.0,12817.0,,364047.0,,,,,,,...,,,,,,,,,,
mean,,,2016-09-16 23:57:17.328421888,,190.897727,-0.238645,-0.963335,0.118548,-0.279295,-0.068579,...,-0.133287,-0.081912,-0.060347,0.023003,0.076947,0.084603,0.062819,0.099768,0.155917,-0.041094
min,,,2006-09-27 11:14:35,,0.0,-0.991183,-0.996455,-0.968431,-0.994966,-0.994489,...,-0.990412,-0.989408,-0.990432,-0.993626,-0.989042,-0.996902,-0.992921,-0.984733,-0.976071,-0.988213
25%,,,2015-10-15 16:00:43.500000,,159.0,-0.620072,-0.974056,-0.289953,-0.718816,-0.503425,...,-0.547684,-0.445079,-0.479989,-0.404508,-0.248653,-0.267072,-0.306548,-0.313598,-0.201402,-0.420694
50%,,,2017-03-13 16:27:29,,186.0,-0.302581,-0.967605,0.124339,-0.391535,-0.093734,...,-0.175781,-0.094113,-0.078034,0.000726,0.105649,0.133525,0.083315,0.128757,0.188355,-0.015232
75%,,,2017-11-05 14:09:11,,218.0,0.098015,-0.959061,0.545112,0.10832,0.345024,...,0.250641,0.270006,0.341105,0.459386,0.417347,0.461466,0.441831,0.531453,0.538111,0.334226
max,,,2018-03-13 12:12:30,,6690.0,0.983694,-0.514728,0.998341,0.978092,0.996798,...,0.996401,0.981789,0.991332,0.995299,0.978823,0.989324,0.991445,0.997583,0.990507,0.968462


In [4]:
clicks = pd.concat(
    [
        pd.read_csv(
            click_file_path,
            parse_dates=["session_start", "click_timestamp"],
            date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
            dtype={
                "user_id": "category",
                "session_id": "category",
                "session_size": "int",
                "click_article_id": "category",
                "click_environment": "category",
                "click_deviceGroup": "category",
                "click_os": "category",
                "click_country": "category",
                "click_region": "category",
                "click_referrer_type": "category",
            },
        ).replace(
            {
                "click_environment": {
                    "1": "1 - Facebook Instant Article",
                    "2": "2 - Mobile App",
                    "3": "3 - AMP (Accelerated Mobile Pages)",
                    "4": "4 - Web",
                },
                "click_deviceGroup": {
                    "1": "1 - Tablet",
                    "2": "2 - TV",
                    "3": "3 - Empty",
                    "4": "4 - Mobile",
                    "5": "5 - Desktop",
                },
                "click_os": {
                    "1": "1 - Other",
                    "2": "2 - iOS",
                    "3": "3 - Android",
                    "4": "4 - Windows Phone",
                    "5": "5 - Windows Mobile",
                    "6": "6 - Windows",
                    "7": "7 - Mac OS X",
                    "8": "8 - Mac OS",
                    "9": "9 - Samsung",
                    "10": "10 - FireHbbTV",
                    "11": "11 - ATV OS X",
                    "12": "12 - tvOS",
                    "13": "13 - Chrome OS",
                    "14": "14 - Debian",
                    "15": "15 - Symbian OS",
                    "16": "16 - BlackBerry OS",
                    "17": "17 - Firefox OS",
                    "18": "18 - Android",
                    "19": "19 - Brew MP",
                    "20": "20 - Chromecast",
                    "21": "21 - webOS",
                    "22": "22 - Gentoo",
                    "23": "23 - Solaris",
                },
            }
        )
        for click_file_path in tqdm(
            sorted(Path(DATA_PATH, "clicks").glob("clicks_hour_*.csv"))
        )
    ],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

clicks = reduce_dataframe_memory_usage(
    clicks.astype(
        {"session_start": "datetime64[ns]", "click_timestamp": "datetime64[ns]"}
    )
)

clicks.describe(include="all", datetime_is_numeric=True)

100%|██████████| 385/385 [00:43<00:00,  8.87it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,,,46033.0,,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,,,160974.0,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,,,37213.0,,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,2017-10-08 14:17:08.013157120,3.901885,,2017-10-08 14:51:05.106516736,,,,,,
min,,,2017-10-01 02:37:03,2.0,,2017-10-01 03:00:00,,,,,,
25%,,,2017-10-04 13:35:52,2.0,,2017-10-04 14:20:52,,,,,,
50%,,,2017-10-08 20:09:00,3.0,,2017-10-08 20:35:30,,,,,,
75%,,,2017-10-11 19:16:54,4.0,,2017-10-11 19:43:24,,,,,,
max,,,2017-10-17 03:36:19,124.0,,2017-11-13 20:04:14,,,,,,


# Modélisation

## Création de la mesure ratings

In [5]:
def get_ratings_from_clicks(clicks):
    count_user_article_clicks = (
        clicks.reset_index()
        .groupby(["user_id", "click_article_id"])
        .agg(
            COUNT_user_article_clicks=("index", "count"),
        )
    )

    count_user_clicks = (
        clicks.reset_index()
        .groupby(["user_id"])
        .agg(
            COUNT_user_clicks=("index", "count"),
        )
    )

    ratings = count_user_article_clicks.join(count_user_clicks, on="user_id")
    ratings["rating"] = (
        ratings["COUNT_user_article_clicks"] / ratings["COUNT_user_clicks"]
    )

    ratings = reduce_dataframe_memory_usage(
        ratings["rating"]
        .reset_index()
        .rename({"click_article_id": "article_id"}, axis=1)
    )

    return ratings


ratings = get_ratings_from_clicks(clicks)

ratings_sample = ratings.sample(frac=0.01, random_state=42)

ratings

Unnamed: 0,user_id,article_id,rating
0,0,157541,0.125000
1,0,160158,0.125000
2,0,233470,0.125000
3,0,313996,0.125000
4,0,68866,0.125000
...,...,...,...
2950705,99998,64329,0.071429
2950706,99999,168784,0.250000
2950707,99999,225055,0.250000
2950708,99999,272143,0.250000


## Recherche du meilleur modèle avec cross validation

In [6]:
sample_frac = 0.5

data = Dataset.load_from_df(ratings.sample(frac=sample_frac, random_state=42), reader)

for algo in [
    NormalPredictor(),  # bad results
    BaselineOnly(),
    #KNNBasic(), # uses too much memory
    #KNNWithMeans(), # uses too much memory
    #KNNWithZScore(), # uses too much memory
    #KNNBaseline(), # uses too much memory
    SVD(),
    SVDpp(),
    NMF(),  # long training time
    SlopeOne(),  # uses too much memory
    CoClustering(),  #
]:
    print("v" * (len(algo.__class__.__name__) + 4))
    print("> " + algo.__class__.__name__ + " <")
    print("^" * (len(algo.__class__.__name__) + 4))

    # Run 5-fold cross-validation and print results
    cross_validate(algo, data, verbose=True)
    print()

vvvvvvvvvvvvvvvvvvv
> NormalPredictor <
^^^^^^^^^^^^^^^^^^^
Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1694  0.1691  0.1691  0.1694  0.1692  0.1692  0.0001  
MAE (testset)     0.1268  0.1265  0.1266  0.1267  0.1268  0.1267  0.0001  
Fit time          2.72    3.50    3.51    3.46    3.49    3.34    0.31    
Test time         3.25    3.96    3.55    3.38    3.32    3.49    0.26    

vvvvvvvvvvvvvvvv
> BaselineOnly <
^^^^^^^^^^^^^^^^
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1095  0.1093  0.1094  0.1094  0.1089  0.1093  0.0002  
MAE (testset)     0.0713  0.0712  0.0712  0.0712  0.0710  0.0712  0.0001  
Fit 

## Optimisation d'hyper-paramètres pour le meilleur modèle

In [7]:
sample_frac = 0.5

data = Dataset.load_from_df(ratings.sample(frac=sample_frac, random_state=42), reader)


param_grid = {
    "n_cltr_u": [5, 10, 15, 20, 50],  # 100
    "n_cltr_i": [5, 10, 15, 25, 50, 100],  # 100
    "n_epochs": [20, 50, 100],  # 20
}
grid = GridSearchCV(
    CoClustering,
    param_grid,
    measures=["rmse", "mae"],
    cv=3,
    refit=True,
    return_train_measures=True,
    n_jobs=-2,
    joblib_verbose=9,
)

grid.fit(data)

# best RMSE score
print(grid.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(grid.best_params["rmse"])

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=-2)]: Done   8 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-2)]: Done  22 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-2)]: Done  40 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-2)]: Done  62 tasks      | elapsed: 36.6min
[Parallel(n_jobs=-2)]: Done  88 tasks      | elapsed: 50.6min
[Parallel(n_jobs=-2)]: Done 118 tasks      | elapsed: 73.7min
[Parallel(n_jobs=-2)]: Done 152 tasks      | elapsed: 96.3min
[Parallel(n_jobs=-2)]: Done 190 tasks      | elapsed: 124.8min
[Parallel(n_jobs=-2)]: Done 232 tasks      | elapsed: 162.6min
[Parallel(n_jobs=-2)]: Done 270 out of 270 | elapsed: 212.9min finished


0.1498692011235804
{'n_cltr_u': 5, 'n_cltr_i': 5, 'n_epochs': 20}


In [6]:
sample_frac = 0.5

data = Dataset.load_from_df(ratings.sample(frac=sample_frac, random_state=42), reader)


param_grid = {
    "n_cltr_u": [5, 6, 7, 8, 9],  # 100
    "n_cltr_i": [5, 6, 7, 8, 9],  # 100
    "n_epochs": [20, 50, 100],  # 20
}
grid2 = GridSearchCV(
    CoClustering,
    param_grid,
    measures=["rmse", "mae"],
    cv=3,
    refit=True,
    return_train_measures=True,
    n_jobs=-2,
    joblib_verbose=9,
)

grid2.fit(data)

# best RMSE score
print(grid2.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(grid2.best_params["rmse"])

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=-2)]: Done   8 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-2)]: Done  22 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-2)]: Done  40 tasks      | elapsed: 19.1min
[Parallel(n_jobs=-2)]: Done  62 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-2)]: Done  88 tasks      | elapsed: 43.3min
[Parallel(n_jobs=-2)]: Done 118 tasks      | elapsed: 58.9min
[Parallel(n_jobs=-2)]: Done 152 tasks      | elapsed: 76.6min
[Parallel(n_jobs=-2)]: Done 190 tasks      | elapsed: 96.7min
[Parallel(n_jobs=-2)]: Done 225 out of 225 | elapsed: 116.5min finished


0.1497741882401353
{'n_cltr_u': 5, 'n_cltr_i': 5, 'n_epochs': 20}


## Fonctions de modélisation

In [7]:
def aggregate_articles(articles):
    return articles.groupby(lambda x: True).agg(
        {
            col: "mean"
            if is_numeric_dtype(articles.dtypes[col])
            else lambda x: x.mode()[0]
            for col in articles.columns
        }
    )


def get_user_interest(user_id, clicks, articles, strategy="last_click"):
    user_id = str(user_id)

    if strategy == "last_click":
        last_clicked_article_id = (
            clicks.query("user_id == @user_id")
            .sort_values("click_timestamp", ascending=False)
            .reset_index(drop=True)
            .iloc[0]["click_article_id"]
        )
        interest = articles.query("article_id == @last_clicked_article_id")

    elif strategy == "last_session":
        last_session_id = (
            clicks.query("user_id == @user_id")
            .sort_values("click_timestamp", ascending=False)
            .reset_index(drop=True)
            .iloc[0]["session_id"]
        )
        session_article_ids = clicks.query("session_id == @last_session_id")[
            "click_article_id"
        ]
        interest = aggregate_articles(
            articles.query("article_id in @session_article_ids")
        ).drop(["article_id"], axis=1)

    elif strategy == "all_clicks":
        all_article_ids = clicks.query("user_id == @user_id")["click_article_id"]
        interest = aggregate_articles(
            articles.query("article_id in @all_article_ids")
        ).drop(["article_id"], axis=1)

    else:
        raise NotImplementedError

    return interest


def prepare_for_scale(articles, category_id):
    articles_copy = articles.drop(["article_id", "similarity"], axis=1, errors="ignore")
    articles_copy["category_id"] = articles_copy["category_id"].apply(
        lambda x: category_id if int(x) == category_id else 0
    )
    articles_copy["created_at_ts"] = articles_copy["created_at_ts"].apply(
        lambda x: x.value
    )

    return articles_copy


def get_closest_articles(interest, articles, n=10):
    category_id = interest["category_id"].iloc[0]

    scaler = StandardScaler()
    articles_std = scaler.fit_transform(prepare_for_scale(articles, category_id))
    interest_std = scaler.transform(prepare_for_scale(interest, category_id))

    articles = articles.copy()
    articles["similarity"] = cosine_similarity(interest_std, articles_std)[0]

    return (
        articles.sort_values("similarity", ascending=False).iloc[:n],
        scaler,
        articles_std,
        interest_std,
    )


def get_collaborative_reco(user_id, model, articles, n=10):
    return list(
        pd.DataFrame(
            [
                (
                    lambda p: {
                        "article_id": p.iid,
                        "prediction": p.est,
                    }
                )(model.predict(uid=user_id, iid=article_id))
                for article_id in articles["article_id"]
            ],
            columns=["article_id", "prediction"],
        )
        .sort_values(by="prediction", ascending=False)
        .reset_index(drop=True)["article_id"]
    )[:n]

## Entrainement des modèles sur le dataset complet

In [8]:
model_baseline_only = BaselineOnly(verbose=True).fit(
    Dataset.load_from_df(ratings, reader).build_full_trainset()
)
model_coclustering = CoClustering(**grid2.best_params["rmse"], random_state=42, verbose=True).fit(
    Dataset.load_from_df(ratings, reader).build_full_trainset()
)

Estimating biases using als...
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


## Préparation des données de prédiction

In [9]:
user_id = "5890"

interest = get_user_interest(user_id, clicks, articles, strategy="all_clicks")
category_id = interest["category_id"].iloc[0]

closest_article_ids = get_collaborative_reco(user_id, model_baseline_only, articles)

scaler = StandardScaler()
articles_std = scaler.fit_transform(prepare_for_scale(articles, category_id))
articles_sample_std = scaler.transform(prepare_for_scale(articles_sample, category_id))

interest_std = scaler.transform(prepare_for_scale(interest, category_id))

closest_articles = articles[articles["article_id"].isin(closest_article_ids)]
closest_articles_std = scaler.transform(
    prepare_for_scale(closest_articles, category_id)
)

## Visualisations des données

In [10]:
pca = PCA(n_components=2)
articles_pca = pca.fit_transform(articles_sample_std)
interest_pca = pca.transform(interest_std)
closest_articles_pca = pca.transform(closest_articles_std)


# Plot the data in the PCA space
fig = px.scatter(
    x=articles_pca[:, 0],
    y=articles_pca[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="PCA 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
fig.add_scatter(
    x=interest_pca[:, 0],
    y=interest_pca[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
fig.add_scatter(
    x=closest_articles_pca[:, 0],
    y=closest_articles_pca[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_pca))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()

In [11]:
tsne = TSNE(n_components=2)
articles_tsne = tsne.fit_transform(
    np.concatenate((articles_sample_std, closest_articles_std, interest_std))
)

interest_tsne = articles_tsne[-1:]
articles_tsne = articles_tsne[:-1]

closest_articles_tsne = articles_tsne[-len(closest_articles) :]
articles_tsne = articles_tsne[: -len(closest_articles)]


# Plot the data in the t-SNE space
fig = px.scatter(
    x=articles_tsne[:, 0],
    y=articles_tsne[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="t-SNE 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
fig.add_scatter(
    x=interest_tsne[:, 0],
    y=interest_tsne[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
fig.add_scatter(
    x=closest_articles_tsne[:, 0],
    y=closest_articles_tsne[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_tsne))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



## Préparationd des datasets d'entrainement et de test

In [12]:
users_last_click = (
    clicks.reset_index()
    .rename(columns={"index": "click_id"})
    .sort_values(by="click_timestamp")
    .groupby(["user_id"])
    .last()
)

X = get_ratings_from_clicks(clicks.drop(list(users_last_click["click_id"])))
y_true = dict(users_last_click["click_article_id"])

test_sample = random.sample(list(y_true.keys()), k=100)

## Calcul des prédiction du prochain article

In [13]:
model_baseline_only = model_baseline_only.fit(
    Dataset.load_from_df(X, reader).build_full_trainset()
)
y_pred_baseline_only = dict(
    {
        user_id: list(
            pd.DataFrame(
                [
                    (
                        lambda p: {
                            "article_id": p.iid,
                            "prediction": p.est,
                        }
                    )(model_baseline_only.predict(uid=user_id, iid=article_id))
                    for article_id in articles["article_id"]
                ],
                columns=["article_id", "prediction"],
            )
            .sort_values(by="prediction", ascending=False)
            .reset_index(drop=True)["article_id"]
        )
        for user_id in tqdm(test_sample)
    }
)

Estimating biases using als...


100%|██████████| 100/100 [03:55<00:00,  2.36s/it]


In [15]:
model_coclustering = model_coclustering.fit(
    Dataset.load_from_df(X, reader).build_full_trainset()
)
y_pred_coclustering = dict(
    {
        user_id: list(
            pd.DataFrame(
                [
                    (
                        lambda p: {
                            "article_id": p.iid,
                            "prediction": p.est,
                        }
                    )(model_coclustering.predict(uid=user_id, iid=article_id))
                    for article_id in articles["article_id"]
                ],
                columns=["article_id", "prediction"],
            )
            .sort_values(by="prediction", ascending=False)
            .reset_index(drop=True)["article_id"]
        )
        for user_id in tqdm(test_sample)
    }
)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


100%|██████████| 100/100 [03:49<00:00,  2.29s/it]


## Fonctions d'évaluation

In [16]:
def score_reco(y_true, y_pred):
    score = 0
    count = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.info(f"User {user_id} not found in true values")
            continue

        true_article_id = str(y_true[user_id])

        if true_article_id not in pred_article_ids:
            logging.info(
                f"Article {true_article_id} not found in predictions for user {user_id}"
            )
            continue

        rank = pred_article_ids.index(true_article_id) + 1
        score += 1 / rank
        count += 1

    # In range [0 , 1], higher is better
    return score / count


def mean_rank(y_true, y_pred):
    sum = 0
    count = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.info(f"User {user_id} not found in true values")
            continue

        true_article_id = str(y_true[user_id])

        if true_article_id not in pred_article_ids:
            logging.info(
                f"Article {true_article_id} not found in predictions for user {user_id}"
            )
            continue

        rank = pred_article_ids.index(true_article_id) + 1
        sum += rank
        count += 1

    # In range [1 , +Inf[, lower is better
    return sum / count


def mean_average_precision(y_true, y_pred, articles, k=10):
    average_precision = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.warning(f"User {user_id} not found in true values")
            continue

        true_category_id = articles.iloc[int(y_true[user_id])].category_id
        pred_categories = articles.iloc[
            [int(id) for id in pred_article_ids[:k]]
        ].category_id

        average_precision = (
            len(pred_categories[pred_categories == true_category_id]) / k
        )

    return average_precision / len(y_pred)

## Evaluations des modèles

In [17]:
print(f"Score : {score_reco(y_true, y_pred_baseline_only)}")
print(f"Mean Rank : {mean_rank(y_true, y_pred_baseline_only)}")
print(
    f"Mean Average Precision : {mean_average_precision(y_true, y_pred_baseline_only, articles, k=1000)}"
)

Score : 0.0005816101420071275
Mean Rank : 7164.78
Mean Average Precision : 0.00025


In [18]:
print(f"Score : {score_reco(y_true, y_pred_coclustering)}")
print(f"Mean Rank : {mean_rank(y_true, y_pred_coclustering)}")
print(
    f"Mean Average Precision : {mean_average_precision(y_true, y_pred_coclustering, articles, k=1000)}"
)

Score : 0.00010563481922186175
Mean Rank : 231184.76
Mean Average Precision : 0.00048
