# Import des librairies

In [1]:
import logging
import random
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

!pip install plotly
import plotly.express as px
import plotly.io as pio
from pandas.api.types import is_numeric_dtype
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

pio.renderers.default = "iframe"

pd.options.plotting.backend = "plotly"

[0m

# Chargement des fichiers

In [2]:
DATA_PATH = "/storage/P9"

CACHE = dict()

NUM_EMBEDDINGS = 250

articles = pd.concat(
    [
        pd.read_csv(
            Path(DATA_PATH, "articles_metadata.csv"),
            parse_dates=["created_at_ts"],
            date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
            dtype={
                "article_id": "category",
                "category_id": "category",
                "publisher_id": "category",
                "words_count": "int",
            },
        ),
        pd.DataFrame(
            pd.read_pickle(Path(DATA_PATH, "articles_embeddings.pickle")),
            columns=["embedding_" + str(i) for i in range(NUM_EMBEDDINGS)],
        ),
    ],
    axis=1,
)

articles = articles.astype({"created_at_ts": "datetime64[ns]"})

articles_sample = articles.sample(frac=0.01, random_state=42)

articles.describe(include="all", datetime_is_numeric=True)

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
count,364047.0,364047.0,364047,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,...,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0
unique,364047.0,461.0,,1.0,,,,,,,...,,,,,,,,,,
top,0.0,281.0,,0.0,,,,,,,...,,,,,,,,,,
freq,1.0,12817.0,,364047.0,,,,,,,...,,,,,,,,,,
mean,,,2016-09-16 23:57:17.328421888,,190.897727,-0.238645,-0.963335,0.118548,-0.279295,-0.068579,...,-0.133287,-0.081912,-0.060347,0.023003,0.076947,0.084603,0.062819,0.099768,0.155917,-0.041094
min,,,2006-09-27 11:14:35,,0.0,-0.991183,-0.996455,-0.968431,-0.994966,-0.994489,...,-0.990412,-0.989408,-0.990432,-0.993626,-0.989042,-0.996902,-0.992921,-0.984733,-0.976071,-0.988213
25%,,,2015-10-15 16:00:43.500000,,159.0,-0.620072,-0.974056,-0.289953,-0.718816,-0.503425,...,-0.547684,-0.445079,-0.479989,-0.404508,-0.248653,-0.267072,-0.306548,-0.313598,-0.201402,-0.420694
50%,,,2017-03-13 16:27:29,,186.0,-0.302581,-0.967605,0.124339,-0.391535,-0.093734,...,-0.175781,-0.094113,-0.078034,0.000726,0.105649,0.133525,0.083315,0.128757,0.188355,-0.015232
75%,,,2017-11-05 14:09:11,,218.0,0.098015,-0.959061,0.545112,0.10832,0.345024,...,0.250641,0.270006,0.341105,0.459386,0.417347,0.461466,0.441831,0.531453,0.538111,0.334226
max,,,2018-03-13 12:12:30,,6690.0,0.983694,-0.514728,0.998341,0.978092,0.996798,...,0.996401,0.981789,0.991332,0.995299,0.978823,0.989324,0.991445,0.997583,0.990507,0.968462


In [3]:
clicks = pd.concat(
    [
        pd.read_csv(
            click_file_path,
            parse_dates=["session_start", "click_timestamp"],
            date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
            dtype={
                "user_id": "category",
                "session_id": "category",
                "session_size": "int",
                "click_article_id": "category",
                "click_environment": "category",
                "click_deviceGroup": "category",
                "click_os": "category",
                "click_country": "category",
                "click_region": "category",
                "click_referrer_type": "category",
            },
        ).replace(
            {
                "click_environment": {
                    "1": "1 - Facebook Instant Article",
                    "2": "2 - Mobile App",
                    "3": "3 - AMP (Accelerated Mobile Pages)",
                    "4": "4 - Web",
                },
                "click_deviceGroup": {
                    "1": "1 - Tablet",
                    "2": "2 - TV",
                    "3": "3 - Empty",
                    "4": "4 - Mobile",
                    "5": "5 - Desktop",
                },
                "click_os": {
                    "1": "1 - Other",
                    "2": "2 - iOS",
                    "3": "3 - Android",
                    "4": "4 - Windows Phone",
                    "5": "5 - Windows Mobile",
                    "6": "6 - Windows",
                    "7": "7 - Mac OS X",
                    "8": "8 - Mac OS",
                    "9": "9 - Samsung",
                    "10": "10 - FireHbbTV",
                    "11": "11 - ATV OS X",
                    "12": "12 - tvOS",
                    "13": "13 - Chrome OS",
                    "14": "14 - Debian",
                    "15": "15 - Symbian OS",
                    "16": "16 - BlackBerry OS",
                    "17": "17 - Firefox OS",
                    "18": "18 - Android",
                    "19": "19 - Brew MP",
                    "20": "20 - Chromecast",
                    "21": "21 - webOS",
                    "22": "22 - Gentoo",
                    "23": "23 - Solaris",
                },
            }
        )
        for click_file_path in tqdm(
            sorted(Path(DATA_PATH, "clicks").glob("clicks_hour_*.csv"))
        )
    ],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

clicks = clicks.astype(
    {"session_start": "datetime64[ns]", "click_timestamp": "datetime64[ns]"}
)

clicks.describe(include="all", datetime_is_numeric=True)

100%|██████████| 385/385 [00:36<00:00, 10.43it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,,,46033.0,,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,,,160974.0,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,,,37213.0,,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,2017-10-08 14:17:08.013157120,3.901885,,2017-10-08 14:51:05.106516736,,,,,,
min,,,2017-10-01 02:37:03,2.0,,2017-10-01 03:00:00,,,,,,
25%,,,2017-10-04 13:35:52,2.0,,2017-10-04 14:20:52,,,,,,
50%,,,2017-10-08 20:09:00,3.0,,2017-10-08 20:35:30,,,,,,
75%,,,2017-10-11 19:16:54,4.0,,2017-10-11 19:43:24,,,,,,
max,,,2017-10-17 03:36:19,124.0,,2017-11-13 20:04:14,,,,,,


# Content extraction

In [4]:
def aggregate_articles(articles):
    return articles.groupby(lambda x: True).agg(
        {
            col: "mean"
            if is_numeric_dtype(articles.dtypes[col])
            else lambda x: x.mode()[0]
            for col in articles.columns
        }
    )


def get_user_interest(user_id, clicks, articles, strategy="last_click"):
    user_id = str(user_id)

    if strategy == "last_click":
        last_clicked_article_id = (
            clicks.query("user_id == @user_id")
            .sort_values("click_timestamp", ascending=False)
            .reset_index(drop=True)
            .iloc[0]["click_article_id"]
        )
        interest = articles.query("article_id == @last_clicked_article_id")

    elif strategy == "last_session":
        last_session_id = (
            clicks.query("user_id == @user_id")
            .sort_values("click_timestamp", ascending=False)
            .reset_index(drop=True)
            .iloc[0]["session_id"]
        )
        session_article_ids = clicks.query("session_id == @last_session_id")[
            "click_article_id"
        ]
        interest = aggregate_articles(
            articles.query("article_id in @session_article_ids")
        ).drop(["article_id"], axis=1)

    elif strategy == "all_clicks":
        all_article_ids = clicks.query("user_id == @user_id")["click_article_id"]
        interest = aggregate_articles(
            articles.query("article_id in @all_article_ids")
        ).drop(["article_id"], axis=1)

    else:
        raise NotImplementedError

    return interest


def prepare_for_scale(articles, category_id):
    articles_copy = articles.drop(["article_id", "similarity"], axis=1, errors="ignore")
    articles_copy["category_id"] = articles_copy["category_id"].apply(
        lambda x: category_id if int(x) == category_id else 0
    )
    articles_copy["created_at_ts"] = articles_copy["created_at_ts"].apply(
        lambda x: x.value
    )

    return articles_copy


def get_closest_articles(interest, articles, n=10):
    category_id = interest["category_id"].iloc[0]

    scaler = StandardScaler()
    articles_std = scaler.fit_transform(prepare_for_scale(articles, category_id))
    interest_std = scaler.transform(prepare_for_scale(interest, category_id))

    articles = articles.copy()
    articles["similarity"] = cosine_similarity(interest_std, articles_std)[0]

    return (
        articles.sort_values("similarity", ascending=False).iloc[:n],
        scaler,
        articles_std,
        interest_std,
    )

In [5]:
user_id = "5890"

interest = get_user_interest(user_id, clicks, articles, strategy="all_clicks")
category_id = interest["category_id"].iloc[0]

closest_articles, scaler, articles_std, interest_std = get_closest_articles(
    interest, articles
)

articles_sample_std = scaler.transform(prepare_for_scale(articles_sample, category_id))
closest_articles_std = scaler.transform(
    prepare_for_scale(closest_articles, category_id)
)


closest_articles.head()

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,...,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249,similarity
202250,202250,327,2018-02-14 16:17:02,0,218,-0.539294,-0.972684,-0.348265,0.16128,-0.615844,...,0.347354,0.728034,-0.536209,0.442132,0.794489,-0.768416,0.728018,0.395049,-0.238701,0.868299
208306,208306,331,2018-01-16 16:39:53,0,177,-0.171999,-0.957977,-0.463457,0.465298,-0.858837,...,0.670115,0.800575,-0.120638,0.178502,0.848919,-0.705878,0.865663,-0.333375,0.066213,0.860416
209358,209358,333,2017-08-21 18:03:31,0,217,-0.481571,-0.968075,-0.401126,0.259045,-0.753278,...,0.266409,0.766705,-0.526819,0.37726,0.851084,-0.658042,0.847837,0.394801,0.186557,0.859104
206898,206898,331,2018-02-23 08:03:10,0,200,-0.26401,-0.976174,-0.580707,0.436864,-0.880156,...,0.517933,0.684472,-0.653533,0.1772,0.696423,-0.570849,0.850486,0.022779,-0.088364,0.854569
202819,202819,327,2017-05-28 15:16:55,0,195,-0.322037,-0.971334,-0.134832,0.447763,-0.701718,...,0.634966,0.627765,-0.668423,0.53626,0.900776,-0.825667,0.88949,0.362369,0.615859,0.853787


In [6]:
pca = PCA(n_components=2)
articles_pca = pca.fit_transform(articles_sample_std)
interest_pca = pca.transform(interest_std)
closest_articles_pca = pca.transform(closest_articles_std)


# Plot the data in the PCA space
fig = px.scatter(
    x=articles_pca[:, 0],
    y=articles_pca[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="PCA 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
fig.add_scatter(
    x=interest_pca[:, 0],
    y=interest_pca[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
fig.add_scatter(
    x=closest_articles_pca[:, 0],
    y=closest_articles_pca[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_pca))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()


In [11]:
pca.explained_variance_ratio_

array([0.09171546, 0.06764608])

In [7]:
tsne = TSNE(n_components=2)
articles_tsne = tsne.fit_transform(
    np.concatenate((articles_sample_std, closest_articles_std, interest_std))
)

interest_tsne = articles_tsne[-1:]
articles_tsne = articles_tsne[:-1]

closest_articles_tsne = articles_tsne[-len(closest_articles) :]
articles_tsne = articles_tsne[: -len(closest_articles)]


# Plot the data in the t-SNE space
fig = px.scatter(
    x=articles_tsne[:, 0],
    y=articles_tsne[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="t-SNE 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
fig.add_scatter(
    x=interest_tsne[:, 0],
    y=interest_tsne[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
fig.add_scatter(
    x=closest_articles_tsne[:, 0],
    y=closest_articles_tsne[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_tsne))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



# Modélisation

## Préparation des dataset train / test

In [14]:
users_last_click = (
    clicks.reset_index()
    .rename(columns={"index": "click_id"})
    .sort_values(by="click_timestamp")
    .groupby(["user_id"])
    .last()
)

X = clicks.drop(list(users_last_click["click_id"]))
y_true = dict(users_last_click["click_article_id"])

test_sample = random.sample(list(y_true.keys()), k=100)

## Prédiction

In [15]:
y_pred_last_click = {
    user_id: list(
        get_closest_articles(
            get_user_interest(user_id, X, articles, strategy="last_click"),
            articles,
            n=1000,
        )[0]["article_id"]
    )
    for user_id in tqdm(test_sample)
}

100%|██████████| 100/100 [44:15<00:00, 26.56s/it]


In [16]:
y_pred_last_session = {
    user_id: list(
        get_closest_articles(
            get_user_interest(user_id, X, articles, strategy="last_session"),
            articles,
            n=1000,
        )[0]["article_id"]
    )
    for user_id in tqdm(test_sample)
}

100%|██████████| 100/100 [45:04<00:00, 27.05s/it]


In [17]:
y_pred_all_clicks = {
    user_id: list(
        get_closest_articles(
            get_user_interest(user_id, X, articles, strategy="all_clicks"),
            articles,
            n=1000,
        )[0]["article_id"]
    )
    for user_id in tqdm(test_sample)
}

100%|██████████| 100/100 [45:04<00:00, 27.04s/it]


In [18]:
def score_reco(y_true, y_pred):
    score = 0
    count = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.info(f"User {user_id} not found in true values")
            continue

        true_article_id = str(y_true[user_id])

        if true_article_id not in pred_article_ids:
            logging.info(
                f"Article {true_article_id} not found in predictions for user {user_id}"
            )
            continue

        rank = pred_article_ids.index(true_article_id) + 1
        score += 1 / rank
        count += 1

    # In range [0 , 1], higher is better
    return score / count


def mean_rank(y_true, y_pred):
    sum = 0
    count = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.info(f"User {user_id} not found in true values")
            continue

        true_article_id = str(y_true[user_id])

        if true_article_id not in pred_article_ids:
            logging.info(
                f"Article {true_article_id} not found in predictions for user {user_id}"
            )
            continue

        rank = pred_article_ids.index(true_article_id) + 1
        sum += rank
        count += 1

    # In range [1 , +Inf[, lower is better
    return sum / count


def mean_average_precision(y_true, y_pred, articles, k=10):
    average_precision = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.warning(f"User {user_id} not found in true values")
            continue

        true_category_id = articles.iloc[int(y_true[user_id])].category_id
        pred_categories = articles.iloc[
            [int(id) for id in pred_article_ids[:k]]
        ].category_id

        average_precision = (
            len(pred_categories[pred_categories == true_category_id]) / k
        )

    return average_precision / len(y_pred)

In [19]:
print(f"Score : {score_reco(y_true, y_pred_last_click)}")
print(f"Mean Rank : {mean_rank(y_true, y_pred_last_click)}")
print(
    f"Mean Average Precision : {mean_average_precision(y_true, y_pred_last_click, articles, k=1000)}"
)

Score : 0.0936263302238707
Mean Rank : 308.6666666666667
Mean Average Precision : 0.0


In [20]:
print(f"Score : {score_reco(y_true, y_pred_last_session)}")
print(f"Mean Rank : {mean_rank(y_true, y_pred_last_session)}")
print(
    f"Mean Average Precision : {mean_average_precision(y_true, y_pred_last_session, articles, k=1000)}"
)

Score : 0.07960817358962892
Mean Rank : 382.0
Mean Average Precision : 0.0


In [21]:
print(f"Score : {score_reco(y_true, y_pred_all_clicks)}")
print(f"Mean Rank : {mean_rank(y_true, y_pred_all_clicks)}")
print(
    f"Mean Average Precision : {mean_average_precision(y_true, y_pred_all_clicks, articles, k=1000)}"
)

Score : 0.018066000851006198
Mean Rank : 414.44444444444446
Mean Average Precision : 0.0
