# PROJET 09

## Content-Based Recommender Systems

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')
# Root folder of the project; the data, figures and results paths used in this
# notebook are built from it.
ROOT_DIR = "/content/drive/MyDrive/OpenClassrooms/PROJET_09/"
# Alternative root for a local (non-Colab) run:
#ROOT_DIR = "C:/Users/stela/Downloads/PROJET_09/"


from datetime import datetime
import random
import logging
import glob
import sys
import os
import pickle

# Configure the root logger to emit records of level INFO and above.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module.
logger = logging.getLogger(__name__)

# Math and ML
from math import floor
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Use plotly as the pandas plotting backend (affects DataFrame/Series .plot calls).
pd.options.plotting.backend = "plotly"

# Visualisation modules
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Helper functions
# Make the project root and its ``helpers`` folder importable.
# Both entries are membership-checked so that re-running this cell does not
# keep appending duplicate paths to sys.path.
src_path = os.path.abspath(os.path.join(ROOT_DIR, "helpers/"))
for _import_path in (ROOT_DIR, src_path):
    if _import_path not in sys.path:
        sys.path.append(_import_path)
import helpers.data as data_utils

# Output folders for processed data, figures and results.
# os.makedirs(..., exist_ok=True) replaces the exists()/mkdir() pairs: it is
# safe to re-run and also creates missing parent folders (e.g. "data/").
for _output_dir in ("data/processed/", "figures/", "results/"):
    os.makedirs(ROOT_DIR + _output_dir, exist_ok=True)

Mounted at /content/drive/


In [None]:
# Raw file paths
filepath_data = os.path.join(ROOT_DIR, "data", "raw")
clicks_folder = os.path.join(filepath_data, "clicks")
clicks_sample_file = os.path.join(filepath_data, "clicks_sample.csv")
articles_metadata_file = os.path.join(filepath_data, "articles_metadata.csv")
embedding_article_file = os.path.join(filepath_data, "articles_embeddings.pickle")

# Article metadata. `created_at_ts` is parsed as epoch milliseconds; note that
# datetime.fromtimestamp() converts using the machine's local timezone.
articles_metadata_df = pd.read_csv(
    articles_metadata_file,
    parse_dates=["created_at_ts"],
    date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
    dtype={
        "article_id": "category",
        "category_id": "category",
        "publisher_id": "category",
        "words_count": "int",
    },
)

# Article embeddings. The pickle is deserialised once and reused (it used to be
# read twice, once only to find the number of columns).
# NOTE(security): unpickling can execute arbitrary code - only load trusted files.
embeddings_raw = pd.read_pickle(embedding_article_file)
length_embedding = pd.DataFrame(embeddings_raw).shape[1]
articles_emb = pd.DataFrame(
    embeddings_raw,
    columns=["embedding_dim_" + str(i) for i in range(length_embedding)],
)

# Dimensionality reduction from 250 to 72 dimensions: keeps about 0.98 of the
# variance (checked in the next cell via explained_variance_ratio_).
# random_state is fixed so that a randomized SVD solver, if scikit-learn picks
# one for data of this size, gives reproducible components.
pca = PCA(n_components=72, random_state=42)
pca.fit(articles_emb)
articles_emb_trans_pca = pca.transform(articles_emb)

# One row per article: metadata columns followed by the reduced embedding.
reduced_embedding_columns = [
    "embedding_dim_" + str(i) for i in range(articles_emb_trans_pca.shape[1])
]
articles = pd.concat(
    [
        articles_metadata_df,
        pd.DataFrame(articles_emb_trans_pca, columns=reduced_embedding_columns),
    ],
    axis=1,
)

articles = articles.astype({"created_at_ts": "datetime64[ns]"})

# NOTE(review): later plotting cells reference `articles_sample`, which is only
# ever defined by this commented-out line.
#articles_sample = articles.sample(frac=0.01, random_state=42)

articles.describe(include="all", datetime_is_numeric=True)

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count,embedding_dim_0,embedding_dim_1,embedding_dim_2,embedding_dim_3,embedding_dim_4,...,embedding_dim_62,embedding_dim_63,embedding_dim_64,embedding_dim_65,embedding_dim_66,embedding_dim_67,embedding_dim_68,embedding_dim_69,embedding_dim_70,embedding_dim_71
count,364047.0,364047.0,364047,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,...,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0
unique,364047.0,461.0,,1.0,,,,,,,...,,,,,,,,,,
top,0.0,281.0,,0.0,,,,,,,...,,,,,,,,,,
freq,1.0,12817.0,,364047.0,,,,,,,...,,,,,,,,,,
mean,,,2016-09-16 23:57:17.328421888,,190.897727,1e-06,-3.500685e-07,-1e-06,5.438804e-07,-2e-06,...,-2.403787e-08,4e-06,-9.631915e-07,-5.936747e-07,-3e-06,4e-06,4.211134e-07,-1e-06,2e-06,-6e-06
min,,,2006-09-27 11:14:35,,0.0,-6.469483,-5.40281,-5.003805,-4.324072,-6.045849,...,-1.075757,-1.161286,-1.092787,-1.053129,-1.02552,-0.987756,-0.915834,-0.906208,-0.874496,-1.00351
25%,,,2015-10-15 16:00:43.500000,,159.0,-1.683558,-1.380365,-1.303964,-1.093958,-1.019818,...,-0.1640515,-0.154173,-0.1543107,-0.1498754,-0.146489,-0.132207,-0.1288548,-0.123146,-0.122003,-0.118595
50%,,,2017-03-13 16:27:29,,186.0,-0.21792,-0.3074752,-0.255183,-0.2616372,-0.039663,...,-0.003544226,0.000132,0.0004474837,0.001350403,-0.001755,0.002033,0.0004867996,-0.001647,-0.000293,-0.000882
75%,,,2017-11-05 14:09:11,,218.0,1.665221,1.037089,1.203603,0.7663347,0.956362,...,0.1609942,0.153712,0.1545635,0.1505098,0.143408,0.133328,0.1284465,0.121247,0.121417,0.116661
max,,,2018-03-13 12:12:30,,6690.0,7.028433,7.105543,5.750128,7.331015,6.446866,...,1.33562,1.293567,1.332868,1.116385,1.265635,1.044319,0.9328506,0.945302,1.037014,1.006768


In [None]:
# Total share of variance explained by the components of the fitted PCA.
sum(pca.explained_variance_ratio_)

0.9785847861726567

In [None]:
# Display the merged article table (metadata columns + reduced embedding columns).
articles

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count,embedding_dim_0,embedding_dim_1,embedding_dim_2,embedding_dim_3,embedding_dim_4,...,embedding_dim_62,embedding_dim_63,embedding_dim_64,embedding_dim_65,embedding_dim_66,embedding_dim_67,embedding_dim_68,embedding_dim_69,embedding_dim_70,embedding_dim_71
0,0,0,2017-12-13 05:53:39,0,168,-2.176781,-1.316916,-1.029052,0.901907,-1.809556,...,0.048989,0.058749,-0.320057,0.528145,0.111961,-0.421792,0.010406,0.062978,-0.297891,-0.154024
1,1,1,2014-07-14 12:45:36,0,189,-1.735177,0.489897,3.268564,0.087859,1.473054,...,-0.035589,0.419914,0.300691,-0.111774,-0.144771,0.199440,-0.181279,0.087767,-0.048058,0.084292
2,2,1,2014-08-22 00:35:06,0,250,-0.912690,-2.089337,1.865876,-1.202523,2.530590,...,0.192697,0.220001,0.142810,-0.135687,0.144284,-0.139567,0.062354,0.275921,-0.069368,0.225005
3,3,1,2014-08-19 17:11:53,0,230,1.096567,0.212962,4.183521,-0.649564,-0.130863,...,-0.371339,-0.105632,0.367136,0.295392,0.352472,-0.041173,-0.063291,-0.296480,0.025806,0.311312
4,4,1,2014-08-03 13:06:11,0,162,0.193783,-0.263947,1.896588,-1.834347,1.270358,...,0.223054,-0.081841,0.406155,-0.120125,0.378657,0.108500,0.147135,-0.127671,0.029986,-0.069074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364042,364042,460,2015-06-11 14:48:38,0,144,1.248788,5.252166,-0.650825,0.839686,-0.265040,...,0.333667,0.150682,0.122387,-0.058705,-0.021133,-0.036125,-0.033009,-0.024431,0.227345,0.006480
364043,364043,460,2015-06-12 22:34:32,0,463,0.575187,2.392510,-2.702566,5.805795,2.654181,...,0.062002,-0.290488,0.152976,-0.456322,-0.133840,0.073555,0.351299,0.884408,0.886902,0.348739
364044,364044,460,2016-03-14 16:51:19,0,177,1.564129,4.053350,2.134530,1.152976,0.980015,...,-0.112697,0.208594,0.007686,0.212485,-0.333949,0.072767,-0.048202,-0.142374,-0.120078,0.043033
364045,364045,460,2018-01-14 21:18:57,0,126,4.866370,0.151831,-0.424443,-0.442647,-1.083554,...,-0.161944,-0.021396,-0.139878,-0.014094,0.206057,-0.069097,0.348434,0.216037,0.097441,0.232524


In [None]:
# Replacement labels for the coded click columns, keyed by column name and
# then by raw code ("code" -> "code - label").
CLICK_VALUE_LABELS = {
    "click_environment": {
        "1": "1 - Facebook Instant Article",
        "2": "2 - Mobile App",
        "3": "3 - AMP (Accelerated Mobile Pages)",
        "4": "4 - Web",
    },
    "click_deviceGroup": {
        "1": "1 - Tablet",
        "2": "2 - TV",
        "3": "3 - Empty",
        "4": "4 - Mobile",
        "5": "5 - Desktop",
    },
    "click_os": {
        "1": "1 - Other",
        "2": "2 - iOS",
        "3": "3 - Android",
        "4": "4 - Windows Phone",
        "5": "5 - Windows Mobile",
        "6": "6 - Windows",
        "7": "7 - Mac OS X",
        "8": "8 - Mac OS",
        "9": "9 - Samsung",
        "10": "10 - FireHbbTV",
        "11": "11 - ATV OS X",
        "12": "12 - tvOS",
        "13": "13 - Chrome OS",
        "14": "14 - Debian",
        "15": "15 - Symbian OS",
        "16": "16 - BlackBerry OS",
        "17": "17 - Firefox OS",
        "18": "18 - Android",
        "19": "19 - Brew MP",
        "20": "20 - Chromecast",
        "21": "21 - webOS",
        "22": "22 - Gentoo",
        "23": "23 - Solaris",
    },
}


def _read_click_file(click_file_path):
    """Read one hourly click CSV and relabel its coded columns.

    Timestamps are parsed as epoch milliseconds (truncated to whole seconds)
    and every identifier-like column is loaded as a categorical.
    """
    hourly_clicks = pd.read_csv(
        click_file_path,
        parse_dates=["session_start", "click_timestamp"],
        date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
        dtype={
            "user_id": "category",
            "session_id": "category",
            "session_size": "int",
            "click_article_id": "category",
            "click_environment": "category",
            "click_deviceGroup": "category",
            "click_os": "category",
            "click_country": "category",
            "click_region": "category",
            "click_referrer_type": "category",
        },
    )
    return hourly_clicks.replace(CLICK_VALUE_LABELS)


# Load every hourly file (in sorted filename order) and stack them into one frame.
click_file_paths = sorted(glob.glob(clicks_folder+"/clicks_hour_*.csv"))
hourly_click_frames = [_read_click_file(path) for path in tqdm(click_file_paths)]
clicks = pd.concat(
    hourly_click_frames,
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

timestamp_dtypes = {"session_start": "datetime64[ns]", "click_timestamp": "datetime64[ns]"}
clicks = clicks.astype(timestamp_dtypes)

clicks.describe(include="all", datetime_is_numeric=True)

100%|██████████| 385/385 [01:16<00:00,  5.05it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,,,46033.0,,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,,,160974.0,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,,,37213.0,,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,2017-10-08 14:17:08.013157120,3.901885,,2017-10-08 14:51:05.106516736,,,,,,
min,,,2017-10-01 02:37:03,2.0,,2017-10-01 03:00:00,,,,,,
25%,,,2017-10-04 13:35:52,2.0,,2017-10-04 14:20:52,,,,,,
50%,,,2017-10-08 20:09:00,3.0,,2017-10-08 20:35:30,,,,,,
75%,,,2017-10-11 19:16:54,4.0,,2017-10-11 19:43:24,,,,,,
max,,,2017-10-17 03:36:19,124.0,,2017-11-13 20:04:14,,,,,,


In [None]:
# Narrow the click log to the user, session and clicked-article columns.
# NOTE(review): this rebinds `clicks` without `click_timestamp`, which the
# hold-out split further down sorts on.
kept_click_columns = ['user_id', 'session_id', 'session_size', 'click_article_id']
clicks = clicks[kept_click_columns]
clicks.head(3)

Unnamed: 0,user_id,session_id,session_size,click_article_id
0,0,1506825423271737,2,157541
1,0,1506825423271737,2,68866
2,1,1506825426267738,2,235840


In [None]:
# One row per user, holding the list of article ids that user clicked.
clicked_articles_by_user = clicks.groupby('user_id').agg(
    {'click_article_id': lambda x: list(x)}
)
users_df = clicked_articles_by_user.reset_index()
users_df.head(3)

Unnamed: 0,user_id,click_article_id
0,0,"[157541, 68866, 96755, 313996, 160158, 233470,..."
1,1,"[235840, 96663, 59758, 160474, 285719, 156723,..."
2,10,"[198420, 198322, 202436, 207731, 206402, 30047..."


In [None]:
# Save the files destined for Azure Blob storage.
data_azure = 'data_azure'
azure_dir = ROOT_DIR+'data/' + data_azure

# exist_ok=True makes the cell re-runnable and removes the listdir() check,
# which itself failed when the "data/" folder did not exist yet.
os.makedirs(azure_dir, exist_ok=True)

# NOTE: despite its file name, the first pickle holds the full `articles`
# frame (metadata + reduced embeddings), not only the embeddings.
articles.to_pickle(azure_dir + '/articles_embeddings.pickle')
clicks.to_pickle(azure_dir + '/all_clicks.pickle')
users_df.to_pickle(azure_dir + '/users.pickle')

## Content-Based Recommender model



In [None]:
def aggregate_articles(articles):
    """Collapse a set of articles into a single-row frame.

    Numeric columns are averaged; every other column takes the first value
    returned by ``Series.mode()`` (i.e. a most frequent value).

    Parameters
    ----------
    articles : pandas.DataFrame
        Rows to aggregate.

    Returns
    -------
    pandas.DataFrame
        One row (indexed by ``True``) with the same columns as ``articles``.
    """

    def _most_frequent(values):
        return values.mode()[0]

    how_to_aggregate = {}
    for col in articles.columns:
        if is_numeric_dtype(articles.dtypes[col]):
            how_to_aggregate[col] = "mean"
        else:
            how_to_aggregate[col] = _most_frequent

    # A constant grouping key puts every row into the same single group.
    single_group = articles.groupby(lambda _label: True)
    return single_group.agg(how_to_aggregate)

def get_user_interest(user_id, clicks):
    """Return the ids of the articles clicked by one user.

    Parameters
    ----------
    user_id : any
        User identifier; it is converted to ``str`` before being compared
        with the ``user_id`` column.
    clicks : pandas.DataFrame
        Click log with ``user_id`` and ``click_article_id`` columns.

    Returns
    -------
    list
        The ``click_article_id`` values of the user's rows, in frame order
        (empty when the user has no row).
    """
    wanted_user = str(user_id)
    belongs_to_user = clicks["user_id"] == wanted_user
    clicked_ids = clicks.loc[belongs_to_user, "click_article_id"]
    return list(clicked_ids)

def prepare_for_scale(articles):
    """Return a copy of ``articles`` without the non-feature columns.

    ``article_id``, ``similarity`` and ``created_at_ts`` are removed when
    present; missing ones are ignored. The input frame is left untouched.
    """
    non_feature_columns = ["article_id", "similarity","created_at_ts"]
    return articles.drop(non_feature_columns, axis=1, errors="ignore")

def get_closest_articles(user_interests_ids, articles, n=5):
    """Rank the articles a user has not clicked by similarity to their profile.

    The user profile is the aggregation (see ``aggregate_articles``) of the
    articles whose ``article_id`` is in ``user_interests_ids``. The remaining
    articles are standardised with a freshly fitted ``StandardScaler`` and
    scored by cosine similarity against the standardised profile.

    Parameters
    ----------
    user_interests_ids : list
        Ids of the articles the user already clicked.
    articles : pandas.DataFrame
        Article table containing an ``article_id`` column plus feature columns.
    n : int, default 5
        Number of top-ranked articles to return.

    Returns
    -------
    tuple
        ``(top_articles, scaler, articles_std, interest_std)``: the ``n`` most
        similar candidate rows (with a ``similarity`` column), the fitted
        scaler, the standardised candidate features and the standardised
        profile.
    """
    clicked_articles = articles.query("article_id in @user_interests_ids")
    user_profile = aggregate_articles(clicked_articles).drop(["article_id"], axis=1)

    # Candidates are the articles the user has not clicked yet.
    is_clicked = articles["article_id"].isin(user_interests_ids)
    candidates = articles[~is_clicked]

    # The scaler is fitted on the candidates only, then applied to the profile.
    scaler = StandardScaler()
    articles_std = scaler.fit_transform(prepare_for_scale(candidates))
    interest_std = scaler.transform(prepare_for_scale(user_profile))

    # Work on a copy so the caller's frame does not gain a "similarity" column.
    scored = candidates.copy()
    scored["similarity"] = cosine_similarity(interest_std, articles_std)[0]
    top_articles = scored.sort_values("similarity", ascending=False).iloc[:n]

    return top_articles, scaler, articles_std, interest_std


In [None]:
# Compute the top recommendations for every user and persist them.
#
# Changes compared with the previous version of this cell:
# * per-user frames are collected in a list and concatenated once, instead of
#   growing `ca` with pd.concat inside the loop (quadratic copying);
# * the intermediate save now happens periodically. The old test
#   `ca.shape[0] == 10000` could only be true once, i.e. after 2000 users with
#   the default of 5 recommendations each - presumably a periodic checkpoint
#   was intended (TODO confirm).
#
# NOTE(review): each iteration rescans `clicks` and refits the scaler on all
# articles; the recorded progress output shows about 6 s per user.
data_azure = "data_azure"
predictions_pickle_path = ROOT_DIR+'data/' + data_azure +"/prediction_content_based.pickle"
CHECKPOINT_EVERY_N_USERS = 2000

per_user_recommendations = []
user_id_list = clicks.user_id.unique().tolist()
for n_users_done, uid in enumerate(tqdm(user_id_list), start=1):
    user_id = str(uid)
    user_interests_ids = get_user_interest(user_id, clicks)
    closest_articles, scaler, articles_std, interest_std = get_closest_articles(
        user_interests_ids, articles)
    closest_articles["user_id"] = user_id
    per_user_recommendations.append(
        closest_articles[["user_id","article_id","category_id"]]
    )
    if n_users_done % CHECKPOINT_EVERY_N_USERS == 0:
        # Intermediate save so a crash does not lose all the work done so far.
        pd.concat(per_user_recommendations, axis=0).to_pickle(predictions_pickle_path)

# Single concatenation at the end; an empty frame is kept when there is no user.
if per_user_recommendations:
    ca = pd.concat(per_user_recommendations, axis=0)
else:
    ca = pd.DataFrame()
ca.to_pickle(predictions_pickle_path)

ca

  4%|▍         | 14227/322897 [23:51:27<520:34:13,  6.07s/it]

In [None]:
# Write the predictions as a gzip-compressed parquet file using pyarrow.
# NOTE: the file carries a ".gzip" extension although its content is parquet.
data_azure = "data_azure"
parquet_path = ROOT_DIR+'data/' + data_azure +"/prediction_content_based.gzip"
ca.to_parquet(parquet_path, compression="gzip", engine="pyarrow")

In [None]:
# Reload the predictions from the parquet file (stored with a ".gzip" extension).
data_azure = "data_azure"
parquet_path = ROOT_DIR+'data/' + data_azure +"/prediction_content_based.gzip"
ca = pd.read_parquet(parquet_path)

In [None]:
# Also export the predictions as CSV, without writing the index.
# Relies on `data_azure` having been set by an earlier cell.
ca.to_csv(ROOT_DIR+'data/' + data_azure +"/prediction_content_based.csv", index=False)

In [None]:
# 2-D PCA view of one user's profile and recommendations among the articles.
#
# This cell uses `user_id`, `user_interests_ids`, `closest_articles`, `scaler`
# and `interest_std` as left by the recommendation loop, i.e. the values of
# the LAST user processed there.

# `articles_sample` and `interest` were referenced below without being defined
# anywhere (NameError on a fresh kernel). They are built here:
# * a reproducible 1% sample of the articles keeps the scatter plot readable;
#   the index is reset so the colour/symbol columns line up with the array rows;
# * the user's aggregated profile provides the category shown in the label.
articles_sample = articles.sample(frac=0.01, random_state=42).reset_index(drop=True)
interest = aggregate_articles(
    articles[articles["article_id"].isin(user_interests_ids)]
)

# NOTE(review): `pca` rebinds the name of the 72-component PCA fitted when the
# embeddings were loaded; re-running the explained-variance cell after this
# one reports the variance of this 2-D projection instead.
# random_state keeps the projection reproducible if a randomized solver is used.
pca = PCA(n_components=2, random_state=42)
articles_sample_std = scaler.transform(prepare_for_scale(articles_sample))
closest_articles_std = scaler.transform(
    prepare_for_scale(closest_articles)
)
articles_pca = pca.fit_transform(articles_sample_std)
interest_pca = pca.transform(interest_std)
closest_articles_pca = pca.transform(closest_articles_std)


# Plot the data in the PCA space
fig = px.scatter(
    x=articles_pca[:, 0],
    y=articles_pca[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="PCA 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
# User profile (single green marker).
fig.add_scatter(
    x=interest_pca[:, 0],
    y=interest_pca[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
# Recommended articles, coloured by rank.
fig.add_scatter(
    x=closest_articles_pca[:, 0],
    y=closest_articles_pca[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_pca))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()


In [None]:
# 2-D t-SNE view of the same points as the PCA plot.
#
# Relies on `articles_sample`, `articles_sample_std`, `closest_articles_std`,
# `interest_std`, `interest`, `closest_articles` and `user_id` being defined
# by earlier cells.
#
# * random_state makes the (stochastic) embedding reproducible;
# * init and learning_rate are set explicitly to the values announced as the
#   future defaults by the FutureWarnings this cell used to emit.
tsne = TSNE(n_components=2, init="pca", learning_rate="auto", random_state=42)

# All three groups are embedded together so they share one t-SNE space.
embedded_points = tsne.fit_transform(
    np.concatenate((articles_sample_std, closest_articles_std, interest_std))
)

# Split the embedding back into its three groups by position. Explicit
# offsets are used instead of negative slices, which returned the whole array
# when `closest_articles` was empty (`x[-0:]`).
n_articles = len(articles_sample_std)
n_closest = len(closest_articles_std)
articles_tsne = embedded_points[:n_articles]
closest_articles_tsne = embedded_points[n_articles : n_articles + n_closest]
interest_tsne = embedded_points[n_articles + n_closest :]


# Plot the data in the t-SNE space
fig = px.scatter(
    x=articles_tsne[:, 0],
    y=articles_tsne[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="t-SNE 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
# User profile (single green marker).
fig.add_scatter(
    x=interest_tsne[:, 0],
    y=interest_tsne[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
# Recommended articles, coloured by rank.
fig.add_scatter(
    x=closest_articles_tsne[:, 0],
    y=closest_articles_tsne[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_tsne))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [None]:
# Hold-out split: each user's last click becomes the ground truth (`y_true`),
# every other click stays in the training log (`X`).
clicks_with_id = clicks.reset_index().rename(columns={"index": "click_id"})

if "click_timestamp" in clicks_with_id.columns:
    clicks_with_id = clicks_with_id.sort_values(by="click_timestamp")
else:
    # `clicks` is narrowed to four columns earlier in the notebook, which
    # drops `click_timestamp`; sorting on it then raised a KeyError.
    # Fall back to row order - presumably chronological because the hourly
    # files are concatenated in sorted filename order (TODO confirm).
    logger.warning(
        "click_timestamp is missing from clicks; using row order to find each user's last click"
    )

users_last_click = clicks_with_id.groupby(["user_id"]).last()

X = clicks.drop(list(users_last_click["click_id"]))
y_true = dict(users_last_click["click_article_id"])

# A dedicated seeded generator makes the evaluation sample reproducible
# without touching the global random state; k is capped so that fewer than
# 100 users does not raise.
test_sample = random.Random(42).sample(list(y_true.keys()), k=min(100, len(y_true)))


In [None]:


# Top-1000 recommended article ids for every user of the test sample, computed
# from the training clicks `X` (each user's held-out last click is excluded).
#
# `get_user_interest` as defined in this notebook takes (user_id, clicks); the
# previous call also passed `articles` and `strategy="last_click"`, which
# raises a TypeError against that definition.
# NOTE(review): the variable name suggests a "last click only" profile was
# once intended; with the current function the profile is built from all of
# the user's remaining clicks - confirm this is the desired strategy.
y_pred_last_click = {
    user_id: list(
        get_closest_articles(
            get_user_interest(user_id, X),
            articles,
            n=1000,
        )[0]["article_id"]
    )
    for user_id in tqdm(test_sample)
}



     



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:24<41:07, 24.93s/it][A
  2%|▏         | 2/100 [00:43<35:05, 21.48s/it][A
  3%|▎         | 3/100 [01:02<32:52, 20.34s/it][A
  4%|▍         | 4/100 [01:23<32:33, 20.35s/it][A
  5%|▌         | 5/100 [01:42<31:22, 19.82s/it][A
  6%|▌         | 6/100 [02:01<30:49, 19.68s/it][A
  7%|▋         | 7/100 [02:21<30:33, 19.71s/it][A
  8%|▊         | 8/100 [02:41<30:24, 19.83s/it][A
  9%|▉         | 9/100 [03:00<29:34, 19.50s/it][A
 10%|█         | 10/100 [03:18<28:51, 19.24s/it][A
 11%|█         | 11/100 [03:37<28:25, 19.16s/it][A
 12%|█▏        | 12/100 [03:56<27:55, 19.04s/it][A
 13%|█▎        | 13/100 [04:15<27:34, 19.01s/it][A
 14%|█▍        | 14/100 [04:34<27:08, 18.93s/it][A
 15%|█▌        | 15/100 [04:53<26:48, 18.92s/it][A
 16%|█▌        | 16/100 [05:12<26:47, 19.13s/it][A
 17%|█▋        | 17/100 [05:32<26:42, 19.30s/it][A
 18%|█▊        | 18/100 [05:51<26:06, 19.10s/it][A
 19%|█▉        | 19/100 [06:1