In [None]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import os

from time import time
from random import randint

# Préparation des données 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:


src_path = Path('/content/drive/MyDrive/Colab_Notebooks/fichiers')

# Path to the clicks files
click_path = Path('/content/drive/MyDrive/Colab_Notebooks/clicks')


def load_data(src_path, click_path):
    '''
    Function used to load data from the "News Portal" site of Globo.com

     Parameters:
         src_path: path to the folder containing the metadata and the embedding matrix of the articles
         click_path: path to the folder containing user interaction files (1 file per hour)
    
     Returns:
         articles (dataframe): article metadata (id, category, publication date, number of words)
         embeddings (array): matrix of 250 vectors of the words contained in the articles
         clicks (dataframe): user interactions or clicks with articles
    '''
    # Load articles' metadata
    articles = pd.read_csv(src_path / 'articles_metadata.csv')
    # Drop useless feature
    articles.drop(columns=['publisher_id'], inplace=True)
    # Convert all data types to integer
    articles = articles.astype(np.int64)

    # Load articles' embedding
    embeddings = pd.read_pickle(src_path / 'articles_embeddings.pickle')
    # Change data type from float64 to float32
    embeddings = embeddings.astype(np.float32)

    # Load user interactions with articles
    clicks = pd.DataFrame().append([pd.read_csv(click_path / file) for file in sorted(os.listdir(click_path))],ignore_index=True)
    # Rename columns
    clicks.rename(columns={'click_article_id':'article_id'}, inplace=True)
    # Drop useless feature
    clicks.drop(columns=['click_environment', 'click_deviceGroup', 'click_os', 'click_country', 'click_region', 'click_referrer_type'], inplace=True)
    # Convert all data types to integer
    clicks = clicks.astype(np.int64)

    return articles, embeddings, clicks

In [None]:
# Call the function to load data
articles, embeddings, clicks = load_data(src_path, click_path)

# Display shape
print('Articles Dataframe shape: ', articles.shape)
print('Embedding Matrix shape: ', embeddings.shape)
print('Clicks Dataframe shape: ', clicks.shape)

Articles Dataframe shape:  (364047, 4)
Embedding Matrix shape:  (364047, 250)
Clicks Dataframe shape:  (2988181, 6)


sauvgarder clicks.csv

In [None]:
clicks.to_csv('/content/drive/MyDrive/Colab_Notebooks/clicks.csv', index=False)

# 1) modèle non personnalisé basé sur la popularité de l'article 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Create popularity model
def get_popularity_rec(clicks, n_reco=5):
    # Compute the most popular articles
    df_popularity = clicks.groupby(by=['article_id'])['click_timestamp'].count().sort_values(ascending=False).reset_index()
    df_popularity.rename(columns = {'user_id':'popularity'}, inplace=True)
    return df_popularity.article_id.head(n_reco).to_list()

In [None]:
# Call the function
get_popularity_rec(clicks, n_reco=5)

[160974, 272143, 336221, 234698, 123909]

In [None]:
def get_cb_reco(userID, clicks, embeddings, n_reco=5):
    '''Return 5 recommended articles ID to user'''

    # Print targetted UserID
    print('User ID is : ', userID)

    # Get the list of articles viewed by the user
    var = clicks.loc[clicks.user_id == userID]['article_id'].to_list()
    
    # Select the last element of the list (most recent one)
    value = var[-1]
    print('The last article read by the user is: ', value)
    
    # Delete already viewed articles except the selected one
    emb = embeddings
    for i in range (0, len(var)):
        if i != value:
            emb = np.delete(emb, [i], 0)
    
    # Delete selected article from the new matrix
    temp = np.delete(emb, [value], 0)

    # Get n_reco articles which are the most similar to the selected one
    distances = cosine_similarity([emb[value]], temp)[0]
    
    # Find the indexes, except the selected article
    ranked_ids = np.argsort(distances)[::-1][0:n_reco]
    ranked_similarities = np.sort(distances)[::-1][0:n_reco]
    print('Recommended articles are: ')
    
    return ranked_ids.tolist()# , ranked_similarities.tolist()

In [None]:
# Call the function
start = time()

userID = 0
reco = get_cb_reco(userID, clicks, embeddings, n_reco=5)
print(reco)

print(f'Model execution time : {round(time() - start, 2)}s')

User ID is :  0
The last article read by the user is:  87205
Recommended articles are: 
[102720, 100020, 102412, 102611, 86703]
Model execution time : 1.61s


# 2) Modèle personnalisé

### a) Modèle basé sur la similarité entre les articles

In [None]:
matrix_articles_embeddings= embeddings
ids_article_read = train_df['article_id'].unique()
ids_article_read.sort()

ids_article_read = ids_article_read.tolist()

matrix_articles_read_embeddings = np.zeros(shape=(len(ids_article_read), matrix_articles_embeddings.shape[1]), dtype='float32')
for i, j in enumerate(ids_article_read):    
    matrix_articles_read_embeddings[i] = matrix_articles_embeddings[j]

In [None]:
def get_articles_already_read_for_user(user_id):
    articles_already_read = train_df[train_df['user_id']==user_id]['article_id'].unique().tolist()
    return articles_already_read

def get_articles_cosine_similarities_matrix(matrix_articles_embeddings):
    cosine_similarities_matrix = cosine_similarity(matrix_articles_embeddings, matrix_articles_embeddings)
    return cosine_similarities_matrix

cosine_similarities_matrix = get_articles_cosine_similarities_matrix(matrix_articles_read_embeddings)


def get_similar_articles(cos_sim_matrix, article_id):
    
    ids = []
    scores = []
    article_index_id = ids_article_read.index(article_id)
    for i in range(len(cos_sim_matrix)):
        ids.append(ids_article_read[i])
        scores.append(cos_sim_matrix[i][article_index_id])

    df_ids_scores = pd.DataFrame(list(zip(ids, scores)), columns=['id', 'score'])    
    df_ids_scores_sorted = df_ids_scores.sort_values(by=['score'], axis=0, ascending=False)
    similar_articles = df_ids_scores_sorted['id'].to_list()
    
    return similar_articles

def get_last_article_read(user_id):
    last_article_id = int(train_df[train_df['user_id']==user_id]['article_id'].iloc[-1])
    return last_article_id



def get_recommendations_articles_similarities(user_id, top_n, matrix_cos_sim):
    if user_id in train_df['user_id'].values:
        last_article_id = get_last_article_read(user_id)
        similar_articles = get_similar_articles(matrix_cos_sim, last_article_id)
        articles_already_read_for_user = get_articles_already_read_for_user(user_id)
        similar_articles_not_already_read = [i for i in similar_articles if i not in set(articles_already_read_for_user)]
        top_n_similar_articles_not_already_read = similar_articles_not_already_read[:top_n]
        
        return top_n_similar_articles_not_already_read
    else:
        print("Error : User does not exist or user with not enough historic")


get_recommendations_articles_similarities(3988, 5, cosine_similarities_matrix)

[292543, 292811, 293292, 292446, 293244]

Unnamed: 0,user_id,article_id,rating
95116,3988,156910,0.008403
2102735,150121,124749,0.100000
1391833,80707,202383,0.166667
739221,40852,140720,0.125000
379651,17834,240233,0.026316
...,...,...,...
982533,53805,57616,0.028571
1500328,88819,289003,0.023256
2039713,142592,129029,0.016667
2107730,150764,158541,0.011494


## b) filtrage collaboratif
   
Le principe de recommandation par filtrage de contenu consiste à construire un profil pour chaque utilisateur ainsi que pour chaque item, à l'aide de mots-clés. Il s'agira ensuite d'associer à un utilisateur les items qui lui correspondent le mieux, en se référent à ses préférences (son profil) ainsi qu'à son historique 

préparation des données : articles aimmé par l'utilisateur 

In [None]:
def get_ratings(clicks):
    ### Compute the rating dataframe providing for each interaction a rating based on the number of clicks per article weighted by the total number of clicks per user
    
    # Create a dataframe containing the number of clicks for each user and each article    
    count_clicks_by_articles_by_user = clicks.groupby(["user_id", "article_id"]).agg(count_clicks_by_articles_by_user=("session_id", "count"))
    
    # Create a dataframe containing the number of clicks for each user   
    count_clicks_by_user = clicks.groupby(["user_id"]).agg(count_clicks_by_user=("session_id", "count"))
    
    # Compute the weighted ratio of clicks
    clicks_count = count_clicks_by_articles_by_user.join(count_clicks_by_user, on="user_id")
    clicks_count['rating'] = clicks_count["count_clicks_by_articles_by_user"] / clicks_count["count_clicks_by_user"]
    
    # Just rename columns
    ratings = clicks_count.reset_index().drop(["count_clicks_by_articles_by_user","count_clicks_by_user"], axis=1).rename(columns={"click_article_id":"article_id"})
    
    return ratings

In [None]:
ratings = get_ratings(clicks)
ratings

Unnamed: 0,user_id,article_id,rating
0,0,68866,0.125
1,0,87205,0.125
2,0,87224,0.125
3,0,96755,0.125
4,0,157541,0.125
...,...,...,...
2950705,322894,168401,0.500
2950706,322895,63746,0.500
2950707,322895,289197,0.500
2950708,322896,30760,0.500


In [None]:
#Sampling
SAMPLE_RATIO = 0.05

ratings_sample = ratings.sample(frac=SAMPLE_RATIO, random_state=8989)
ratings_sample

Unnamed: 0,user_id,article_id,rating
2936274,317101,199393,0.500000
1620248,99395,59758,0.500000
612201,31691,293050,0.016393
1525674,90863,32750,0.076923
1100500,60312,236524,0.050000
...,...,...,...
414904,19887,284463,0.250000
567504,28569,96877,0.040000
551496,27480,114095,0.142857
1078537,59049,199198,0.016393


### séparation des données test et d'entraibnment et Entrainement du modèle 

In [None]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(ratings_sample, train_size=0.8, random_state=31)

In [None]:
!pip install implicit==0.6.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit==0.6.1
  Downloading implicit-0.6.1-cp38-cp38-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.1


### Nous allons tester 3 modèles :


 - Logistic Matrix Factorization : modèle probabiliste pour la factorisation matricielle avec rétroaction implicite ( analyser les relations entre les utilisateurs et les éléments à l'aide de signaux implicites tels que les données de clic ou la lecture en continu de musique compte pour fournir aux utilisateurs des recommandations personnalisées ) 
 
 
 - AlternatingLeastSquares
 - BayesianPersonalizedRanking

In [None]:
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

from tqdm import tqdm

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, ndcg_at_k, AUC_at_k



In [None]:
def train_models(train_df, test_df, models_list, n_recommandations):
    
    '''
    Train recommendation models and provide evaluation metrics.

            Parameters:
                    train_df (Pandas Dataframe): Dataframe containing training interactions datas
                    test_df (Pandas Dataframe): Dataframe containing test interactions datas
                    models_list (List) : Python List containing models to be evaluated
                    n_recommandations (Int) : Integer representing the number of recommandations to provide


            Returns:
                    df_results (Pandas Dataframe): Dataframe containing the results of evaluation performed on the models
    '''
    
    # First, let's initialize a blank results dataframe 
    # Each column correspond to the evaluation metrics to use
    df_results = pd.DataFrame(columns=['model', 'Precision@k','MAP@k','nDCG@k',"train_time"])
    
    # Get dimensions of the sparse matrix    
    dim = (max(train_df.user_id.max(),test_df.user_id.max())+1, max(train_df.article_id.max(),test_df.article_id.max())+1)
    
    # Creating sparse matrix for train and test dataframe
    # Why creating sparse matrix ? : because it is the data format expected by Implicit
    train_csr = csr_matrix((train_df['rating'], (train_df['user_id'], train_df['article_id'])), dim)
    test_csr = csr_matrix((test_df['rating'], (test_df['user_id'], test_df['article_id'])), dim)
    
    # Iterate over each models to evaluate in the list
    for model in models_list:
        
        print("##"*30)
        print("[INFO] : Start training the model : ", model.__class__.__name__)
        
        # Launch the timer
        train_start_time = time()
        
        # Train the choosen model
        model.fit(train_csr)
        #model.save('/content/drive/MyDrive/Colab_Notebooks/')
        # Stop the timer and calculate the training time
        train_time = time() - train_start_time
              
        # Calculate evaluation metrics        
        precision_k = round(precision_at_k(model, train_csr, test_csr), 5)
        map_at_k = round(mean_average_precision_at_k(model, train_csr, test_csr), 5)
        #Gain cumulé actualisé normalisé 
        ndcg_k = round(ndcg_at_k(model, train_csr, test_csr), 5)
        print("[INFO] : Precision@k = ", precision_k)
        print("[INFO] : MAP@k = ", map_at_k)
        print("[INFO] : nDCG@k = ", ndcg_k)
        print("##"*30)
        
        # Log results in the results dataframe
        
        df_results = df_results.append({
            'model': model.__class__.__name__,
            'Precision@k': precision_k,
            'MAP@k': map_at_k,
            'nDCG@k': ndcg_k,
            'train_time': round(train_time,5),
        }, ignore_index=True)
        
    return df_results

In [None]:
models_list = [AlternatingLeastSquares(), BayesianPersonalizedRanking(), LogisticMatrixFactorization()]

train_models(train_df          = train_df,
             test_df           = test_df, 
             models_list       = models_list, 
             n_recommandations = 5)



############################################################
[INFO] : Start training the model :  AlternatingLeastSquares


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

[INFO] : Precision@k =  0.02006
[INFO] : MAP@k =  0.00672
[INFO] : nDCG@k =  0.00999
############################################################
############################################################
[INFO] : Start training the model :  BayesianPersonalizedRanking


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

[INFO] : Precision@k =  0.00346
[INFO] : MAP@k =  0.00094
[INFO] : nDCG@k =  0.00153
############################################################
############################################################
[INFO] : Start training the model :  LogisticMatrixFactorization


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

[INFO] : Precision@k =  0.06945
[INFO] : MAP@k =  0.02254
[INFO] : nDCG@k =  0.03456
############################################################


Unnamed: 0,model,Precision@k,MAP@k,nDCG@k,train_time
0,AlternatingLeastSquares,0.02006,0.00672,0.00999,89.67822
1,BayesianPersonalizedRanking,0.00346,0.00094,0.00153,7.38844
2,LogisticMatrixFactorization,0.06945,0.02254,0.03456,13.23848


In [None]:
models_list = [ LogisticMatrixFactorization()]

train_models(train_df          = train_df,
             test_df           = test_df, 
             models_list       = models_list, 
             n_recommandations = 5)

############################################################
[INFO] : Start training the model :  LogisticMatrixFactorization


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

  0%|          | 0/25966 [00:00<?, ?it/s]

[INFO] : Precision@k =  0.06162
[INFO] : MAP@k =  0.02188
[INFO] : nDCG@k =  0.03209
############################################################


Unnamed: 0,model,Precision@k,MAP@k,nDCG@k,train_time
0,LogisticMatrixFactorization,0.06162,0.02188,0.03209,14.66109


sauvgarde de la fonction de recommandation 

In [None]:
import pickle

def compute_interaction_matrix(clicks):
    # Create interaction DF (count of interactions between users and articles)
    interactions = clicks.groupby(['user_id','article_id']).size().reset_index(name='count')
    # print('Interactions DF shape: ', interactions.shape)

    # csr = compressed sparse row (good format for math operations with row slicing)
    # Create sparse matrix of shape (number_items, number_user)
    csr_item_user = csr_matrix((interactions['count'].astype(float),
                                (interactions['article_id'],
                                 interactions['user_id'])))
    # print('CSR Shape (number_items, number_user): ', csr_item_user.shape)
    
    # Create sparse matrix of shape (number_user, number_items)
    csr_user_item = csr_matrix((interactions['count'].astype(float),
                                (interactions['user_id'],
                                 interactions['article_id'])))
    # print('CSR Shape (number_user, number_items): ', csr_user_item.shape)
    
    return csr_item_user, csr_user_item

def get_cf_reco(clicks, userID, csr_item_user, csr_user_item, model_path=None, n_reco=5, train=True):

    start = time()
    # Train the model on sparse matrix of shape (number_items, number_user)
    
    if train or model_path is None:
        model = LogisticMatrixFactorization(factors= 128, random_state=42)
        print("[INFO] : Start training model")
        model.fit(csr_user_item)

        # Save model to disk
        with open('recommender.model', 'wb') as filehandle:
            pickle.dump(model, filehandle)
    else:
        with open('recommender.model', 'rb') as filehandle:
            model = pickle.load(filehandle)

    # Recommend N best items from sparse matrix of shape (number_user, number_items)
    # Implicit built-in method
    # N (int) : number of results to return
    # filter_already_liked_items (bool) : if true, don't return items present in 
    # the training set that were rated/viewd by the specified user
    recommendations_list = []
    recommendations = model.recommend(userID, csr_user_item[userID], N=n_reco, filter_already_liked_items=True)

    print(f'[INFO] : Completed in {round(time() - start, 2)}s')
    
    recommendations = [elt[0] for elt in recommendations]
    
    return recommendations

In [None]:
# Call the function, train the model and perform recommandations
userID = 61691
csr_item_user, csr_user_item = compute_interaction_matrix(clicks)
get_cf_reco(clicks, userID, csr_item_user, csr_user_item,model_path=None, n_reco=5, train=True)

[INFO] : Start training model


  0%|          | 0/30 [00:00<?, ?it/s]

[INFO] : Completed in 451.43s


[284312, 6.371255]

In [None]:
# Call the function, load the model and perform recommandations

get_cf_reco(clicks, userID, csr_item_user, csr_user_item, model_path="./recommender.model", n_reco=5, train=False)

[INFO] : Completed in 0.07s


[284312, 6.371255]

In [None]:
#!pip install contractions
!pip install azure-cosmos