In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#!pip install progressbar
#import progressbar

import os

from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate

from sklearn.metrics.pairwise import linear_kernel

# L'etude de dataset

### Articles_metadata
* article_id : unique id for the article
* category_id : the category of the article; there are 461 categories.
* created_at_ts : timestamp of when the article was created. Drop the last 3 digits before converting it to a date. Oldest article : 2006/09/27 11:14:35. Newest : 2018/3/13 12:12:30.
* publisher_id : seems unused; the only value is 0.
* words_count : how many words are in the article. From 0 to 6690. The distribution is shown a few cells below.

In [3]:
# Load the article metadata table and display it.
articles_metadata = pd.read_csv('../input/news-portal-user-interactions-by-globocom/articles_metadata.csv')  
articles_metadata

In [None]:
#articles_metadata.words_count.hist(bins=50, range=(0,400))
#plt.title('How many words per article');

### Clicks_hours meaning (according to me)
* user_id : unique id for the user.
* session_id : unique id for a user's session. Can appear multiple times because a session can contain multiple clicks.
* session_start : timestamp of the session start. Drop the last 3 digits before converting it to a date. Oldest session : 2017/10/01 2:37:3. Newest session : 2017/10/1 3:39:19
* session_size : how many clicks per session. From 2 to 24. The distribution is shown a few cells below.
* click_article_id : the article that has been clicked on.
* click_environment : unclear, maybe website vs. app? There are only three types, and type 4 accounts for more than 90%. 
* click_deviceGroup : presumably the kind of device: computer, smartphone or tablet?
* click_os : the operating system of the device.
* click_country : which country the user comes from.
* click_region : same but for region.
* click_referrer_type : good question.


In [4]:
# Load a single hourly click file as a sample to explore the click schema, then display it.
clicks_hour_010 = pd.read_csv('../input/news-portal-user-interactions-by-globocom/clicks/clicks/clicks_hour_010.csv')  
clicks_hour_010

In [5]:
# Histogram of session_size for the sample file. Only values in [0, 10] are
# plotted, split into 24 bins; larger sessions are left out of the figure.
clicks_hour_010.session_size.hist(bins=24, range=(0,10))
# Trailing ';' suppresses the Text(...) repr of the title in the cell output.
plt.title('How many clicks per session');

In [6]:
clicks_hour_010.loc[clicks_hour_010.user_id == 0]

In [7]:
clicks_hour_010.loc[clicks_hour_010.session_id == 1506864979264937]

# Methodes
We are going to try several different ways to recommend articles to the user.

* Baseline : as in this video (https://www.youtube.com/watch?v=YMZmLx-AUvY), we build a matrix of articles and user preferences based only on previous clicks.
* Collaborative Filtering : We are going to use the Surprise library. https://github.com/NicolasHug/Surprise
* Content-based Filtering : We are going to use word embedding.
* Hybrid methods

# Content-based : by category
## Traitement de données
Concatenate all the click files into a single file.

In [8]:
%%time

# Build (or reload) the single `clicks` frame holding every hourly click file.
# The concatenated result is cached in 'clicks.csv' so later runs skip the
# expensive directory scan and per-file parsing.
if not os.path.exists('clicks.csv'):
    clicks_dir = "../input/news-portal-user-interactions-by-globocom/clicks/clicks"

    # Sorted so the hourly files are always concatenated in the same order.
    clicks_path = sorted(
        os.path.join(clicks_dir, fname)
        for fname in os.listdir(clicks_dir)
        if fname.endswith(".csv")
    )
    print("Number of clicks csv:", len(clicks_path))

    clicks = pd.concat(
        [pd.read_csv(filename, index_col=None, header=0) for filename in clicks_path],
        axis=0,
        ignore_index=True,
    )
    # index=False: the RangeIndex carries no information, and writing it would
    # come back as a spurious 'Unnamed: 0' column when the cache is reloaded.
    clicks.to_csv('clicks.csv', index=False)
else:
    clicks = pd.read_csv('clicks.csv')
    # Caches written before index=False was used still contain the saved index
    # column; drop it so both branches return the same schema.
    clicks = clicks.drop(columns=['Unnamed: 0'], errors='ignore')

clicks

We need to group every article click per user.

In [9]:
# One row per user (index: user_id); the single column holds the list of every
# article id that user clicked, in the order the clicks appear in `clicks`.
df = clicks.groupby('user_id').agg(
    LIST_click_article_id=pd.NamedAgg(column='click_article_id', aggfunc=list),
)
df

In [11]:
%%time
# Attach to each user the list of category ids matching their clicked articles.
# The result is cached as CSV; a pre-computed copy may be provided as an input
# dataset, in which case it is loaded directly.
if not os.path.exists('../input/p9-data/df.csv'):
    # Build the article_id -> category_id lookup once instead of scanning
    # `articles_metadata` for every single click. keep='first' mirrors the
    # previous behaviour of taking the first matching metadata row.
    _unique_articles = articles_metadata.drop_duplicates(subset='article_id', keep='first')
    _category_by_article = dict(zip(
        _unique_articles['article_id'].tolist(),
        _unique_articles['category_id'].tolist(),
    ))

    # Assign the whole column at once: a chained `df.loc[index]['categories'] = ...`
    # may write into a temporary row copy and silently leave `df` unchanged.
    df['categories'] = df['LIST_click_article_id'].apply(
        lambda articles: [_category_by_article[article] for article in articles]
    )
    df.to_csv('df.csv')

    # Reload what was just written so this branch yields exactly the same schema
    # as the cached branch below (user_id as a column, lists stored as strings).
    df = pd.read_csv('df.csv')
else:
    df = pd.read_csv('../input/p9-data/df.csv')

In [12]:
df

Now we have article_id and category_id that users clicked on.

## Calculate clicks per user
How many times a user clicked on an article from each category_id.

In [14]:
def inputUserRatings(userId):
    """Count, per category, how many articles a user clicked.

    Reads the module-level ``df`` frame and looks at its ``categories`` cell for
    the requested user. That cell may be either a real list of category ids or
    the string form of such a list (what a CSV round-trip produces).

    Args:
        userId: id of the user. It is matched against the ``user_id`` column
            when ``df`` has one, otherwise against the index of ``df``.

    Returns:
        pandas.DataFrame with one row per clicked category, most clicked first,
        and the columns ``category_id`` (int), ``click`` (number of clicks) and
        ``click_norm`` (``click`` divided by the user's largest ``click``).

    Raises:
        KeyError: if the user is not present in ``df``.
    """
    if 'user_id' in df.columns:
        # CSV-loaded frame: user_id is a regular column, so the row label is
        # only a position and must not be used as the user id.
        _rows = df.loc[df['user_id'] == userId, 'categories']
        if _rows.empty:
            raise KeyError(userId)
        _row = _rows.iloc[0]
    else:
        _row = df.loc[userId, 'categories']

    if isinstance(_row, str):
        # "[281, 375, 281]" -> ['281', '375', '281']
        _row = _row.replace('[', '').replace(']', '').replace(',', '').split()

    _counts = pd.Series([int(category) for category in _row], dtype='int64').value_counts()

    _matrix = pd.DataFrame({
        'category_id': _counts.index.astype(int),
        'click': _counts.to_numpy(),
    })
    # Normalise by the most clicked category so the best one weighs 1.0.
    _matrix['click_norm'] = _matrix['click'] / _counts.max()

    return _matrix

In [15]:
inputUserRatings(0)

## Recommend articles from the 360 000 articles available based on preferred category_id.

In [17]:
articles_matrix = articles_metadata.loc[:, ['article_id', 'category_id']]
display(articles_matrix.dtypes)

In [22]:
def recommend5(userId, articles_matrix, n=None):
    """Rank articles for a user by how much the user clicked their category.

    Each article receives the ``click_norm`` weight that ``inputUserRatings``
    computed for its category; articles from categories the user never clicked
    get weight 0 and are dropped from the result.

    Args:
        userId: id of the user to recommend for.
        articles_matrix: DataFrame with at least the columns ``article_id`` and
            ``category_id``. It is not modified.
        n: optional maximum number of rows to return. ``None`` (default) returns
            every article with a positive weight.

    Returns:
        pandas.DataFrame with the columns ``article_id``, ``category_id`` and
        ``weight``, sorted by decreasing ``weight``. Articles with equal weight
        keep their relative order from ``articles_matrix``.
    """
    _input_user_ratings = inputUserRatings(userId)

    # category_id -> weight lookup, built once. Mapping through it replaces two
    # boolean DataFrame filters per article, which made the ranking very slow.
    _weight_by_category = {
        int(category): float(weight)
        for category, weight in zip(
            _input_user_ratings['category_id'], _input_user_ratings['click_norm']
        )
    }

    _weighed_articles_matrix = articles_matrix[['article_id', 'category_id']].copy()
    _weighed_articles_matrix['weight'] = (
        _weighed_articles_matrix['category_id']
        .map(_weight_by_category)
        .fillna(0.0)  # categories the user never clicked
        .astype(float)
    )

    _ranked = _weighed_articles_matrix[_weighed_articles_matrix.weight > 0].sort_values(
        by=['weight'], ascending=False, kind='stable'
    )
    return _ranked if n is None else _ranked.head(n)

In [28]:
%%time
res = recommend5(322892 , articles_matrix)
res

**Note:** this algorithm is slow. The call being timed is:

    %%time
    res = recommend5(322892 , articles_matrix)
    res

In [24]:
%%time
res1 = recommend5(3 , articles_matrix)
res1

# Collaborative filtering
Let's try using the Surprise library.
https://medium.com/hacktive-devs/recommender-system-made-easy-with-scikit-surprise-569cbb689824
## Prepare data

In [29]:
clicks.head()

In [30]:
articles_metadata.head()

In [31]:
%%time
# Attach the article metadata columns to every click by matching
# clicks.click_article_id with articles_metadata.article_id (default inner join).
dataframe = clicks.merge(articles_metadata, left_on='click_article_id', right_on='article_id')

In [32]:
# Keep only the three columns used below.
# NOTE: this rebinds `dataframe`, so re-running the cell requires the merge cell first.
dataframe = dataframe[['user_id', 'article_id', 'category_id']]
dataframe

In [39]:
%%time
# Number of clicks for each (user, category) pair; that count is used as the
# implicit "rate" a user gives to a category.
temp = dataframe.groupby(['user_id', 'category_id']).size()
# Flatten the two index levels back into columns and name the count column.
user_rating_matrix = temp.reset_index(name='rate')

In [40]:
user_rating_matrix

This dataframe contains one row per (user, category) pair, for every user.

In [41]:
# Keep only the (user, category) pairs clicked more than once.
_x = user_rating_matrix.loc[user_rating_matrix.rate > 1]

# Surprise clips its estimates to `rating_scale`, so the scale has to cover the
# click counts actually present; a hard-coded (1, 10) would cap every prediction
# at 10 even for pairs with more clicks. Falls back to (1, 10) on empty data.
_rating_scale = (int(_x.rate.min()), int(_x.rate.max())) if len(_x) else (1, 10)
reader = Reader(rating_scale=_rating_scale)
data = Dataset.load_from_df(_x[['user_id', 'category_id', 'rate']], reader)

print('We have selects', len(_x), 'interactions.')

## To explore later: the Surprise library

In [37]:
%%time
# Hold out 25% of the interactions for evaluation. The seed is fixed so the
# split, and therefore the reported RMSE, is the same on every run.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
print('Test set lenght :', len(testset))
# Read the size from the trainset itself rather than inferring it by subtraction.
print('Train set lenght :', trainset.n_ratings)

In [38]:
%%time
from surprise import SVD, accuracy

# SVD initialises its latent factors randomly; fixing the seed makes the fitted
# model and the metrics computed from it reproducible across runs.
algo = SVD(random_state=42)
algo.fit(trainset)

In [None]:
%%time
predictions = algo.test(testset)
print('Number of predictions in Test set :', len(predictions))

In [None]:
%%time
accuracy.rmse(predictions)

In [None]:
from collections import defaultdict


def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-rated items.

    Args:
        predictions: iterable of 5-item prediction tuples
            (user id, item id, true rating, estimated rating, details), such as
            the list returned by the ``test`` method of an algorithm.
        n (int): maximum number of items kept per user. Default is 10.

    Returns:
        A defaultdict mapping each user (raw) id to a list of at most n
        ``(raw item id, estimated rating)`` tuples, highest estimate first.
        Items with the same estimate keep the order they had in ``predictions``.
    """
    ranked_by_user = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it belongs to.
    for uid, iid, _true_rating, estimate, _details in predictions:
        ranked_by_user[uid].append((iid, estimate))

    # Order each bucket by decreasing estimate and truncate it to n entries.
    for uid in ranked_by_user:
        best_first = sorted(
            ranked_by_user[uid], key=lambda pair: pair[1], reverse=True
        )
        ranked_by_user[uid] = best_first[:n]

    return ranked_by_user