## Imports

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import os
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import rankdata


## Import data

In [None]:
users = pd.read_csv('article_user.csv')
articles = pd.read_csv('articles.csv')

In [None]:
users

In [None]:
articles

## Explore Data

In [None]:
px.histogram(users.user_id.value_counts())

11k users have read 20 articles; the rest of the users have read a different number of articles (some only 3).
We may have good predictions for the 20-article users but not for the 3-article users with the simple content-based approach

In [None]:
# The elapsed time is divided by 60*60*24, so the x axis is in days (not hours).
# assumes `timestamp` is expressed in seconds — TODO confirm against the data source
px.histogram((users.timestamp - users.timestamp.min())/(60*60*24), title='Time histogram (days since the first read)')

No data gaps in the timeline of the data - Can use all data as training set without concerns

N of articles has a periodic daily behavior. Probably users read less news at night?

In [None]:
users.duplicated().any()

In [None]:
# Reassign instead of mutating in place so the cell's effect is explicit.
users = users.drop_duplicates()

Remove duplicated data in users

In [None]:
users.isna().sum().sum()

No Nans in the data and duplicates dropped

In [None]:
# True when every article_id that appears in `users` also appears in `articles`.
# (Comparing the two id lists for equality would also require identical order.)
users.article_id.isin(articles.article_id).all()

I have a headline for every article in the users table

In [None]:
px.histogram(pd.to_datetime(articles['published_date'], format='%Y-%m-%d %H:%M:%S'))

Most articles were published in 2021; as a first approximation I can recommend any article in this set

Extremely likely the recommended article will be a new article

## Recommending Options

###  Content based recommendation

The only feature I have to describe an article is the headline 
<br>
I can suggest articles that are similar to the articles the user has read in the past
<br>
I need a metric to indicate how similar 2 articles are: I will use a vector representation for the headline and a similarity metric for two vectors
<br>
The 2 vector approaches to explore are TfIdf and Bert

## To delete

In [None]:
# NOTE(review): hard-coded, environment-specific proxy URL. The section heading
# marks this cell for deletion; it should not ship in a shared notebook.
proxy = "http://proxy-chain.intel.com:911"

In [None]:
# Export the proxy under both the lower-case and upper-case variable names.
for proxy_var in ('http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY'):
    os.environ[proxy_var] = proxy

## Implementation of TfIdf and Bert models

In [None]:
class Recommender():
    """Content-based headline recommender backed by a pairwise cosine-similarity matrix.

    Attributes:
        articles: DataFrame with at least the columns ``article_id`` and ``headline``.
        cosine_sim: square similarity matrix set by ``train_tfidf`` / ``train_bert``
            (``None`` until one of them has been called).
        top5: headlines of the most-read articles, set by ``load_top5_articles``
            (``None`` until it has been called).

    Rows of ``cosine_sim`` are looked up by the row *position* of an article in
    ``self.articles``, so the frame passed to ``train_*`` must have the same rows,
    in the same order, as the frame passed to the constructor.
    """

    def __init__(self, articles):
        self.articles = articles
        self.cosine_sim = None
        self.top5 = None

    def train_tfidf(self, train):
        """Build the similarity matrix from TF-IDF vectors of ``train.headline``."""
        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(train.headline)
        self.cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    def train_bert(self, train):
        """Build the similarity matrix from sentence embeddings of ``train.headline``."""
        model = SentenceTransformer('bert-base-nli-mean-tokens')
        # Pass a plain list so the encoder never indexes the Series by label.
        sentence_embeddings = model.encode(list(train.headline))
        self.cosine_sim = cosine_similarity(sentence_embeddings, sentence_embeddings)

    def load_top5_articles(self, users):
        """Store the headlines of the five most-read articles in ``users``, most read first."""
        top5_ids = users['article_id'].value_counts().index[:5]
        self.top5 = self.id2headline(top5_ids)

    def recommend_top5(self, user):
        """Always recommends the top 5 articles across all data (``None`` if not loaded)."""
        return self.top5

    def recommend(self, user, users):
        """Recommend five headlines for ``user``, best match first.

        ``users`` is the read-history frame (columns ``user_id``, ``article_id``).
        When the user has no history and ``load_top5_articles`` has been called,
        the popular-articles list is returned instead.
        """
        prev_articles = self.prev_articlesid(user, users)
        if len(prev_articles) == 0 and self.top5 is not None:
            return self.top5
        similar = self.similar5(prev_articles)
        return self.id2headline(similar)

    def prev_articlesid(self, user, users):
        """All previous article ids for ``user`` in the ``users`` frame."""
        return users[users.user_id == user]['article_id']

    def similar_vector(self, prev_articles):
        """Mean similarity of every headline to the previously read articles.

        Returns a vector with one entry per row of ``cosine_sim``.

        Raises:
            ValueError: if none of ``prev_articles`` is present in ``self.articles``
                (the mean over zero rows would otherwise be all-NaN).
        """
        # Boolean mask -> positional row selection, independent of the frame's index labels.
        read_mask = self.articles['article_id'].isin(prev_articles).to_numpy()
        if not read_mask.any():
            raise ValueError("none of the previous articles are present in the articles table")
        # Plain mean across all previous articles; a later model could weight recent reads more.
        return self.cosine_sim[read_mask].mean(axis=0)

    def _ranks(self, prev_articles):
        """Rank of every article for this history; 0 is the first recommendation."""
        vector = self.similar_vector(prev_articles)
        # rankdata gives the largest similarity the largest rank, so invert it.
        return len(self.articles) - rankdata(vector)

    def similar5(self, prev_articles):
        """Ids of the five best-ranked articles, best first."""
        ranks = self._ranks(prev_articles)
        idxs = ranks.argsort(kind='stable')[:5]
        return self.articles.iloc[idxs]['article_id']

    def id2headline(self, article_ids):
        """Headlines for ``article_ids``, returned in the order the ids were given."""
        order = {}
        for position, article_id in enumerate(article_ids):
            order.setdefault(article_id, position)
        matches = self.articles[self.articles['article_id'].isin(list(order))]
        requested_order = matches['article_id'].map(order).to_numpy().argsort(kind='stable')
        return matches.iloc[requested_order]['headline']

    def rank_article(self, user, users, articleid):
        """Returns the rank in the recommendation system for the article (0 = best).

        Raises:
            IndexError: if ``articleid`` is not present in ``self.articles``.
        """
        prev_articles = self.prev_articlesid(user, users)
        ranks = self._ranks(prev_articles)
        # Row position (not index label) of the article in the articles table.
        position = np.flatnonzero((self.articles['article_id'] == articleid).to_numpy())[0]
        return ranks[position]


The class recommender provides methods for recommendations using two models, they both use vector representations for the headlines 


## Divide the users dataset into test and train
Test is composed of the last article for every user and train is the rest of the data

In [None]:
# Hold out each user's most recent read (largest timestamp) as the test set.
# idxmax is required here: idxmin would hold out the FIRST read and leave
# later reads in the training data.
test = users.loc[users.groupby("user_id")["timestamp"].idxmax()]
# Everything that is not held out is training data (selected by index label).
train = users.drop(index=test.index)
test = test.reset_index(drop=True)
train = train.reset_index(drop=True)

In [None]:
len(train) +len(test) , len(users)

Training the system with the headlines and the tfidf model

In [None]:
recommender = Recommender(articles)

In [None]:
%%time
recommender.train_tfidf(articles)

In [None]:
recommender.cosine_sim.shape, len(articles)

This is the similarity matrix: one row and one column per headline

In [None]:
user = test.user_id.iloc[0]
user

## Recommender method that suggests the top 5 articles for a given user with the tfidf model

In [None]:
# Base the recommendation on the user's reading history (train), not on the held-out test article.
recommender.recommend(user,train)

## Recommender method that suggest top 5 articles for a given user with the bert model

In [None]:
%%time
recommender.train_bert(articles)

In [None]:
# Base the recommendation on the user's reading history (train), not on the held-out test article.
recommender.recommend(user,train)

## Recommender method that suggest top 5 most popular articles regardless of the user

In [None]:
recommender.load_top5_articles(train)

In [None]:
recommender.recommend_top5(user)

####  reasonable approach is to use the naive top 5 most popular articles when there is no previous data for the user

## Models explanation

##### Tfidf revision 

In TfIdf, if t is the term and h is the headline, then:
    
tf(t, h) = count of t in h / number of words in h
<br>
df(t) = number of headlines that contain t
<br>
idf(t) = log(N / df(t)), where N is the total number of headlines

And TfIdf is defined as:

tf-idf(t, h) = tf(t, h) * idf(t)

*So TfIdf is high for terms that are frequent in the headline and is penalized if the term is common across multiple headlines*


#### Implementation of TfidfVectorizer with sklearn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
%%time
tfidf_matrix = tfidf.fit_transform(articles.headline)

#### Confirming the TfidfVectorizer matrix is consistent with our expectations

In [None]:
tfidf_matrix

The shape of the matrix is len(articles) by Number of terms across all headlines

In [None]:
# One idf weight per vocabulary term, rarest terms first.
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
df_idf = pd.DataFrame(tfidf.idf_, index=tfidf.get_feature_names_out(), columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'], ascending=False)

The idf part of Tfidf is working as expected
<br>
Kremlin, monument are very unique words across all headlines, while trump, biden, ... are not

In [None]:
# Densify only the first headline's row (not the whole sparse matrix),
# attach the terms back and print the non-zero weights.
first_headline_weights = tfidf_matrix[0].toarray().ravel()
df_tfidf = pd.DataFrame(first_headline_weights, index=tfidf.get_feature_names_out(), columns=["tfidf"])
df_tfidf[df_tfidf.tfidf != 0].sort_values(by='tfidf')

Tfidf is working as expected for the first headline
<br>
"on, as, it, etc" are dropped because of the *stop_words='english'* option used to build the matrix
<br>
There are no repeated words in the title, so Tfidf is dominated by idf, which is low for "Biden" (common word)

#### Implementation of Similarity Metric

For the similarity metric we can use the cosine function
<br>
Each headline is represented by a vector in our tfidf matrix and the similarity between two headlines will be the cosine between the vectors

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim.shape

cosine_sim shape is Nheadlines x Nheadlines

##### Bert representation 

BERT stands for Bidirectional Encoder Representations from Transformers; it is a pre-trained model architecture developed in 2018 (https://arxiv.org/abs/1810.04805)

We used the implementation in the SentenceTransformers library

## Model evaluation

Approach: The recommendation model is user driven, so I have to evaluate on every user: 

**test set**: the last article read by every user  

**train set**: The rest of the dataset, the data provides at least 3 articles for every user so train will have at least 2 articles per user


**metric** I will calculate Ranking for the test article - A number from 0 to total number of articles (1455) indicating how the model will rank this article. 0 being the first article to recommend and 1455 the last article to recommend

**Potential improvements to the evaluation:**

A cross-validation test will give more reliable results

Deploying the model to prod and running an AB test would be the best possible evaluation of the model





### Calculate the Ranks for the TfIdf  model for the users in the test set

In [None]:
%%time
recommender.train_tfidf(articles)

In [None]:
%%time
bert_ranks = test.apply(lambda x: recommender.rank_article( x.user_id, train,  x.article_id) , axis=1 )

### Calculate the Ranks for the Bert model for the users in the test set

In [None]:
%%time
recommender.train_tfidf(articles)

In [None]:
%%time
tfidf_ranks = test.apply(lambda x: recommender.rank_article( x.user_id, train,  x.article_id) , axis=1 )

#### Calculating the ranks takes ~20 mins 

In [None]:
bert_ranks.to_csv('bert.csv')
tfidf_ranks.to_csv('tfidf.csv')

In [None]:
bert_ranks

Saving the ranks to avoid recomputation

In [None]:
test_ranks = test.copy()
# Number of training articles per user, matched on user_id rather than on row position.
articles_per_user = train.groupby('user_id')['article_id'].count()
test_ranks['NarticlesinTrain'] = test_ranks['user_id'].map(articles_per_user).astype('float')
# `rank_article` already returns ranks where 0 is the first recommendation, so the
# Series are used as-is: subtracting them from len(articles) again would flip the
# scale, and indexing them with ['0'] raises KeyError on a Series.
test_ranks['bert'] = bert_ranks
test_ranks['tfidf'] = tfidf_ranks
test_ranks

In [None]:
# The column created above is named 'NarticlesinTrain'.
test_ranks['NarticlesinTrain']

In [None]:
# Overlay one rank histogram per model, in the same order as before (bert, then tfidf).
fig = go.Figure()
for model_name in ('bert', 'tfidf'):
    fig.add_trace(go.Histogram(x=test_ranks[model_name], name=model_name))
fig.update_layout(title = 'Histograms of the Ranks given for the test article for both models')

In this graph each bar represents the number of times that an article got ranked at that value

The best model will rank most of these articles near zero. A few observations:
    1. The TfIdf model is better at suggesting the top 10 articles. 
    2. The Bert model is better in the 10-700 range
    3. Bert is monotonically decreasing towards the worse ranks; this is what is expected of the model, as we want it to give a high number of good predictions and a low number of bad ones
    4. TfIdf is not monotonic, with an additional peak at around 800-1000; this would indicate the model may not generalize well to new data

In [None]:
px.histogram(test_ranks.sort_values(['NarticlesinTrain']),x='bert',facet_col='NarticlesinTrain', facet_col_wrap=5,title ="Histogram of ranks per N of articles in the train set for the Bert model")

In [None]:
px.histogram(test_ranks.sort_values(['NarticlesinTrain']),x='tfidf',facet_col='NarticlesinTrain', facet_col_wrap=5,title ="Histogram of ranks per N of articles in the train set for the tfidf model")

In the previous plots we partition the ranking histograms by the number of articles in the train data for that user:
    1. The unexpected 800-1000 bump in the tfidf model comes from the users with very little data (NarticlesinTrain = 2 or 3); this indicates that tfidf does a bad job with very limited data. Bert does not have that issue: even with very little data (2, 3, 4) the histogram is still monotonic
    2. For Bert the slope of the histogram seems to increase with larger NarticlesinTrain, meaning that the prediction power does benefit from more data, which is not the case for TfIdf
    3. TfIdf shows the 0-10 range peak regardless of the NarticlesinTrain
    
**Bert is probably going to generalize better in both extremes: less data and more data per user**
    

In [None]:
## Check that high ranking values are due to predicting same read articles

In [None]:
def Isrecommendedseen(user, train, articleid):
    """Return True if `user` already has `articleid` among their rows in `train`.

    `train` is a DataFrame with `user_id` and `article_id` columns.
    The comparison is made against the article_id *values*; the `in` operator on
    a pandas Series would test the index labels instead.
    """
    prev = train.loc[train.user_id == user, 'article_id']
    return bool((prev == articleid).any())

In [None]:
Isrecommendedseen(users_top5_bert.user_id.iloc[0],train, users_top5_bert.article_id.iloc[0])

In [None]:
%%time
recommender.train_bertertert(articles)

In [None]:
%%time
# Keep the rows whose `bert` value is below 6, then flag the ones where the
# test article is already among the user's rows in `train`.
# NOTE(review): `user_Narticles` is not defined anywhere in the visible notebook;
# presumably it is an earlier name of `test_ranks` — confirm before re-running.
# NOTE(review): `< 6` keeps ranks 0-5 (six values); confirm whether `< 5` was intended.
users_top5_bert = user_Narticles[user_Narticles.bert <6]
users_top5_seen_bert = users_top5_bert.apply(lambda x: Isrecommendedseen(x.user_id,train,x.article_id) , axis=1 )

In [None]:
users_top5_seen_bert.sum()

In [None]:
%%time
recommender.train_tfidf(articles)

In [None]:
%%time
# Keep the rows whose `tfidf` value is below 6, then flag the ones where the
# test article is already among the user's rows in `train`.
# NOTE(review): `user_Narticles` is not defined anywhere in the visible notebook;
# presumably it is an earlier name of `test_ranks` — confirm before re-running.
# NOTE(review): `< 6` keeps ranks 0-5 (six values); confirm whether `< 5` was intended.
users_top5_tfidf = user_Narticles[user_Narticles.tfidf <6]
users_top5_seen_tfidf = users_top5_tfidf.apply(lambda x: Isrecommendedseen(x.user_id,train,x.article_id) , axis=1 )

In [None]:
test

In [None]:
users_top5_tfidf

In [None]:
# Spot-check the first row, mirroring the equivalent check done for the bert subset.
Isrecommendedseen(users_top5_tfidf.user_id.iloc[0], train, users_top5_tfidf.article_id.iloc[0])

In [None]:
users_top5_seen_tfidf

In [None]:
len(users_top5_bert) , len(users_top5_tfidf)

In [None]:
## peak at 800 rank for tfidf          

In [None]:
user_Narticles

In [None]:
px.histogram(user_Narticles,x='bert')

In [None]:
# `color_discrete_map=='article_id'` was a comparison, not a keyword argument.
# NOTE(review): colouring by article id is the presumed intent — confirm.
px.histogram(user_Narticles,x='bert',color='article_id')

In [None]:
tfidf_ranks_inv = len(articles)-  tfidf_ranks
bert_ranks_inv = len(articles)-  bert_ranks

In [None]:
tfidf_ranks.value_counts()

In [None]:
len(articles)

In [None]:
tfidf_ranks.value_counts().sort_values()

Bert is monotonic while tfidf shows a second/third peak around 700/550
- The additional peaks may be caused by matching words that have no strong significance

Bert more often predicts high rank values (above 200). However, tfidf predicts better at the very top of the ranking (0-15)
This may be due to overfitting on words very specific to this dataset (Trump, Biden, covid)? 
This may not generalize well to new data

In [None]:
bert_ranks_inv.value_counts().sort_values().iloc[-5:]

In [None]:
tfidf_ranks_inv.value_counts().sort_values().iloc[-5:]

In [None]:
tfidf_ranks_inv.value_counts().sort_values()

In [None]:
fig = go.Figure()
#fig.add_trace(go.Histogram(x=bert_ranks,name='bert'))
fig.add_trace(go.Histogram(x=tfidf_ranks,name = 'tfidf'))


In [None]:
px.histogram()

In [None]:
tfidf_ranks.value_counts().sort_values()

In [None]:
bert_ranks.value_counts().sort_values()

In [None]:
## Comparing both prediction modesl

In [None]:
## Model vs Number of articles available per reader

In [None]:
# Next improvements to validity metric:
#
# Cross validation
# The best approach for this would be AB Testing
