# Recommendation System Articles sharing and reading from CI&T DeskDrop

### This notebook was based on Susan Li Medium tutorial 
(https://towardsdatascience.com/building-a-collaborative-filtering-recommender-system-with-clickstream-data-dffc86c8c65)

This recommendation system has been implemented using Alternating Least Squares (https://www.cs.cmu.edu/~mgormley/courses/10601-s17/slides/lecture25-mf.pdf)

## Preprocessing

In [13]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler

articles_df = pd.read_csv('shared_articles.csv')
interactions_df = pd.read_csv('users_interactions.csv')

In [14]:
articles_df.head()

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en


In [15]:
percent_missing = articles_df.isnull().sum() * 100 / len(df)
missing_values = pd.DataFrame({'column_name': articles_df.columns,
                               'percent_missing': percent_missing})
missing_values

Unnamed: 0,column_name,percent_missing
timestamp,timestamp,0.0
eventType,eventType,0.0
contentId,contentId,0.0
authorPersonId,authorPersonId,0.0
authorSessionId,authorSessionId,0.0
authorUserAgent,authorUserAgent,3.379042
authorRegion,authorRegion,3.379042
authorCountry,authorCountry,3.379042
contentType,contentType,0.0
url,url,0.0


In [16]:
interactions_df

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,
5,1465413742,VIEW,310515487419366995,-8763398617720485024,1395789369402380392,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,MG,BR
6,1465415950,VIEW,-8864073373672512525,3609194402293569455,1143207167886864524,,,
7,1465415066,VIEW,-1492913151930215984,4254153380739593270,8743229464706506141,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR
8,1465413762,VIEW,310515487419366995,344280948527967603,-3167637573980064150,,,
9,1465413771,VIEW,3064370296170038610,3609194402293569455,1143207167886864524,,,


In [17]:
percent_missing = interactions_df.isnull().sum() * 100 / len(df)
missing_values = pd.DataFrame({'column_name': interactions_df.columns,
                               'percent_missing': percent_missing})
missing_values

Unnamed: 0,column_name,percent_missing
timestamp,timestamp,0.0
eventType,eventType,0.0
contentId,contentId,0.0
personId,personId,0.0
sessionId,sessionId,0.0
userAgent,userAgent,21.300973
userRegion,userRegion,21.316194
userCountry,userCountry,21.300973


In [18]:
articles_df.drop(['authorUserAgent', 'authorRegion', 'authorCountry'], axis=1, inplace=True)
interactions_df.drop(['userAgent', 'userRegion', 'userCountry'], axis=1, inplace=True)

articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']
articles_df.drop('eventType', axis=1, inplace=True)
df = pd.merge(interactions_df[['contentId','personId', 'eventType']], 
              articles_df[['contentId', 'title']], how = 'inner', on = 'contentId')

df.head()

Unnamed: 0,contentId,personId,eventType,title
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem


In [19]:
df['eventType'].value_counts()

VIEW               61043
LIKE                5745
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
Name: eventType, dtype: int64

In [21]:
# Changing the eventType column from string to numerical value. A higher value is given to action that shows a greater interest 
# in the article.
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 3.0, 
   'FOLLOW': 4.0,
   'COMMENT CREATED': 5.0,  
}
df['eventStrength'] = df['eventType'].apply(lambda x: event_type_strength[x])

In [23]:
df = df.drop_duplicates()
grouped_df = df.groupby(['personId', 'contentId', 'title']).sum().reset_index()
grouped_df.sample(10)

Unnamed: 0,personId,contentId,title,eventStrength
24301,1673026171377815127,-78066964941874046,Microservices testing,1.0
9570,-4160173091318455989,-14569272361926584,Java 8 Streams - A Deeper Approach About Perfo...,1.0
370,-9016528795238256703,-1453783314552286835,Progressive Web App - first introduced on Goog...,1.0
25232,2062590537770243336,408569329455206654,These gloves can convert sign language into sp...,1.0
24698,1895326251577378793,-205193648629294862,Is FinTech Forcing Banking to a Tipping Point?,9.0
4686,-7087518691724367681,-14569272361926584,Java 8 Streams - A Deeper Approach About Perfo...,1.0
24741,1895326251577378793,3577117590019014785,Hands-on: Google Assistant's Allo chatbot outd...,1.0
2994,-8036997159314605196,3288699993147470797,Primeiro chatbot do varejo brasileiro no Faceb...,1.0
2955,-8051903121006324833,6222001809679591440,Blockchain startup R3 cuts fund-raising target...,1.0
6143,-6316613156648676087,6015995473246774717,The need to lead in data and analytics,3.0


## Implicit

In [24]:
# 1) Creating a numeric version of personid and contentid to avoid nugative values warnings
# 2) Creating two matrices, one for fitting the model (content-person) and one for recommendations (person-content)
# 3) Initializing the Alternating Least Squares (ALS) recommendation model
grouped_df['title'] = grouped_df['title'].astype("category")
grouped_df['personId'] = grouped_df['personId'].astype("category")
grouped_df['contentId'] = grouped_df['contentId'].astype("category")
# map the values and codes in every rows
grouped_df['person_id'] = grouped_df['personId'].cat.codes
grouped_df['content_id'] = grouped_df['contentId'].cat.codes

sparse_content_person = sparse.csr_matrix((grouped_df['eventStrength'].astype(float), 
                                           (grouped_df['content_id'], grouped_df['person_id'])))
sparse_person_content = sparse.csr_matrix((grouped_df['eventStrength'].astype(float), 
                                           (grouped_df['person_id'], grouped_df['content_id'])))

model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

alpha = 15
data = (sparse_content_person * alpha).astype('double')
model.fit(data)

100%|████████████████████████████████████████████████████████████████████████████████| 50.0/50 [00:00<00:00, 74.23it/s]


In [32]:
grouped_df.sample(10)

Unnamed: 0,personId,contentId,title,eventStrength,person_id,content_id
36190,6947583688031316012,2106559900295325351,"Applying the Linus Torvalds ""Good Taste"" Codin...",1.0,1647,1802
34668,5853826697811091550,-885886002174762919,"Onion Creates a $5 Linux Computer with Wi-Fi, ...",1.0,1543,1329
33127,4971787747747052991,6244532954645766056,3 Big Blockchain Ideas MIT is Working on Right...,1.0,1456,2497
22723,1070632260016389736,-4944551138301474550,Algorithms and architecture for job recommenda...,1.0,1056,709
24909,1908339160857512799,1233935898842140014,Feito por Elas #01 Agnieszka Holland | AntiCast,3.0,1147,1672
20879,292607709308972335,-9056114023474725450,Acquia Engage 2016: Day One,1.0,967,38
16464,-1443636648652872475,1954074927376897165,Swarm A.I. Correctly Predicts the Kentucky Der...,10.0,798,1778
18830,-634318177541615985,-662806181534790446,Wells Fargo to Verify Customers through Eye Pr...,1.0,865,1365
7214,-5706287032724665714,7544768317373280661,Google adiciona medidor de velocidade da inter...,1.0,348,2716
2251,-8550167523008133722,7359386552064322134,E os vencedores do Latam Founders Awards 2016 ...,1.0,66,2674


In [29]:
content_id = 187
n_similar = 10

person_vecs = model.user_factors
content_vecs = model.item_factors

content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))

scores = content_vecs.dot(content_vecs[content_id]) / content_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / content_norms[content_id]), key=lambda x: -x[1])

for content in similar:
    idx, score = content
    print(grouped_df.title.loc[grouped_df.content_id == idx].iloc[0])

Ray Kurzweil: The world isn't getting worse - our information is getting better
Livro: Retrospectivas Divertidas
Novo workaholic trabalha, pratica esportes e tem tempo para a família. Conheça
The brilliant mechanics of Pokémon Go
Por que a limitação da banda larga é uma forma de ignorar o futuro
The Broken Window Theory
A step-by-step guide to agile growth experiments
Do You Suffer From Deployment Anxiety? - DZone DevOps
Explorando o novo .NET multiplataforma: ASP.NET Core, .NET Core e EF Core
Microsoft adquire LinkedIn por US$ 26,2 bilhões


In [30]:
def recommend(person_id, sparse_person_content, person_vecs, content_vecs, num_contents=10):
    # Get the interactions scores from the sparse person content matrix
    person_interactions = sparse_person_content[person_id,:].toarray()
    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1
    # Make articles already interacted zero
    person_interactions[person_interactions > 1] = 0
    # Get dot product of person vector and all content vectors
    rec_vector = person_vecs[person_id,:].dot(content_vecs.T).toarray()
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Content already interacted have their recommendation multiplied by zero
    recommend_vector = person_interactions * rec_vector_scaled
    # Sort the indices of the content into order of best recommendations
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    # Start empty list to store titles and scores
    titles = []
    scores = []

    for idx in content_idx:
        # Append titles and scores to the list
        titles.append(grouped_df.title.loc[grouped_df.content_id == idx].iloc[0])
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'title': titles, 'score': scores})

    return recommendations

In [33]:
# Get the trained person and content vectors. We convert them to csr matrices
person_vecs = sparse.csr_matrix(model.user_factors)
content_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for person with id 1456
person_id = 1456

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

                                               title     score
0  Grupo de bancos adere à tecnologia do bitcoin ...  0.996585
1                                 Aposta na inovação  0.962688
2  Cinco competências essenciais ao IT Leaders - CIO  0.950309
3  Java 8 Streams - A Deeper Approach About Perfo...  0.938271
4                     Abrimos uma conta no Original.  0.923425
5                         Changing change management  0.915626
6  Nova regra do Banco Central pode significar o ...  0.900172
7  Welcome to GoogleBank, Facebook Bank, Amazon B...  0.888650
8  Monitoramento em tempo real com Elasticsearch ...  0.881843
9  Cade autoriza Itaú Unibanco e Mastercard a cri...  0.881833


In [34]:
grouped_df.loc[grouped_df['person_id'] == 1456].sort_values(by=['eventStrength'], 
                                                          ascending=False)[['title', 'person_id', 'eventStrength']].head(10)

Unnamed: 0,title,person_id,eventStrength
33125,Itaú é pioneiro do blockchain,1456,1.0
33126,"Por dentro do Nubank, conheça os segredos da f...",1456,1.0
33127,3 Big Blockchain Ideas MIT is Working on Right...,1456,1.0


## Evaluation

### Following Jupiter nbviewer guide: https://nbviewer.jupyter.org/github/jmsteinw/Notebooks/blob/master/RecEngine_NB.ipynb

In [35]:
import random

def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of item,user index into list

    
    random.seed(0) # Set the random seed to zero for reproducibility
    
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of item-user pairs without replacement

    content_inds = [index[0] for index in samples] # Get the item row indices

    person_inds = [index[1] for index in samples] # Get the user column indices

    
    training_set[content_inds, person_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    
    return training_set, test_set, list(set(person_inds))

In [36]:
content_train, content_test, content_persons_altered = make_train(sparse_content_person, pct_test = 0.2)

In [37]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)

In [38]:
def calc_mean_auc(training_set, altered_persons, predictions, test_set):
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_contents = np.array(test_set.sum(axis = 1)).reshape(-1) # Get sum of item iteractions to find most popular
    content_vecs = predictions[1]
    for person in altered_persons: # Iterate through each user that had an item altered
        training_column = training_set[:,person].toarray().reshape(-1) # Get the training set column
        zero_inds = np.where(training_column == 0) # Find where the interaction had not yet occurred
        
        # Get the predicted values based on our user/item vectors
        person_vec = predictions[0][person,:]
        pred = person_vec.dot(content_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[:,person].toarray()[zero_inds,0].reshape(-1)
        
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_contents[zero_inds] # Get the item popularity for our chosen items
        
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [40]:
from sklearn import metrics
calc_mean_auc(content_train, content_persons_altered,
              [person_vecs, content_vecs.T], content_test)

(0.981, 0.819)