In [218]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds

from sklearn.preprocessing import binarize
from sklearn.preprocessing import normalize


from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix

from lightfm import LightFM
from lightfm.data import Dataset
import lightfm as lm
from lightfm import cross_validation 
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score, reciprocal_rank

import heapq

In [2]:
behaviors = pd.read_csv("../../data/mind_small_train/behaviors_processed.csv")
news = pd.read_csv("../../data/mind_small_train/news_processed.csv")

In [3]:
behaviors.drop_duplicates(subset="user_id", inplace=True)
behaviors.head()

Unnamed: 0,impression_id,user_id,time,history,labels
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [4]:
user_article_interactions = behaviors.set_index('user_id').history.str.split(' ', expand =True)
user_article_interactions = user_article_interactions.stack().reset_index(1, drop=True).reset_index(name='article')

In [5]:
user_article_interactions['read'] = 1

In [6]:
user_article_interactions.head()

Unnamed: 0,user_id,article,read
0,U13740,N55189,1
1,U13740,N42782,1
2,U13740,N34694,1
3,U13740,N45794,1
4,U13740,N18445,1


In [7]:
input_sparse_matrix_features_1 = list(news.category)

In [8]:
dataset = Dataset()
dataset.fit(user_article_interactions['user_id'], user_article_interactions['article'], item_features=input_sparse_matrix_features_1)

In [9]:
input_sparse_matrix = list(user_article_interactions.to_records(index=False))

In [10]:
interactions, weights = dataset.build_interactions(input_sparse_matrix)

In [11]:
train, test = cross_validation.random_train_test_split(interactions, test_percentage=0.5, random_state=None)

In [12]:
loss = 'bpr'
no_components = 20
epochs = 20

In [13]:
model_cf = LightFM(no_components=no_components, loss=loss)

In [14]:
model_cf.fit(train, epochs=epochs)

<lightfm.lightfm.LightFM at 0x7feaf46b3c18>

In [16]:
def evaluate(train, test, model):
    
    
    auc_train = np.mean(auc_score(model, train))
    pre_train = np.mean(precision_at_k(model, train))
    mrr_train = np.mean(reciprocal_rank(model, train))
    
    auc_test = np.mean(auc_score(model, test))
    pre_test = np.mean(precision_at_k(model, test))
    mrr_test = np.mean(reciprocal_rank(model, test))    
    
    res_dict = {'auc_train': auc_train, 'pre_train': pre_train, 'mrr_train': mrr_train, 
                'auc_test': auc_test, 'pre_test': pre_test, 'mrr_test': mrr_test}
                  
    print('The AUC Score is in training/validation:                   ',auc_train,' / ', auc_test)
    print('The mean precision at k Score in training/validation is:   ',pre_train, ' / ', pre_test)
    print('The mean reciprocal rank in training/validation is:        ',mrr_train, ' / ', mrr_test)
    print('_______________________________________________________')
    
    
    return res_dict

In [17]:
bpr_dict = evaluate(train, test, model_cf)

The AUC Score is in training/validation:                    0.9282328  /  0.6116126
The mean precision at k Score in training/validation is:    0.16425708  /  0.019774096
The mean reciprocal rank in training/validation is:         0.69819075  /  0.074633285
_______________________________________________________


In [17]:
model_cf_warp = LightFM(no_components=no_components, loss='warp')

In [18]:
model_cf_warp.fit(train, epochs=epochs)

<lightfm.lightfm.LightFM at 0x7fc1e1377cc0>

In [19]:
warp_dict = evaluate(train, test, model_cf_warp)

The AUC Score is in training/validation:                    0.988031  /  0.90863866
The mean precision at k Score in training/validation is:    0.056809105  /  0.0329247
The mean reciprocal rank in training/validation is:         0.19342019  /  0.11589691
_______________________________________________________


In [20]:
model_cf_warpkaos = LightFM(no_components=no_components, loss='warp-kos')

In [22]:
model_cf_warpkaos.fit(train,epochs = epochs)

<lightfm.lightfm.LightFM at 0x7fc1e1377fd0>

In [23]:
warpkaos_dict = evaluate(train, test, model_cf_warpkaos)

The AUC Score is in training/validation:                    0.96725214  /  0.8893001
The mean precision at k Score in training/validation is:    0.044080418  /  0.016870283
The mean reciprocal rank in training/validation is:         0.16919291  /  0.06081621
_______________________________________________________


In [24]:
model_cf_log = LightFM(no_components=no_components, loss='logistic')

In [25]:
model_cf_log.fit(train,epochs = epochs)

<lightfm.lightfm.LightFM at 0x7fc1a61339e8>

In [26]:
log_dict = evaluate(train, test, model_cf_log)

The AUC Score is in training/validation:                    0.9354671  /  0.92300755
The mean precision at k Score in training/validation is:    0.033999246  /  0.03380575
The mean reciprocal rank in training/validation is:         0.11626061  /  0.11625342
_______________________________________________________


In [15]:
news.shape

(50434, 8)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [None]:
behaviors['history'] = behaviors.history.str.split(' ')

In [17]:
news['news_text'] = news['title'] + ' ' + news['abstract']

In [18]:
tfidf = TfidfVectorizer(stop_words='english')

In [19]:
text_matrix= tfidf.fit_transform(news['news_text'].apply(lambda x: np.str_(x)))

In [20]:
text_matrix.shape

(50434, 54324)

In [21]:
text_matrix[0][:]

<1x54324 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [22]:
similarity_matrix = linear_kernel(text_matrix,text_matrix)
similarity_matrix.shape

(50434, 50434)

In [23]:
mapping_id = pd.Series(news.index,index = news['article_id'])
mapping_id['N16909']

50429

In [96]:
news_2 = news.set_index('article_id')
mapping_title = pd.Series(news_2.title)
mapping_title['N53526']

"I Was An NBA Wife. Here's How It Affected My Mental Health."

In [217]:
def recommended_articles(news_id, k=10):
    news_index = mapping_id[news_id]
    similarity_score = list(enumerate(similarity_matrix[news_index]))
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    similarity_score = similarity_score[1:k+1]
    news_indices = [i[0] for i in similarity_score]
    return (news['article_id'].iloc[news_indices].values)  #news['title'].iloc[news_indices])


In [26]:
recommender_text('N24217')

array(['N48828', 'N13856', 'N16308', 'N33131', 'N23206', 'N389', 'N1634',
       'N34192', 'N29952', 'N41317'], dtype=object)

In [27]:
from progressbar import ProgressBar

In [97]:
recommendations[2], behaviors.history.loc[2]

(array(['N1132', 'N36457', 'N56069', 'N24940', 'N14043', 'N22146',
        'N60826', 'N58604', 'N14982', 'N40112'], dtype=object),
 ['N10732',
  'N25792',
  'N7563',
  'N21087',
  'N41087',
  'N5445',
  'N60384',
  'N46616',
  'N52500',
  'N33164',
  'N47289',
  'N24233',
  'N62058',
  'N26378',
  'N49475',
  'N18870'])

In [103]:
mapping_title['N1132'],mapping_title['N10732'], mapping_title['N36457'], mapping_title['N14043']

("Squirrels stashed over 200 walnuts under couple's car hood",
 "Couple Didn't Know Why Car Was Running Strangely   Then They Popped The Hood",
 '2021 Ford F-150 Spied Towing A Trailer Over The Rocky Mountains',
 'Butler County-produced engine wins $4B order')

In [214]:
def get_recs_tfidf_first(user_id):
    used_article = user_lookup.loc[user_id].history[0]
    recs_for_article = recommended_articles(used_article)
    article_titles = []
    
    for article in recs_for_article:
        article_titles.append(mapping_title[article])
    
    
    hits = 0
    for article in recs_for_article:
        if article in user_lookup.loc[user_id].history:
            hits += 1
    
    print(f'The first read article of user {user_id} was: \n {mapping_title[used_article]}')
    print('_______________________________________________________')
    print(f'The suggested articles are:\n')
    for article in article_titles:
        print(article) 
   # print(f'The suggested articles are: {str(article_titles)}',)
    
    
    #return recs_for_article, hits, article_titles

In [215]:
user_lookup = behaviors.set_index('user_id').copy()
user_lookup.loc['U13740'].history[0]

'N55189'

In [216]:
get_recs_tfidf_first('U13740')

The first read article of user U13740 was: 
 'Wheel Of Fortune' Guest Delivers Hilarious, Off The Rails Introduction
_______________________________________________________
The suggested articles are:

Best Response Ever From a 'Wheel of Fortune' Contestant?
Viral Wheel of Fortune Contestant and His Wife Clarify Hilarious 'Loveless Marriage' Intro
'Wheel Of Fortune' Host Pat Sajak Recovers After Surgery
'Wheel Of Fortune' Host Pat Sajak Undergoes Emergency Surgery; Vanna White Hosts Temporarily
Wheel Of Fortune's Pat Sajak Undergoes 'Successful Emergency Surgery'
Wheel Of Fortune's Pat Sajak Says 'Worst Has Passed' After Emergency Surgery Last Week
Pat Sajak recovering from emergency surgery
'Wheel of Fortune' fans can't believe all three contestants missed puzzle
Wheel of Fortune's Pat Sajak Says the 'Worst Has Passed' Following Emergency Intestine Surgery
ICYMI: The week in TV news for Oct. 13-19, 2019


In [51]:
len(x), behaviors.shape[0]

(49108, 49108)

In [94]:
behaviors.head()

Unnamed: 0,impression_id,user_id,time,history,labels
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


### Hybrid model
Now let's see what we can get with a hybrid model. In our case, due to our user data being anonymized and the click history being time insensitive, we can only work with additional information concerning the articles. In order to do this, we will first construct antoher  sparse matrix for articles and their features:

In [17]:
news.head()

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [102]:
articles_in_subset = user_article_interactions.article.unique()
len(articles_in_subset)

32827

In [103]:
articles_in_news = news.iloc[:, 0]
len(articles_in_news)

50434

In [104]:
articles_union = set(articles_in_news) & set(articles_in_subset)
len(articles_union)

32825

In [105]:
extra_in_behaviors = set(articles_in_subset) - set(articles_in_news)
len(extra_in_behaviors)

2

In [106]:
extra_in_behaviors

{'N117002', 'N2325787'}

In [107]:
news[news['article_id'] == 'N117002']

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,news_text


In [108]:
news[news['article_id'] == 'N2325787']

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,news_text


In [109]:
news_articles_subset = news[news['article_id'].isin(articles_union)].copy()

In [110]:
len(news_articles_subset)

32825

In [111]:
news_articles_subset['features'] = [[cat] for cat in news_articles_subset.category]

In [126]:
input_sparse_matrix_features_1 = list(news_articles_subset.category)

#input_sparse_matrix_features_1

In [127]:
article_features = news_articles_subset[['article_id', 'features' ]]

In [128]:
input_sparse_matrix_features = list(article_features.to_records(index=False))
#input_sparse_matrix_features

In [129]:
features = dataset.build_item_features(input_sparse_matrix_features, normalize=True)

In [131]:
def evaluate_hybrid(train, test, loss = 'warp', no_components = 20, epochs =20):
    model_hybrid = LightFM(no_components=no_components, loss=loss)
    model_hybrid.fit(train, item_features= features, epochs=epochs)
    
    auc = np.mean(auc_score(model_hybrid, test, item_features=features))
    mean_pre = np.mean(precision_at_k(model_hybrid, test, item_features=features))
    mean_reciprocal_rank = np.mean(reciprocal_rank(model_hybrid, test, item_features=features))         
                  
    print('The AUC Score is:                   ',auc)
    print('The mean precision at k Score is:   ',mean_pre)
    print('The mean reciprocal rank is:        ',mean_reciprocal_rank)
    print('_______________________________________________________')
    return auc, mean_pre, mean_reciprocal_rank

In [116]:
model_hybrid_bpr = LightFM(no_components=no_components, loss=loss)

In [117]:
model_hybrid_bpr.fit(train, item_features= features, epochs=epochs)

<lightfm.lightfm.LightFM at 0x7fd64b46f390>

In [125]:
evaluate(train, test, model_hybrid_bpr)

ValueError: Incorrect number of features in item_features

In [132]:
evaluate_hybrid(train, test)

The AUC Score is:                    0.90457815
The mean precision at k Score is:    0.030981
The mean reciprocal rank is:         0.10764621
_______________________________________________________


(0.90457815, 0.030981, 0.10764621)

In [133]:
evaluate_hybrid(train, test, loss = 'bpr')

The AUC Score is:                    0.65354455
The mean precision at k Score is:    0.016497143
The mean reciprocal rank is:         0.06738684
_______________________________________________________


(0.65354455, 0.016497143, 0.06738684)