In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds

from sklearn.preprocessing import binarize
from sklearn.preprocessing import normalize


from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix

from lightfm import LightFM
from lightfm.data import Dataset
import lightfm as lm
from lightfm import cross_validation 
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score, reciprocal_rank



In [2]:
# Load the pre-processed MIND-small training files (impression logs and article metadata).
behaviors_path = "../../data/mind_small_train/behaviors_processed.csv"
news_path = "../../data/mind_small_train/news_processed.csv"

behaviors = pd.read_csv(behaviors_path)
news = pd.read_csv(news_path)

In [3]:
# Keep only the first row seen for each user_id, so each user contributes a single history string.
behaviors = behaviors.drop_duplicates(subset=["user_id"], keep="first")
behaviors.head(n=5)

Unnamed: 0,impression_id,user_id,time,history,labels
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [4]:
# Turn the space-separated `history` string of every user into long form:
# one row per (user_id, article) pair.
user_article_interactions = (
    behaviors
    .set_index('user_id')['history']
    .str.split(' ', expand=True)       # one column per position in the history
    .stack()                           # wide -> long; padding cells (NaN/None) are dropped
    .reset_index(level=1, drop=True)   # discard the positional level created by stack()
    .reset_index(name='article')
)

In [5]:
# Every row is an observed click, so the `read` column is the constant 1.
user_article_interactions = user_article_interactions.assign(read=1)

In [6]:
# Preview the first rows of the long-format interaction table.
user_article_interactions.head(n=5)

Unnamed: 0,user_id,article,read
0,U13740,N55189,1
1,U13740,N42782,1
2,U13740,N34694,1
3,U13740,N45794,1
4,U13740,N18445,1


In [None]:
# Category label of every article in `news` (repeats included); used as the item-feature names below.
input_sparse_matrix_features_1 = news['category'].tolist()

In [33]:
# Register every user id, article id and item-feature name with the LightFM Dataset
# so it can map them to internal indices.
dataset = Dataset()
dataset.fit(
    users=user_article_interactions['user_id'],
    items=user_article_interactions['article'],
    item_features=input_sparse_matrix_features_1,
)

In [8]:
# One (user_id, article, read) record per interaction row.
input_sparse_matrix = [record for record in user_article_interactions.to_records(index=False)]

In [9]:
# Build the sparse matrices returned by the Dataset from the interaction records.
(interactions, weights) = dataset.build_interactions(data=input_sparse_matrix)

In [10]:
# Split the interactions 50/50 into train and test.
# A seeded RandomState makes the split reproducible across kernel restarts, so
# the metrics computed in later cells can be compared between runs.
train, test = cross_validation.random_train_test_split(
    interactions,
    test_percentage=0.5,
    random_state=np.random.RandomState(42),
)

In [11]:
# Hyper-parameters for the stand-alone collaborative-filtering model fitted in the next cells.
loss, no_components, epochs = 'bpr', 20, 20

In [12]:
# Pure collaborative-filtering model configured from the hyper-parameters above.
model_cf = LightFM(loss=loss, no_components=no_components)

In [13]:
# Train on the training half of the interactions.
model_cf.fit(interactions=train, epochs=epochs)

<lightfm.lightfm.LightFM at 0x7fd04be0c630>

In [14]:
def evaluate(train, test, loss = 'warp', no_components = 20, epochs =20, k = 10, random_state = None):
    """Fit a LightFM model without side features on `train` and score it on `test`.

    Parameters
    ----------
    train, test
        Interaction matrices used for fitting and for evaluation respectively.
    loss : str
        Loss name passed to ``LightFM``.
    no_components : int
        Latent dimensionality passed to ``LightFM``.
    epochs : int
        Number of training epochs.
    k : int
        Cut-off forwarded to ``precision_at_k``. The default of 10 matches the
        library default the original code relied on implicitly.
    random_state : int, numpy.random.RandomState or None
        Forwarded to ``LightFM`` so a run can be made reproducible. ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(auc, mean_precision_at_k, mean_reciprocal_rank)``, each the mean of
        the per-user scores.

    Notes
    -----
    The metric functions are called without ``train_interactions``, so items a
    user interacted with in `train` are not excluded from the rankings.
    """
    model_cf = LightFM(no_components=no_components, loss=loss, random_state=random_state)
    model_cf.fit(train, epochs=epochs)

    # Each metric function returns one score per user; report the mean.
    auc = np.mean(auc_score(model_cf, test))
    mean_pre = np.mean(precision_at_k(model_cf, test, k=k))
    mean_reciprocal_rank = np.mean(reciprocal_rank(model_cf, test))

    print('The AUC Score is:                   ',auc)
    print('The mean precision at k Score is:   ',mean_pre)
    print('The mean reciprocal rank is:        ',mean_reciprocal_rank)
    print('_______________________________________________________')
    return auc, mean_pre, mean_reciprocal_rank

In [16]:
# Baseline: collaborative filtering only, using the function's default settings.
evaluate(train=train, test=test)

The AUC Score is:                    0.91039073
The mean precision at k Score is:    0.032758366
The mean reciprocal rank is:         0.11456854
_______________________________________________________


(0.91039073, 0.032758366, 0.11456854)

### Hybrid model
Now let's see what we can get with a hybrid model. In our case, due to our user data being anonymized and the click history being time-insensitive, we can only work with additional information concerning the articles. In order to do this, we will first construct another sparse matrix for articles and their features:

In [17]:
# Preview the article metadata table.
news.head(n=5)

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [18]:
# Distinct article ids that occur in at least one user history.
articles_in_subset = user_article_interactions['article'].unique()
articles_in_subset.size

32827

In [19]:
# First column of `news`, compared against the article ids from the histories below.
articles_in_news = news[news.columns[0]]
articles_in_news.size

50434

In [20]:
# NOTE: despite its name, `articles_union` holds the INTERSECTION of the two id
# sets: articles that appear in `news` and also in some user history.
articles_union = set(articles_in_news).intersection(articles_in_subset)
len(articles_union)

32825

In [21]:
# Article ids that occur in user histories but are missing from `news`.
extra_in_behaviors = set(articles_in_subset).difference(articles_in_news)
len(extra_in_behaviors)

2

In [22]:
# Show the article ids that have no matching row in `news`.
extra_in_behaviors

{'N117002', 'N2325787'}

In [23]:
# Look up the first unmatched id in `news`.
news.loc[news['article_id'].eq('N117002')]

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities


In [24]:
# Look up the second unmatched id in `news`.
news.loc[news['article_id'].eq('N2325787')]

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities


In [26]:
# Metadata rows for the articles that also occur in user histories; copied so that
# adding columns later does not write into a view of `news`.
in_histories = news['article_id'].isin(articles_union)
news_articles_subset = news.loc[in_histories].copy()

In [27]:
# Row count of the filtered metadata table.
news_articles_subset.shape[0]

32825

In [28]:
# Wrap each article's category in a one-element list to form its feature list.
news_articles_subset['features'] = news_articles_subset['category'].map(lambda cat: [cat])

In [34]:
# Category labels restricted to the filtered articles.
# NOTE(review): this rebinds the name used earlier for `dataset.fit`; it is not
# referenced again in the visible cells below.
input_sparse_matrix_features_1 = news_articles_subset['category'].tolist()

In [35]:
# Keep only the article id and its feature list.
article_features = news_articles_subset.loc[:, ['article_id', 'features']]

In [36]:
# One (article_id, [category]) record per article.
input_sparse_matrix_features = [record for record in article_features.to_records(index=False)]

In [37]:
# Item-feature matrix built by the Dataset from the (article_id, features) records.
features = dataset.build_item_features(data=input_sparse_matrix_features, normalize=True)

In [42]:
def evaluate_hybrid(train, test, loss = 'warp', no_components = 20, epochs =20, item_features = None, k = 10, random_state = None):
    """Fit a LightFM model with item features on `train` and score it on `test`.

    Parameters
    ----------
    train, test
        Interaction matrices used for fitting and for evaluation respectively.
    loss : str
        Loss name passed to ``LightFM``.
    no_components : int
        Latent dimensionality passed to ``LightFM``.
    epochs : int
        Number of training epochs.
    item_features
        Item-feature matrix used for both fitting and scoring. When ``None``
        (the default) the notebook-level ``features`` variable is used, which
        is what the function previously always did.
    k : int
        Cut-off forwarded to ``precision_at_k``. The default of 10 matches the
        library default the original code relied on implicitly.
    random_state : int, numpy.random.RandomState or None
        Forwarded to ``LightFM`` so a run can be made reproducible. ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(auc, mean_precision_at_k, mean_reciprocal_rank)``, each the mean of
        the per-user scores.

    Notes
    -----
    The metric functions are called without ``train_interactions``, so items a
    user interacted with in `train` are not excluded from the rankings.
    """
    if item_features is None:
        # Backward-compatible fallback to the global built earlier in the notebook.
        item_features = features

    model_hybrid = LightFM(no_components=no_components, loss=loss, random_state=random_state)
    model_hybrid.fit(train, item_features=item_features, epochs=epochs)

    # The same feature matrix must be supplied at scoring time as at fit time.
    auc = np.mean(auc_score(model_hybrid, test, item_features=item_features))
    mean_pre = np.mean(precision_at_k(model_hybrid, test, k=k, item_features=item_features))
    mean_reciprocal_rank = np.mean(reciprocal_rank(model_hybrid, test, item_features=item_features))

    print('The AUC Score is:                   ',auc)
    print('The mean precision at k Score is:   ',mean_pre)
    print('The mean reciprocal rank is:        ',mean_reciprocal_rank)
    print('_______________________________________________________')
    return auc, mean_pre, mean_reciprocal_rank

In [None]:
# Hybrid model using the function's default settings.
evaluate_hybrid(train=train, test=test)

In [None]:
# Hybrid model with the BPR loss. The loss name must be spelled 'bpr';
# the previous 'brp' is not a loss LightFM recognises.
evaluate_hybrid(train, test, loss = 'bpr', no_components = 20, epochs =20)