In [2]:
import pandas as pd
import numpy as np
import heapq

from lightfm import LightFM
from lightfm.data import Dataset
import lightfm as lm
from lightfm import cross_validation 

import warnings
warnings.filterwarnings('ignore')

from lightfmHelper import evaluate



In [None]:
uai = pd.read_csv('small_train.csv')

In [1]:
uai.head()

NameError: name 'uai' is not defined

In [20]:
dataset_cf = Dataset()
dataset_cf.fit(uai['user_id_code'], uai['article_id_code'])

In [36]:
uai_array = uai.to_numpy()

In [49]:
interactions, weights = dataset_cf.build_interactions(
    (ua[2], ua[3]) for ua in uai_array
)

In [50]:
train, test = cross_validation.random_train_test_split(
    interactions, test_percentage=0.5, 
    random_state=np.random.RandomState(42)
)

In [51]:
loss = 'warp'
no_components = 20
epochs = 20

In [52]:
model_cf = LightFM(no_components=no_components, loss=loss)
model_cf.fit(train, epochs=epochs)

<lightfm.lightfm.LightFM at 0x7f8d1e6e8fd0>

In [53]:
result = evaluate(model_cf, train, test)

The AUC Score is in training/validation:                  0.98652005  /  0.90363157
The mean precision at k Score in training/validation is:  0.057673965  /  0.034512468
The mean reciprocal rank in training/validation is:       0.19362602  /  0.121272065
_________________________________________________________


In [54]:
cf_result = result

In [21]:
test_filename = "small_test.csv"
test_positives = []

with open(test_filename, "r") as f:
    header = f.readline()
    print(header)
    line = f.readline()
    print(line)
    while line != None and line != "":
        line_list = line.split(",")
        #print(line_list)
        user, article = int(line_list[2]), int(line_list[3])
        #print(user, article)                                            
        test_positives.append([user, article])
        line = f.readline()

user_id,article,user_id_code,article_id_code

U13740,N31801,1810,11677



In [56]:
len(test_positives), test_positives[0]

(39846, [1810, 11677])

In [22]:
test_neg_filename = "small_test_negatives.tsv"
test_negatives = []

with open(test_neg_filename, "r") as f:
    line = f.readline()
    while line != None and line != "":
        line_list = line.split("\t")
        #print(line_list)
        negatives = []
        for neg in line_list[1: ]:
            negatives.append(int(neg))
        test_negatives.append(negatives)
        line = f.readline()

In [58]:
len(test_negatives)

39846

In [23]:
K = 10

In [24]:
def eval_one_rating(idx, model):
    user = test_positives[idx][0]
    pos_item = test_positives[idx][1]
    items = test_negatives[idx]
    items.append(pos_item)
    
    # Get prediction score
    map_item_score = {}
    user_array = np.full(len(items), user, dtype = 'int32')
    predictions = model.predict(user_array, np.array(items))
    for i in range(len(items)):
        item = items[i]
        map_item_score[item] = predictions[i]
    
    items.pop()
    
    # Evaluate top rank list
    ranklist = heapq.nlargest(K, map_item_score, key=map_item_score.get)
    
    if pos_item in ranklist:
        hr = 1
        i = ranklist.index(pos_item)
        ndcg = np.log(2) / np.log(i+2)
        rr = 1/(i+1)
    else:
        hr = 0
        ndcg = 0
        rr = 0
   
    return (hr, ndcg, rr)

In [61]:
hits, ndcgs, rrs = [], [], []
for idx in range(len(test_positives)):
    hr, ndcg, rr = eval_one_rating(idx, model_cf)
    hits.append(hr)
    ndcgs.append(ndcg)
    rrs.append(rr)

In [62]:
hr = np.array(hits).mean()

In [63]:
hr

0.07795010791547458

In [64]:
mrr = np.array(rrs).mean()

In [65]:
mrr

0.023214833458019727

In [66]:
ndcg = np.array(ndcgs).mean()

In [67]:
ndcg

0.03568273165655035

## Hybrid Model

In [47]:
LOSS = "warp"
ITEM_ALPHA = 0.0001
EPOCHS = 20
NO_COMPONENTS = 20

In [26]:
news = pd.read_csv("../../../data/mind_small_train/news_processed.csv")

In [27]:
news_categories = news.category.unique().tolist()

In [28]:
news.head(2)

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."


In [29]:
news_categories

['lifestyle',
 'health',
 'news',
 'sports',
 'weather',
 'entertainment',
 'autos',
 'travel',
 'foodanddrink',
 'tv',
 'finance',
 'movies',
 'video',
 'music',
 'kids',
 'middleeast',
 'northamerica']

In [30]:
article_cat_dict = {}
for row in news.values:
    art, cat = row[0], row[1]
    article_cat_dict[art] = cat
    
for art in ['N2325787', 'N117002']:
    article_cat_dict[art] = "none"

In [31]:
news_categories.append("none")

In [32]:
article_categories = [article_cat_dict[art] for art in uai.article]

In [33]:
dataset_hybrid = Dataset()
dataset_hybrid.fit(uai['user_id_code'], 
                   uai['article_id_code'],
                   item_features=news_categories)

In [34]:
item_features = dataset_hybrid.build_item_features(
    (art_id, [art_category]) for art_id, art_category 
    in zip(uai.article_id_code, article_categories))

In [37]:
interactions_hybrid, weights_hybrid = dataset_hybrid.build_interactions(
    (ua[2], ua[3]) for ua in uai_array)

In [51]:
train_hybrid, test_hybrid = cross_validation.random_train_test_split(
    interactions_hybrid, test_percentage=0.5,
    random_state=np.random.RandomState(42))

In [52]:
model_hybrid = LightFM(no_components=NO_COMPONENTS, 
                       loss=LOSS,
                       item_alpha=ITEM_ALPHA)

model_hybrid.fit(train_hybrid, 
                 item_features=item_features,
                 epochs=EPOCHS)

<lightfm.lightfm.LightFM at 0x7ffa9b7bccc0>

In [53]:
result_hybrid = evaluate(model_hybrid, train_hybrid, test_hybrid, 
                         hybrid=True, features=item_features)

The AUC Score is in training/validation:                  0.9485936  /  0.79966867
The mean precision at k Score in training/validation is:  0.04742687  /  0.021509968
The mean reciprocal rank in training/validation is:       0.19647242  /  0.08661054
_________________________________________________________


In [56]:
hits, ndcgs, rrs = [], [], []
for idx in range(len(test_positives)):
    hr, ndcg, rr = eval_one_rating_hybrid(idx, model_hybrid)
    hits.append(hr)
    ndcgs.append(ndcg)
    rrs.append(rr)

In [57]:
mrr = np.array(rrs).mean()

In [58]:
mrr

0.023362415537916216

In [55]:
def eval_one_rating_hybrid(idx, model):
    user = test_positives[idx][0]
    pos_item = test_positives[idx][1]
    items = test_negatives[idx]
    items.append(pos_item)
    
    # Get prediction score
    map_item_score = {}
    user_array = np.full(len(items), user, dtype = 'int32')
    predictions = model.predict(user_array, np.array(items),
                                item_features=item_features)
    for i in range(len(items)):
        item = items[i]
        map_item_score[item] = predictions[i]
    
    items.pop()
    
    # Evaluate top rank list
    ranklist = heapq.nlargest(K, map_item_score, key=map_item_score.get)
    
    if pos_item in ranklist:
        hr = 1
        i = ranklist.index(pos_item)
        ndcg = np.log(2) / np.log(i+2)
        rr = 1/(i+1)
    else:
        hr = 0
        ndcg = 0
        rr = 0
   
    return (hr, ndcg, rr)