In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

import pickle
import random

In [4]:
# def read_g_obj(file="adj_matrices/G_hci.pkl"):
#     with open(file, "rb") as pfile: 
#         G = pickle.load(pfile)
    
#     follows_at_least_10 = [person for person, out_degree in G.out_degree() if out_degree >= 10] 
    
#     subgraph_hci = nx.subgraph(G, follows_at_least_10)
    
#     return subgraph_hci

In [5]:
# subgraph_hci = read_g_obj()

In [6]:
# with open("train_test/test.pkl", "rb") as pfile:
#     test = pickle.load(pfile)

In [7]:
# with open("train_test/train.pkl", "rb") as pfile:
#     train = pickle.load(pfile)

In [8]:
# with open("train_test/anti_test.pkl", "rb") as pfile:
#     anti_test = pickle.load(pfile)

### Factorization machine

In [2]:
import sys
!{sys.executable} -m pip install git+https://github.com/coreylynch/pyFM

Collecting git+https://github.com/coreylynch/pyFM
  Cloning https://github.com/coreylynch/pyFM to /tmp/pip-req-build-tnr7fyx2
  Running command git clone --filter=blob:none --quiet https://github.com/coreylynch/pyFM /tmp/pip-req-build-tnr7fyx2
  Resolved https://github.com/coreylynch/pyFM to commit 0696c980993889a9429e4ab0b6c7dc8be6dac4de
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pyfm
  Building wheel for pyfm (setup.py) ... [?25ldone
[?25h  Created wheel for pyfm: filename=pyfm-0.0.0-cp310-cp310-linux_x86_64.whl size=65075 sha256=561d2006c161baad535d1ea595d9074af0745edacd9b72c6a6aba5e8822874f0
  Stored in directory: /tmp/pip-ephem-wheel-cache-t5rdcdj9/wheels/fa/5d/da/7f914f89db79e7442033d9c67bff7973fc17b514b7f379a4f7
Successfully built pyfm
Installing collected packages: pyfm
Successfully installed pyfm-0.0.0


In [3]:
from sklearn.feature_extraction import DictVectorizer
from pyfm import pylibfm

#### Loading the data

In [4]:
def loadData(filename,path="", sample=1.0):
    data = []
    y = []
    users=set()
    items=set()
    with open(path+filename) as f:
        next(f)
        for line in f:
            (index,user,item,rating)=line.split(',')
            if random.random() <= sample:
                data.append({ "user": str(user), "item": str(item)})
                y.append(float(rating))
                users.add(user)
                items.add(item)

    return (data, np.array(y), users, items)

In [5]:
(train_data, y_train, train_users, train_items) = loadData("train_df.csv")

In [6]:
(test_data, y_test, test_users, test_items) = loadData("test_df.csv") #contains both test and anti-test

In [7]:
train_data[:5]

[{'user': 'cqz', 'item': 'jbigham'},
 {'user': 'cqz', 'item': 'ryanatkn'},
 {'user': 'cqz', 'item': 'axz'},
 {'user': 'cqz', 'item': 'msbernst'},
 {'user': 'cqz', 'item': 'qli'}]

In [8]:
test_data[:5]

[{'user': 'cqz', 'item': 'kentrellowens'},
 {'user': 'cqz', 'item': 'ruotongw'},
 {'user': 'cqz', 'item': 'schaferj'},
 {'user': 'Gillian', 'item': 'kgajos'},
 {'user': 'Gillian', 'item': 'andreaforte'}]

In [24]:
from itertools import permutations

#convert the combination_result to sets
X_train_data_extended = [{'user': pair[0], 'item': pair[1]} for pair in permutations(list(train_users), 2)]

In [25]:
len(X_train_data_extended) #this contains all the possible edge options

759512

In [26]:
#check if value in train_data
{'user': 'cqz', 'item': 'jbigham'} in X_train_data_extended

True

In [32]:
y_train_extended_list = []

for comb in X_train_data_extended:
    if comb in train_data:
        y_train_extended_list.append(1)
    else:
        y_train_extended_list.append(0)

In [33]:
len(y_train_extended_list)

759512

In [39]:
print(len(y_train))
print(y_train_extended_list.count(1))

22480
22480


In [48]:
print(len(X_train_data_extended))
print(len(y_train_extended_list))

759512
759512


In [40]:
# convert to an array for pylib.FM
y_train_data_extended = np.array(y_train_extended_list, dtype='double')

#### Prepare the data

In [42]:
v = DictVectorizer()
X_train = v.fit_transform(X_train_data_extended)
X_test = v.transform(test_data)

In [43]:
fm = pylibfm.FM (num_factors=10, 
                 num_iter=10, 
                 verbose=True, 
                 task="regression", 
                 initial_learning_rate=0.001, 
                 learning_rate_schedule="optimal")

fm.fit(X_train, y_train_data_extended)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.01335
-- Epoch 2
Training MSE: 0.01265
-- Epoch 3
Training MSE: 0.01253
-- Epoch 4
Training MSE: 0.01247
-- Epoch 5
Training MSE: 0.01244
-- Epoch 6
Training MSE: 0.01241
-- Epoch 7
Training MSE: 0.01239
-- Epoch 8
Training MSE: 0.01237
-- Epoch 9
Training MSE: 0.01236
-- Epoch 10
Training MSE: 0.01234


In [45]:
preds = fm.predict(X_test)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, preds)

0.8096168309678552

In [None]:
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

### Evaluation

In [52]:
from abc import ABC, abstractmethod
from collections import defaultdict

In [86]:
class UserRecommendations:
    
    def __init__(self):
        self.recs = []
        
    def add_entry(self, entry):
        self.recs.append(entry)
        
    def select_top(self, k):
        self.recs = sorted(self.recs, key=lambda entry: entry[2], reverse=True)
        if len(self.recs) > k:
            self.recs = self.recs[0:k]
            
        
class TestRecommendations:
    
    def __init__(self):
        self.test_recs = defaultdict(UserRecommendations)
        
    def setup(self, preds, k):
        for entry in preds:
            user = entry.uid
            self.test_recs[user].add_entry(entry)
                   
        for user in self.test_recs.keys():
            self.test_recs[user].select_top(k)
            
    def add_entry(self, user, entry):
        self.test_recs[user].add_entry(entry)
        
    def select_top(self, user, k):
        self.test_recs[user].select_top(k)
            
    def iter_recs(self):
        for user in self.test_recs.keys():
            yield (user, self.test_recs[user].recs)


In [87]:
class Evaluator(ABC):
    
    def __init__(self):
        self.results_table = None
        self.score = None
        self.m_score = None
        self.f_score = None
        
    def setup(self, trainset, testset):
        pass
    
    @abstractmethod
    def evaluate_user(self, user, user_recs):
        pass
    
    def evaluate(self, test_recs: TestRecommendations):
        scores = []
        self.results_table = {}
        for user, recs in test_recs.iter_recs():
            score = self.evaluate_user(user, recs)
            scores.append(score)
            self.results_table[user] = score
        self.score = np.mean(scores)
    
    
class ItemwiseEvaluator(Evaluator):
    
    def __init__(self):
        super().__init__()
    
    def evaluate_user(self, user, user_recs):
        return np.mean([self.evaluate_pred(rec) for rec in user_recs])
        
    @abstractmethod
    def evaluate_pred(self, pred):
        pass
    
class ListwiseEvaluator(Evaluator):
    
    def __init__(self):
        super().__init__()
    
    @abstractmethod
    def evaluate_user(self, user, user_recs):
        pass

In [88]:
class NDCGEvaluator(ListwiseEvaluator):
    
    def __init__(self, k):
        super().__init__()
        self.rated_table = defaultdict(set)
        self.idcg_table = {}
        self.log_table = {}
        self.list_len = k
        self.users_g_lookup = dict()
    
    # compute idcg
    def setup(self, trainset, testset):
        for entry in testset:
            self.rated_table[entry['user']].add(entry['item'])
        idcg = 0
        for i in range(0, self.list_len+1):
            self.idcg_table[i] = idcg
            rank_utility = 1 / np.log(i+2)
            self.log_table[i] = rank_utility
            idcg += rank_utility

    
    def evaluate_user(self, user, user_recs):  #called by evalueate
        
        dcg = 0.0
        for i, pred in enumerate(user_recs):
            if pred[1] in self.rated_table[user]:
                dcg = self.log_table[i]
        
        idcg = 0
        if len(self.rated_table[user]) >= self.list_len:
               idcg = self.idcg_table[self.list_len]
        else:
               idcg = self.idcg_table[len(self.rated_table[user])]
            
        if idcg == 0:
            return 0
        return dcg/idcg
                

In [89]:
import random
random.seed(20231110)

def create_prediction_profiles(test_data, train_items, predict_list_len, frac=1.0):
    train_items_lst = list(train_items) # Can't sample from a set
    user_test_profile = defaultdict(set) # add the test data
    for entry in test_data:
        user_id = entry['user']
        item_id = entry['item']
        user_test_profile[user_id].add(item_id)
    
    test_users = list(user_test_profile.keys()) # sample from the test users
    
    test_users_select = random.sample(test_users, int(frac*len(test_users))) # create a big set and add the test users
                      
    user_predict_profile = {}
    
    for user in test_users_select:
        profile = user_test_profile[user]
        sample_items = list(random.sample(train_items_lst, predict_list_len + len(profile)))
        sample_items = sample_items + list(profile)
        user_predict_profile[user] = sample_items
        
    return user_predict_profile
    
# creating a list of recommendations for a user 
def create_test_recommendations(predict_fn, vectorizer, test_data, list_len, train_items, predict_list_len, frac=1.0):
    user_predict_profile = create_prediction_profiles(test_data, train_items, predict_list_len, frac)
    
    trecs = TestRecommendations()
    
    # for all the usrs and items in the profile
    for user, profile in user_predict_profile.items():
        for item in profile:
            x_test = vectorizer.transform({'user': user, 'item': item})
            pred = predict_fn.predict(x_test)[0]
            trecs.add_entry(user, (user, item, pred))
        trecs.select_top(user, list_len)
        
    return trecs # return recommendations list for each user

In [90]:
list_len = 10 # number of recommendations to return
predict_list_len = 100
frac = 0.1

In [91]:
test_recs = create_test_recommendations(fm, v, test_data, list_len, train_items, predict_list_len, frac)

In [92]:
list(test_recs.iter_recs())[:3]

[('ham',
  [('ham', 'axz', 0.5428026143777659),
   ('ham', 'andresmh', 0.45756005703787817),
   ('ham', 'andresmh', 0.45756005703787817),
   ('ham', 'jordant', 0.1921773300889609),
   ('ham', 'depstein', 0.08467367505747984),
   ('ham', 'reviewer2', 0.08235060794027185),
   ('ham', 'haimson', 0.0788232255466597),
   ('ham', 'juhokim', 0.07647523955884497),
   ('ham', 'franziroesner', 0.06688551829899246),
   ('ham', 'oulasvirta', 0.06682682010594126)]),
 ('annetropy',
  [('annetropy', 'cfiesler', 0.4069386188927509),
   ('annetropy', 'sigchi', 0.29718505658538735),
   ('annetropy', 'carolinerpitt', 0.10750016370876184),
   ('annetropy', 'katta', 0.09140284062502697),
   ('annetropy', 'alextaylor', 0.07705169029665822),
   ('annetropy', 'upol', 0.062462675771883996),
   ('annetropy', 'justin', 0.06144226147565379),
   ('annetropy', 'juchidiuno', 0.05779938881773508),
   ('annetropy', 'garreth', 0.05508062036638522),
   ('annetropy', 'landay', 0.04530548045091126)]),
 ('chitalyconf',
  [

In [93]:
ndcg = NDCGEvaluator(10)

In [94]:
ndcg.setup(train_data, test_data)

In [95]:
ndcg.evaluate(test_recs)

In [96]:
ndcg.score

0.09090720906211562