# Running factorization machine

### We're likely running an FM instead of a matrix factorization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

import pickle
import random

### These are older files and will revisit

In [22]:
def read_g_obj(file="adj_matrices/G_hci.pkl"):
    with open(file, "rb") as pfile: 
        G = pickle.load(pfile)
    
    follows_at_least_10 = [person for person, out_degree in G.out_degree() if out_degree >= 10] 
    
    subgraph_hci = nx.subgraph(G, follows_at_least_10)
    
    return subgraph_hci

In [23]:
subgraph_hci = read_g_obj()

In [24]:
with open("train_test/test.pkl", "rb") as pfile:
    test = pickle.load(pfile)

In [25]:
with open("train_test/train.pkl", "rb") as pfile:
    train = pickle.load(pfile)

In [26]:
with open("train_test/anti_test.pkl", "rb") as pfile:
    anti_test = pickle.load(pfile)

### Factorization machine

In [7]:
import sys
!{sys.executable} -m pip install git+https://github.com/coreylynch/pyFM

Collecting git+https://github.com/coreylynch/pyFM
  Cloning https://github.com/coreylynch/pyFM to /tmp/pip-req-build-605cnacg
  Running command git clone --filter=blob:none --quiet https://github.com/coreylynch/pyFM /tmp/pip-req-build-605cnacg
  Resolved https://github.com/coreylynch/pyFM to commit 0696c980993889a9429e4ab0b6c7dc8be6dac4de
  Preparing metadata (setup.py) ... [?25ldone
[?25h

In [8]:
from sklearn.feature_extraction import DictVectorizer
from pyfm import pylibfm

#### Loading the data

In [9]:
def loadData(filename,path="", sample=1.0):
    data = []
    y = []
    users=set()
    items=set()
    with open(path+filename) as f:
        next(f)
        for line in f:
            (index,user,item,rating)=line.split(',')
            if random.random() <= sample:
                data.append({ "user": str(user), "item": str(item)})
                y.append(float(rating))
                users.add(user)
                items.add(item)

    return (data, np.array(y), users, items)

In [10]:
(train_data, y_train, train_users, train_items) = loadData("train_test/train_df.csv")

In [11]:
(test_data, y_test, test_users, test_items) = loadData("train_test/test_df.csv") #contains both test and anti-test

In [12]:
train_data[:5]

[{'user': 'cqz', 'item': 'jbigham'},
 {'user': 'cqz', 'item': 'ryanatkn'},
 {'user': 'cqz', 'item': 'axz'},
 {'user': 'cqz', 'item': 'msbernst'},
 {'user': 'cqz', 'item': 'qli'}]

In [13]:
test_data[:5]

[{'user': 'cqz', 'item': 'kentrellowens'},
 {'user': 'cqz', 'item': 'ruotongw'},
 {'user': 'cqz', 'item': 'schaferj'},
 {'user': 'Gillian', 'item': 'kgajos'},
 {'user': 'Gillian', 'item': 'andreaforte'}]

### This is needed to collect the data in fm format

This is constructing a full matrix where our rows are our users and the columns is every single pair of their connections. 
List of dictionaries that are user (person) and the items (every person that they could follow)

In [89]:
# TODO: Self-edges? Is this something we are addressing. We shouldn't have MattNicholson --> MattNicholson
from itertools import permutations

#convert the combination_result to sets
X_train_data_extended = [{'user': pair[0], 'item': pair[1]} for pair in permutations(list(train_users), 2)]
# X_train_data_extended_tuples = [(u, v) for u, v in permutations(train_users, 2)]


In [17]:
len(X_train_data_extended) #this contains all the possible edge options

759512

In [18]:
#check if value in train_data
{'user': 'cqz', 'item': 'jbigham'} in X_train_data_extended

True

### Produce our training examples 
These are all 1's or 0's and it is the length of all permutations in our dataset.

In [87]:
# tuple(X_train_data_extended_array[0].values())

In [88]:
# train[:5]

In [86]:
# Vectorizing this.
# If the pair is in our list of dictionaries, then that spot should be 1, otherwise it should be 0.
# X_train_data_extended_array = np.array(X_train_data_extended)
# X_train_data_extended_array[0]

In [85]:
# np.where([('edenshaveet', 'troutman'), ('cqz', 'jbigham')] in train)

In [84]:
# y_train = pd.Series(X_train_data_extended_tuples).apply(lambda pair: 1 if pair in train else 0).values

In [51]:
y_train_extended_list = []

for comb in X_train_data_extended:
    if comb in train_data:
        y_train_extended_list.append(1)
    else:
        y_train_extended_list.append(0)

In [57]:
len(y_train_extended_list)

759512

In [176]:
unique, counts = np.unique(y_train_extended_list, return_counts=True)
print(np.asarray((unique, counts)).T)

[[     0 737032]
 [     1  22480]]


In [178]:
# y_train_extended_list <-- This just produces a list with the values

In [58]:
print(len(y_train))
print(y_train_extended_list.count(1))

22480
22480


In [59]:
print(len(X_train_data_extended))
print(len(y_train_extended_list))

759512
759512


In [60]:
# convert to an array for pylib.FM
y_train_data_extended = np.array(y_train_extended_list, dtype='double')

In [62]:
with open('train_test/y_train_extended_list.pkl', 'wb') as f:
    pickle.dump(y_train_extended_list, f)
    
with open('train_test/X_train_data_extended.pkl', 'wb') as f:
    pickle.dump(X_train_data_extended, f)

In [None]:
x

#### Prepare the data

In [63]:
v = DictVectorizer()
X_train = v.fit_transform(X_train_data_extended)
X_test = v.transform(test_data)

In [64]:
X_train

<759512x1744 sparse matrix of type '<class 'numpy.float64'>'
	with 1519024 stored elements in Compressed Sparse Row format>

In [65]:
fm = pylibfm.FM (num_factors=10, 
                 num_iter=5, 
                 verbose=True, 
                 task="regression", 
                 initial_learning_rate=0.001, 
                 learning_rate_schedule="optimal")

fm.fit(X_train, y_train_data_extended)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.01336
-- Epoch 2
Training MSE: 0.01266
-- Epoch 3
Training MSE: 0.01254
-- Epoch 4
Training MSE: 0.01249
-- Epoch 5
Training MSE: 0.01245


In [71]:
preds = fm.predict(X_test)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, preds)

0.5

In [72]:
# with open('model_1.pkl', 'wb') as file:
#     pickle.dump(model, file)

### Evaluation

In [73]:
import sys
!{sys.executable} -m pip install surprise



In [74]:
from abc import ABC, abstractmethod
from collections import defaultdict
from surprise import SVD, Reader, Dataset, Prediction

In [75]:
class UserRecommendations:
    
    def __init__(self):
        self.recs = []
        
    def add_entry(self, entry):
        self.recs.append(entry)
        
    def select_top(self, k):
        self.recs = sorted(self.recs, key=lambda entry: entry[2], reverse=True)
        if len(self.recs) > k:
            self.recs = self.recs[0:k]
            
        
class TestRecommendations:
    
    def __init__(self):
        self.test_recs = defaultdict(UserRecommendations)
        
    def setup(self, preds, k):
        for entry in preds:
            user = entry.uid
            self.test_recs[user].add_entry(entry)
                   
        for user in self.test_recs.keys():
            self.test_recs[user].select_top(k)
            
    def add_entry(self, user, entry):
        self.test_recs[user].add_entry(entry)
        
    def select_top(self, user, k):
        self.test_recs[user].select_top(k)
            
    def iter_recs(self):
        for user in self.test_recs.keys():
            yield (user, self.test_recs[user].recs)


In [76]:
class Evaluator(ABC):
    
    def __init__(self):
        self.results_table = None
        self.score = None
        
    def setup(self, trainset, testset):
        pass
    
    @abstractmethod
    def evaluate_user(self, user, user_recs):
        pass
    
    def evaluate(self, test_recs: TestRecommendations):
        scores = []
        self.results_table = {}
        for user, recs in test_recs.iter_recs():
            score = self.evaluate_user(user, recs)
            scores.append(score)
            self.results_table[user] = score
        self.score = np.mean(scores)
        
class ItemwiseEvaluator(Evaluator):
    
    def __init__(self):
        super().__init__()
    
    def evaluate_user(self, user, user_recs):
        return np.mean([self.evaluate_pred(rec) for rec in user_recs])
        
    @abstractmethod
    def evaluate_pred(self, pred):
        pass
    
class ListwiseEvaluator(Evaluator):
    
    def __init__(self):
        super().__init__()
    
    @abstractmethod
    def evaluate_user(self, user, user_recs):
        pass

In [77]:
class NDCGEvaluator(ListwiseEvaluator):
    
    def __init__(self, k):
        super().__init__()
        self.rated_table = defaultdict(set)
        self.idcg_table = {}
        self.log_table = {}
        self.list_len = k
    
    def setup(self, trainset, testset):
        for entry in testset:
            self.rated_table[entry['user']].add(entry['item'])
        idcg = 0
        for i in range(0, self.list_len+1):
            self.idcg_table[i] = idcg
            rank_utility = 1 / np.log(i+2)
            self.log_table[i] = rank_utility
            idcg += rank_utility
            
    
    def evaluate_user(self, user, user_recs): 
        
        dcg = 0.0
        for i, pred in enumerate(user_recs):
            if pred[1] in self.rated_table[user]:
                dcg = self.log_table[i]
        
        idcg = 0
        if len(self.rated_table[user]) >= self.list_len:
               idcg = self.idcg_table[self.list_len]
        else:
               idcg = self.idcg_table[len(self.rated_table[user])]
            
        if idcg == 0:
            return 0
        return dcg/idcg
                

In [78]:
class PrecisionEvaluator(ItemwiseEvaluator):
    
    def __init__(self):
        super().__init__() # calling parent class
        self.rated_table = defaultdict(set) # table to store the rated items
    
    
    def setup(self,trainset, testset):
        for edge in testset:
                self.rated_table[edge['user']].add(edge['item']) # add the item to the table of the things the user likes
        # now we should have a table from the test set with all the rated items from 
                
    def evaluate_pred(self, pred: Prediction):
        # now we calculate the percision per list per user then we get the average for all the lists
        # number of items that are relevant (in self.rated table) / number of recommended items     
        if pred[1] in self.rated_table[pred[0]]: #check if the item exists in the list
            return 1
        else:
            return 0

In [79]:
random.seed(20231110)

def create_prediction_profiles(test_data, train_items, predict_list_len, frac=1.0):
    train_items_lst = list(train_items) # Can't sample from a set
    user_test_profile = defaultdict(set) # add the test data
    for entry in test_data:
        user_id = entry['user']
        item_id = entry['item']
        user_test_profile[user_id].add(item_id)
    
    test_users = list(user_test_profile.keys()) # sample from the test users
    
    test_users_select = random.sample(test_users, int(frac*len(test_users))) # create a big set and add the test users
                      
    user_predict_profile = {}
    
    for user in test_users_select:
        profile = user_test_profile[user]
        sample_items = list(random.sample(train_items_lst, predict_list_len + len(profile)))
        sample_items = sample_items + list(profile)
        user_predict_profile[user] = sample_items
        
    return user_predict_profile
    
# creating a list of recommendations for a user 
def create_test_recommendations(predict_fn, vectorizer, test_data, list_len, train_items, predict_list_len, frac=1.0):
    user_predict_profile = create_prediction_profiles(test_data, train_items, predict_list_len, frac)
    
    trecs = TestRecommendations()
    
    # for all the usrs and items in the profile
    for user, profile in user_predict_profile.items():
        
        for item in profile:
            x_test = vectorizer.transform({'user': user, 'item': item})
            pred = predict_fn(x_test)[0]
            trecs.add_entry(user, (user, item, pred))
        trecs.select_top(user, list_len)
        
    return trecs # return recommendations list for each user

In [194]:
list_len = 5 # number of recommendations to return
predict_list_len = 100
frac = 0.1

In [195]:
test_recs = create_test_recommendations(fm.predict, v, test_data, list_len, train_items, predict_list_len, frac)
# pred_recs = create_prediction_profiles(test_data, train_items, predict_list_len, frac)

In [196]:
# What does the test_recs look like?
list(test_recs.iter_recs())[:3]

[('ham',
  [('ham', 'axz', 0.5151642703001148),
   ('ham', 'andresmh', 0.4347132074939981),
   ('ham', 'cfiesler', 0.42947047704163904),
   ('ham', 'bkeegan', 0.3471545882487692),
   ('ham', 'drmaxlwilson', 0.19808450787808898)]),
 ('annetropy',
  [('annetropy', 'cfiesler', 0.3993674905964302),
   ('annetropy', 'bkeegan', 0.27413120788861367),
   ('annetropy', 'Niloufar', 0.12225813448914655),
   ('annetropy', 'nbhansen', 0.0617164565985954),
   ('annetropy', 'richmondywong', 0.03113286439285083)]),
 ('chitalyconf',
  [('chitalyconf', 'andresmh', 0.6080064646330265),
   ('chitalyconf', 'andresmh', 0.6080064646330265),
   ('chitalyconf', 'cfiesler', 0.5661052325328114),
   ('chitalyconf', 'msbernst', 0.4847617672063858),
   ('chitalyconf', 'msbernst', 0.4847617672063858)])]

In [197]:
ndcg = NDCGEvaluator(10)

In [198]:
ndcg.setup(train_data, test_data)

In [199]:
ndcg.evaluate(test_recs)

In [200]:
ndcg.score

0.12164267826980209

In [201]:
precision = PrecisionEvaluator()

In [202]:
precision.setup(train_data, test_data)

In [203]:
list(test_recs.iter_recs())[:3]

[('ham',
  [('ham', 'axz', 0.5151642703001148),
   ('ham', 'andresmh', 0.4347132074939981),
   ('ham', 'cfiesler', 0.42947047704163904),
   ('ham', 'bkeegan', 0.3471545882487692),
   ('ham', 'drmaxlwilson', 0.19808450787808898)]),
 ('annetropy',
  [('annetropy', 'cfiesler', 0.3993674905964302),
   ('annetropy', 'bkeegan', 0.27413120788861367),
   ('annetropy', 'Niloufar', 0.12225813448914655),
   ('annetropy', 'nbhansen', 0.0617164565985954),
   ('annetropy', 'richmondywong', 0.03113286439285083)]),
 ('chitalyconf',
  [('chitalyconf', 'andresmh', 0.6080064646330265),
   ('chitalyconf', 'andresmh', 0.6080064646330265),
   ('chitalyconf', 'cfiesler', 0.5661052325328114),
   ('chitalyconf', 'msbernst', 0.4847617672063858),
   ('chitalyconf', 'msbernst', 0.4847617672063858)])]

In [204]:
precision.evaluate(test_recs)

In [205]:
precision.score

0.4919540229885057

### Cross-validatoin

In [208]:
with open("train_test/new_folds/train-0.pkl", "rb") as pfile: 
        lol = pickle.load(pfile)

In [209]:
lol

[('cbuntain', 'stevie'),
 ('cbuntain', 'diyiyang'),
 ('cbuntain', 'joelchan86'),
 ('cbuntain', 'sigchi'),
 ('cbuntain', 'Mor'),
 ('cbuntain', 'axz'),
 ('cbuntain', 'Emrek'),
 ('cbuntain', 'foucaultwelles'),
 ('cbuntain', 'hanlin'),
 ('cbuntain', 'emma_lurie'),
 ('cbuntain', 'gligoric'),
 ('cbuntain', 'manoel'),
 ('cbuntain', 'ndiakopoulos'),
 ('cbuntain', 'cfiesler'),
 ('cbuntain', 'bkeegan'),
 ('priyakalot', 'smunson'),
 ('priyakalot', 'carl'),
 ('priyakalot', 'asarif'),
 ('priyakalot', 'masmart'),
 ('priyakalot', 'dggoldst'),
 ('priyakalot', 'raul'),
 ('priyakalot', 'sunniesuhyoung'),
 ('priyakalot', 'Mor'),
 ('priyakalot', 'kgajos'),
 ('priyakalot', 'ndiakopoulos'),
 ('priyakalot', 'sohw'),
 ('priyakalot', 'Emrek'),
 ('priyakalot', 'emma_lurie'),
 ('priyakalot', 'carolinerpitt'),
 ('priyakalot', 'jbigham'),
 ('priyakalot', 'eglassman'),
 ('priyakalot', 'adam'),
 ('priyakalot', 'zbucinca'),
 ('ak', 'floe'),
 ('ak', 'chiwork'),
 ('ak', 'gruenefeld'),
 ('ak', 'wallace'),
 ('ak', 'Erins

In [143]:
def loadData(file, sample=1.0):
    data = []
    y = []
    users=set()
    items=set()
    for line in file.values:
        (user,item,rating)= line[0],line[1],line[2]
        if random.random() <= sample:
            data.append({ "user": str(user), "item": str(item)})
            y.append(float(rating))
            users.add(user)
            items.add(item)

    return (data, np.array(y), users, items)

In [144]:
def create_train_test_df(train, test, anti_test):  
    train_data_extended = [(follower, following, 1) for follower, following in train]
    test_data_extended = [(follower, following, 1) for follower, following in test]
    anti_test_data_extended = [(follower, following, 0) for follower, following in anti_test]
    
    train_df = pd.DataFrame(
        train_data_extended, columns=["user", "item", "rating"]
    )
    # train_df.to_csv('train_df.csv')
    
    test_data_extended.extend(anti_test_data_extended)
    test_df = pd.DataFrame(test_data_extended, columns=["user", "item", "rating"])
    # test_df.to_csv('test_df.csv')
    
    return train_df, test_df

In [185]:
def extend_data(train, test, train_users):
        
    X_train_data_extended = [{'user': pair[0], 'item': pair[1]} for pair in permutations(list(train_users), 2)]
    
    y_train_extended_list = []

    # This was previously `comb[1]` instead of `comb`. 
    for comb in X_train_data_extended:
        if comb in train:
            y_train_extended_list.append(1)
        else:
            y_train_extended_list.append(0)
            
    y_train_data_extended = np.array(y_train_extended_list, dtype='double')
    
    return X_train_data_extended, y_train_data_extended

In [189]:
hci_ndcg_scores = []
hci_precision_scores = []
k = 5

list_len = 5 # number of recommendations to return
predict_list_len = 100
frac = 0.1

fm = pylibfm.FM (num_factors=10, 
                 num_iter=2, 
                 verbose=True, 
                 task="regression", 
                 initial_learning_rate=0.001, 
                 learning_rate_schedule="optimal")

for fold in ["-0", "-1", "-2", "-3", "-4"]:
    
    print("Starting fold{}".format(fold))
    
    with open("train_test/new_folds/train{}.pkl".format(fold), "rb") as pfile: 
        this_fold_train = pickle.load(pfile)
    
    with open("train_test/new_folds/test{}.pkl".format(fold), "rb") as pfile: 
        this_fold_test = pickle.load(pfile)
        
    with open("train_test/new_folds/anti_test{}.pkl".format(fold), "rb") as pfile: 
        this_fold_anti_test = pickle.load(pfile)
    
    print("done reading the data")
    
    
    #prepare the data
    train_data, test_data = create_train_test_df(this_fold_train, this_fold_test, this_fold_anti_test)

    #load data
    (train_data, y_train, train_users, train_items) = loadData(train_data)
    (test_data, y_test, test_users, test_items) = loadData(test_data)
    
    #extending data
    X_train_data_extended, y_train_data_extended = extend_data(train_data, test_data, train_users)
    
    print("done extending the data")
    
    #vectorize
    v = DictVectorizer()
    X_train = v.fit_transform(X_train_data_extended)
    X_test = v.transform(test_data)
    
    #train the model
    fm.fit(X_train, y_train_data_extended)
    
    preds = fm.predict(X_test)
    test_recs = create_test_recommendations(fm.predict, v, test_data, list_len, train_items, predict_list_len, frac)
    
    print("done making matrix factorization recs")
    
    # --- evaluate ----
    
    # NDCG
    this_fold_ndcg_hci = NDCGEvaluator(k=k)
    this_fold_ndcg_hci.setup(trainset=train_data, testset=test_data)
    this_fold_ndcg_hci.evaluate(test_recs)
    hci_ndcg_scores.append(this_fold_ndcg_hci.score)
    
    # Precision
    this_fold_precision_hci = PrecisionEvaluator()
    this_fold_precision_hci.setup(trainset=train_data, testset=test_data)
    this_fold_precision_hci.evaluate(test_recs)
    hci_precision_scores.append(this_fold_precision_hci.score)
    
    
    print("NDCG: {} \t Precision: {}".format(this_fold_ndcg_hci.score, this_fold_precision_hci.score))

Starting fold-0
done reading the data
[[0.00000e+00 3.28128e+05]
 [1.00000e+00 1.70280e+04]]
done extending the data
Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.02207
-- Epoch 2
Training MSE: 0.02083
done making matrix factorization recs
NDCG: 0.1573651345170828 	 Precision: 0.5758620689655174
Starting fold-1
done reading the data
[[0.00000e+00 3.28128e+05]
 [1.00000e+00 1.70280e+04]]
done extending the data
Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.02205
-- Epoch 2
Training MSE: 0.02080
done making matrix factorization recs
NDCG: 0.17000976711828034 	 Precision: 0.5103448275862068
Starting fold-2
done reading the data
[[0.00000e+00 3.28128e+05]
 [1.00000e+00 1.70280e+04]]
done extending the data
Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.02199
-- Epoch 2
Training MSE: 0.02072
done making matrix factorization

In [192]:
hci_ndcg_scores

[0.1573651345170828,
 0.17000976711828034,
 0.13696784158147554,
 0.15404336751285136,
 0.14869456855448718]

In [193]:
hci_precision_scores

[0.5758620689655174,
 0.5103448275862068,
 0.5344827586206896,
 0.5103448275862069,
 0.5310344827586208]