In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

# !pip3 install pickle5
# import pickle5 as pickle

import pickle
import random

In [26]:
def read_g_obj(file="adj_matrices/G_hci.pkl"):
    with open(file, "rb") as pfile: 
        G = pickle.load(pfile)
    
    follows_at_least_10 = [person for person, out_degree in G.out_degree() if out_degree >= 10] 
    
    subgraph_hci = nx.subgraph(G, follows_at_least_10)
    
    return subgraph_hci

In [27]:
subgraph_hci = read_g_obj()

In [28]:
with open("train_test/test.pkl", "rb") as pfile:
    test = pickle.load(pfile)

In [29]:
with open("train_test/train.pkl", "rb") as pfile:
    train = pickle.load(pfile)

In [30]:
with open("train_test/anti_test.pkl", "rb") as pfile:
    anti_test = pickle.load(pfile)

### Factorization machine

In [31]:
import sys
!{sys.executable} -m pip install git+https://github.com/coreylynch/pyFM

Collecting git+https://github.com/coreylynch/pyFM
  Cloning https://github.com/coreylynch/pyFM to /tmp/pip-req-build-d73aggyz
  Running command git clone --filter=blob:none --quiet https://github.com/coreylynch/pyFM /tmp/pip-req-build-d73aggyz
  Resolved https://github.com/coreylynch/pyFM to commit 0696c980993889a9429e4ab0b6c7dc8be6dac4de
  Preparing metadata (setup.py) ... [?25ldone
[?25h

In [59]:
from sklearn.feature_extraction import DictVectorizer
from surprise import Prediction
from pyfm import pylibfm

#### Loading the data

In [35]:
def loadData(filename,path="", sample=1.0):
    data = []
    y = []
    users=set()
    items=set()
    with open(path+filename) as f:
        next(f)
        for line in f:
            (index,user,item,rating)=line.split(',')
            if random.random() <= sample:
                data.append({ "user": str(user), "item": str(item)})
                y.append(float(rating))
                users.add(user)
                items.add(item)

    return (data, np.array(y), users, items)

In [36]:
(train_data, y_train, train_users, train_items) = loadData("train_df.csv")

In [37]:
(test_data, y_test, test_users, test_items) = loadData("test_df.csv") #contains both test and anti-test

In [38]:
train_data[:5]

[{'user': 'cqz', 'item': 'jbigham'},
 {'user': 'cqz', 'item': 'ryanatkn'},
 {'user': 'cqz', 'item': 'axz'},
 {'user': 'cqz', 'item': 'msbernst'},
 {'user': 'cqz', 'item': 'qli'}]

In [39]:
test_data[:5]

[{'user': 'cqz', 'item': 'kentrellowens'},
 {'user': 'cqz', 'item': 'ruotongw'},
 {'user': 'cqz', 'item': 'schaferj'},
 {'user': 'Gillian', 'item': 'kgajos'},
 {'user': 'Gillian', 'item': 'andreaforte'}]

In [40]:
from itertools import permutations

#convert the combination_result to sets
X_train_data_extended = [{'user': pair[0], 'item': pair[1]} for pair in permutations(list(train_users), 2)]

In [41]:
len(X_train_data_extended) #this contains all the possible edge options

759512

In [42]:
#check if value in train_data
{'user': 'cqz', 'item': 'jbigham'} in X_train_data_extended

True

In [43]:
y_train_extended_list = []

for comb in X_train_data_extended:
    if comb in train_data:
        y_train_extended_list.append(1)
    else:
        y_train_extended_list.append(0)

In [44]:
len(y_train_extended_list)

759512

In [45]:
print(len(y_train))
print(y_train_extended_list.count(1))

22480
22480


In [46]:
print(len(X_train_data_extended))
print(len(y_train_extended_list))

759512
759512


In [48]:
# convert to an array for pylib.FM
y_train_data_extended = np.array(y_train_extended_list, dtype='double')

#### Prepare the data

In [49]:
v = DictVectorizer()
X_train = v.fit_transform(X_train_data_extended)
X_test = v.transform(test_data)

In [50]:
fm = pylibfm.FM (num_factors=10, 
                 num_iter=10, 
                 verbose=True, 
                 task="regression", 
                 initial_learning_rate=0.001, 
                 learning_rate_schedule="optimal")

fm.fit(X_train, y_train_data_extended)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.01334
-- Epoch 2
Training MSE: 0.01263
-- Epoch 3
Training MSE: 0.01252
-- Epoch 4
Training MSE: 0.01246
-- Epoch 5
Training MSE: 0.01242
-- Epoch 6
Training MSE: 0.01239
-- Epoch 7
Training MSE: 0.01238
-- Epoch 8
Training MSE: 0.01236
-- Epoch 9
Training MSE: 0.01234
-- Epoch 10
Training MSE: 0.01233


In [51]:
preds = fm.predict(X_test)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, preds)

0.7990282837690055

In [52]:
# with open('model_1.pkl', 'wb') as file:
#     pickle.dump(model, file)

### Evaluation

In [53]:
from abc import ABC, abstractmethod
from collections import defaultdict

In [54]:
class UserRecommendations:
    
    def __init__(self):
        self.recs = []
        
    def add_entry(self, entry):
        self.recs.append(entry)
        
    def select_top(self, k):
        self.recs = sorted(self.recs, key=lambda entry: entry[2], reverse=True)
        if len(self.recs) > k:
            self.recs = self.recs[0:k]
            
        
class TestRecommendations:
    
    def __init__(self):
        self.test_recs = defaultdict(UserRecommendations)
        
    def setup(self, preds, k):
        for entry in preds:
            user = entry.uid
            self.test_recs[user].add_entry(entry)
                   
        for user in self.test_recs.keys():
            self.test_recs[user].select_top(k)
            
    def add_entry(self, user, entry):
        self.test_recs[user].add_entry(entry)
        
    def select_top(self, user, k):
        self.test_recs[user].select_top(k)
            
    def iter_recs(self):
        for user in self.test_recs.keys():
            yield (user, self.test_recs[user].recs)

In [647]:
class Evaluator(ABC):
    
    def __init__(self):
        self.results_table = None
        self.score = None
        self.m_score = None
        self.f_score = None
        
    def setup(self, trainset, testset):
        pass
    
    @abstractmethod
    def evaluate_user(self, user, user_recs):
        pass
    
    def evaluate(self, test_recs: TestRecommendations):
        scores = []
        self.results_table = {}
        for user, recs in test_recs.iter_recs():
            score = self.evaluate_user(user, recs)
            scores.append(score)
            self.results_table[user] = score
        self.score = np.mean(scores)
    
    
class ItemwiseEvaluator(Evaluator):
    
    def __init__(self):
        super().__init__()
    
    def evaluate_user(self, user, user_recs):
        # print("user=", user)
        # print("user_recs=", user_recs)
        for rec in user_recs:
            print(rec[1])
        return np.mean([self.evaluate_pred(user, rec) for rec in user_recs])
        
    @abstractmethod
    def evaluate_pred(self, pred):
        pass
    
class ListwiseEvaluator(Evaluator):
    
    def __init__(self):
        super().__init__()
    
    @abstractmethod
    def evaluate_user(self, user, user_recs):
        pass

In [648]:
class NDCGEvaluator(ListwiseEvaluator):
    
    def __init__(self, k):
        super().__init__()
        self.rated_table = defaultdict(set)
        self.idcg_table = {}
        self.log_table = {}
        self.list_len = k
        self.users_g_lookup = dict()
    
    # compute idcg
    def setup(self, trainset, testset):
        for entry in testset:
            self.rated_table[entry['user']].add(entry['item'])
        idcg = 0
        for i in range(0, self.list_len+1):
            self.idcg_table[i] = idcg
            rank_utility = 1 / np.log(i+2)
            self.log_table[i] = rank_utility
            idcg += rank_utility

    
    def evaluate_user(self, user, user_recs):  #called by evalueate
        
        dcg = 0.0
        for i, pred in enumerate(user_recs):
            # print("i =", i)
            # print("pred =", pred)
            # print("user_recs =", user_recs)
            if pred[1] in self.rated_table[user]:
                dcg = self.log_table[i]
        
        idcg = 0
        if len(self.rated_table[user]) >= self.list_len:
               idcg = self.idcg_table[self.list_len]
        else:
               idcg = self.idcg_table[len(self.rated_table[user])]
            
        if idcg == 0:
            return 0
        return dcg/idcg
                

In [662]:
class PrecisionEvaluator(ItemwiseEvaluator):
    
    def __init__(self):
        super().__init__()
        self.rated_table = defaultdict(set)
    
    def setup(self, trainset, testset):
        for user, item in testset:
            self.rated_table[user].add(item)
            
    def evaluate_pred(self, user, pred):
        print("user=", user)
        # print("pred=", pred)
        # if user in self.rated_table[pred[1]]:
        if user in pred[1]:
            return 1
        else:
            return 0

In [663]:
import random
random.seed(20231110)

def create_prediction_profiles(test_data, train_items, predict_list_len, frac=1.0):
    train_items_lst = list(train_items) # Can't sample from a set
    user_test_profile = defaultdict(set) # add the test data
    for entry in test_data:
        user_id = entry['user']
        item_id = entry['item']
        user_test_profile[user_id].add(item_id)
    
    test_users = list(user_test_profile.keys()) # sample from the test users
    
    test_users_select = random.sample(test_users, int(frac*len(test_users))) # create a big set and add the test users
                      
    user_predict_profile = {}
    
    for user in test_users_select:
        profile = user_test_profile[user]
        sample_items = list(random.sample(train_items_lst, predict_list_len + len(profile)))
        sample_items = sample_items + list(profile)
        user_predict_profile[user] = sample_items
        
    return user_predict_profile
    
# creating a list of recommendations for a user 
def create_test_recommendations(predict_fn, vectorizer, test_data, list_len, train_items, predict_list_len, frac=1.0):
    user_predict_profile = create_prediction_profiles(test_data, train_items, predict_list_len, frac)
    
    trecs = TestRecommendations()
    
    # for all the usrs and items in the profile
    for user, profile in user_predict_profile.items():
        for item in profile:
            x_test = vectorizer.transform({'user': user, 'item': item})
            pred = predict_fn.predict(x_test)[0]
            trecs.add_entry(user, (user, item, pred))
        trecs.select_top(user, list_len)
        
    return trecs # return recommendations list for each user

In [664]:
list_len = 10 # number of recommendations to return
predict_list_len = 100
frac = 0.1

In [665]:
test_recs = create_test_recommendations(fm, v, test_data, list_len, train_items, predict_list_len, frac)

# pred_recs = create_prediction_profiles(test_data, train_items, predict_list_len, frac)

In [666]:
# What does the test_recs look like?

list(test_recs.iter_recs())[:3]

[('ham',
  [('ham', 'axz', 0.5328377850462805),
   ('ham', 'cfiesler', 0.4559344510313608),
   ('ham', 'andresmh', 0.45477171706024516),
   ('ham', 'floe', 0.2643780180831227),
   ('ham', 'aquigley', 0.19190182459268332),
   ('ham', 'katta', 0.18042272658697311),
   ('ham', 'carolinerpitt', 0.14905280065762852),
   ('ham', 'pedrolopes', 0.14477851242131484),
   ('ham', 'toby', 0.13732397880234798),
   ('ham', 'Regan', 0.12822840781188657)]),
 ('annetropy',
  [('annetropy', 'cfiesler', 0.4289514518681431),
   ('annetropy', 'sigchi', 0.29696864072210133),
   ('annetropy', 'cgruenloh', 0.054614506847177224),
   ('annetropy', 'richmondywong', 0.04801569668343337),
   ('annetropy', 'dj', 0.042662021338407796),
   ('annetropy', 'emax', 0.04220815744080813),
   ('annetropy', 'codingconduct', 0.03646576013833158),
   ('annetropy', 'mporcheron', 0.034458384376109916),
   ('annetropy', 'cabreraalex', 0.02725651520714322),
   ('annetropy', 'Erinsol', 0.023030356179586348)]),
 ('chitalyconf',
  [(

In [667]:
ndcg = NDCGEvaluator(10)

In [668]:
ndcg.setup(train_data, test_data)

In [669]:
ndcg.evaluate(test_recs)

In [670]:
ndcg.score

0.09770620701468878

In [671]:
precision = PrecisionEvaluator()

In [672]:
precision.setup(train_data, test_data)

In [673]:
precision.evaluate(test_recs)

axz
cfiesler
andresmh
floe
aquigley
katta
carolinerpitt
pedrolopes
toby
Regan
user= ham
user= ham
user= ham
user= ham
user= ham
user= ham
user= ham
user= ham
user= ham
user= ham
cfiesler
sigchi
cgruenloh
richmondywong
dj
emax
codingconduct
mporcheron
cabreraalex
Erinsol
user= annetropy
user= annetropy
user= annetropy
user= annetropy
user= annetropy
user= annetropy
user= annetropy
user= annetropy
user= annetropy
user= annetropy
axz
andresmh
msbernst
drmaxlwilson
jordant
Heycori
schaferj
Niloufar
pg
jofish
user= chitalyconf
user= chitalyconf
user= chitalyconf
user= chitalyconf
user= chitalyconf
user= chitalyconf
user= chitalyconf
user= chitalyconf
user= chitalyconf
user= chitalyconf
jbigham
floe
jordant
aquigley
Niloufar
andrewkuznet
jovermeulen
jonfroehlich
skairam
vitak
user= lea
user= lea
user= lea
user= lea
user= lea
user= lea
user= lea
user= lea
user= lea
user= lea
cfiesler
cfiesler
andresmh
eglassman
karger
asb
clifflampe
carolinerpitt
minlee
juchidiuno
user= schoenebeck
user= scho

In [661]:
precision.score

0.0

In [261]:
# list(pred_recs.iter_recs())[:3]

In [None]:
# def check_predictions(predictions_for_group_members, test):
#     """
#     a function for checking (1) how many it got right -- this is precision -- and (2) the next 3 people you should follow
    
#     Params:
#         predictions_for_group_members (this is a dict):
#             a dictionary with our names as keys, and rec list as the values
#             for me, a rec list is (person, score) tuples
#         test (list):
#             an edge list of our test set
    
#     Returns:
#         output_dict (dict):
#             our names as keys, {"predicted" : [list of preds], "correct" : [list of correctly predicted users], 
#             "you should follow" : [list of ppl]}
#     """
    
#     output_dict = {}
    
#     for person, recs in predictions_for_group_members.items():
#         predicted = []
#         correct_prediction = []
#         people_I_should_follow = []
        
        
#         for rec in recs:
#             # my list of recs is a (person, score) tuples
#             predicted.append(rec[0])
            
#             if (person, rec[0]) in test:
#                 correct_prediction.append(rec[0])
#             else:
#                 people_I_should_follow.append(rec[0])
            
        
#         output_dict[person] = dict(predicted=predicted, correct=correct_prediction, should_follow=people_I_should_follow)
    
#     return output_dict

In [148]:
hci_ndcg_scores = []
# hci_precision_scores = []
k = 10

subgraph_hci = read_g_obj(file="adj_matrices/G_hci.pkl")

for fold in ["", "-1", "-2", "-3", "-4"]:
    
    print("Starting fold{}".format(fold))
    
    with open("train_test/train{}.pkl".format(fold), "rb") as pfile: 
        this_fold_train = pickle.load(pfile)
    
    with open("train_test/test{}.pkl".format(fold), "rb") as pfile: 
        this_fold_test = pickle.load(pfile)
    
    # make the graph
    G_train_this_fold = nx.from_edgelist(this_fold_train, create_using=nx.DiGraph)
    
    print("Made graph")
    
    # make sure we did this right
    assert len(G_train_this_fold.edges) > 20000 and len(G_train_this_fold.edges) < 25000
    
    # make my matrix factorization objects
    # this_matrix_factorization_hci = PageRanker(G_train_this_fold)
    list_len = 10 # number of recommendations to return
    predict_list_len = 100
    frac = 0.1
    this_matrix_factorization_hci = create_test_recommendations(fm, v, test_data, list_len, train_items, predict_list_len, frac)
    
    # actually make the lists
    # hci_this_fold_test_recs = this_matrix_factorization_hci
    
    print("done making matrix factorization recs")
    
    # make the test recs
    # this_fold_tr_hci = TestRecommendations(G_train_this_fold)
    # this_fold_tr_hci.setup(hci_this_fold_test_recs, k=k)
    
    # --- evaluate ----
    
    # NDCG
    this_fold_ndcg_hci = NDCGEvaluator(k=k)
    this_fold_ndcg_hci.setup(train_data, test_data)
    
    this_fold_ndcg_hci.evaluate(this_matrix_factorization_hci)
    
    hci_ndcg_scores.append(this_fold_ndcg_hci.score)
    
#     # Precision
#     this_fold_precision_hci = PrecisionEvaluator()
#     this_fold_precision_hci.setup(trainset=None, testset=this_fold_test)
    
#     this_fold_precision_hci.evaluate(this_fold_tr_hci)
    
#     hci_precision_scores.append(this_fold_precision_hci.score)
    
    
    # print("NDCG: {} \t Precision: {}".format(this_fold_ndcg_hci.score, this_fold_precision_hci.score))
    print("NDCG: {}".format(this_fold_ndcg_hci.score))

Starting fold
Made graph
done making matrix factorization recs
NDCG: 0.09399819558092727
Starting fold-1
Made graph
done making matrix factorization recs
NDCG: 0.08834337215922199
Starting fold-2
Made graph
done making matrix factorization recs
NDCG: 0.10144224250927975
Starting fold-3
Made graph
done making matrix factorization recs
NDCG: 0.09279613181856701
Starting fold-4
Made graph
done making matrix factorization recs
NDCG: 0.09939305336761702
