# Running factorization machine

### We're likely running a factorization machine (FM) instead of a matrix factorization

In [229]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

import pickle
import random

import sys
sys.path.append('/FactorizationMachine_modules')

from FactorizationMachine_modules import eval_utils_fact

### These are older files; we will revisit them

In [230]:
# Load the object stored in train_test/test.pkl and bind it to `test`.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("train_test/test.pkl", "rb") as pfile:
    test = pickle.load(pfile)

In [231]:
# Load the object stored in train_test/train.pkl and bind it to `train`.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("train_test/train.pkl", "rb") as pfile:
    train = pickle.load(pfile)

In [232]:
# Load the object stored in train_test/anti_test.pkl and bind it to `anti_test`.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("train_test/anti_test.pkl", "rb") as pfile:
    anti_test = pickle.load(pfile)

### Factorization machine

In [233]:
import sys
!{sys.executable} -m pip install git+https://github.com/coreylynch/pyFM

Collecting git+https://github.com/coreylynch/pyFM
  Cloning https://github.com/coreylynch/pyFM to /tmp/pip-req-build-sa4cxitt
  Running command git clone --filter=blob:none --quiet https://github.com/coreylynch/pyFM /tmp/pip-req-build-sa4cxitt
  Resolved https://github.com/coreylynch/pyFM to commit 0696c980993889a9429e4ab0b6c7dc8be6dac4de
  Preparing metadata (setup.py) ... [?25ldone
[?25h

In [234]:
# DictVectorizer and pylibfm are used by the vectorize / train cells further
# down, so these imports must be live for the notebook to run on a fresh kernel.
from sklearn.feature_extraction import DictVectorizer
from pyfm import pylibfm

#### Loading the data

In [238]:
def loadData(filename, path="", sample=1.0):
    """Read an ``index,user,item,rating`` CSV into FM-style records.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line is split on commas into exactly four fields.

    Args:
        filename: Name of the CSV file to read.
        path: Prefix concatenated directly in front of ``filename``
            (no separator is inserted).
        sample: A row is kept when ``random.random() <= sample``; the
            default of 1.0 keeps every row.

    Returns:
        A tuple ``(data, y, users, items)`` where ``data`` is a list of
        ``{"user": ..., "item": ...}`` dicts, ``y`` is a NumPy array holding
        the ratings as floats, and ``users`` / ``items`` are the sets of user
        and item strings among the kept rows.

    Raises:
        ValueError: If a non-blank line does not have exactly four
            comma-separated fields, or its rating is not a valid float.
    """
    data = []
    y = []
    users = set()
    items = set()
    with open(path + filename) as f:
        # The default keeps an empty file from raising StopIteration.
        next(f, None)
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. trailing newlines) carry no record.
                continue
            (index, user, item, rating) = line.split(',')
            if random.random() <= sample:
                data.append({"user": user, "item": item})
                y.append(float(rating))
                users.add(user)
                items.add(item)

    return (data, np.array(y), users, items)

In [239]:
# Parse the training CSV into feature dicts, labels, and the sets of users / items seen.
(train_data, y_train, train_users, train_items) = loadData("train_test/train_df.csv")

In [241]:
# Parse the test CSV into feature dicts, labels, and the sets of users / items seen.
(test_data, y_test, test_users, test_items) = loadData("train_test/test_df.csv") #contains both test and anti-test

In [242]:
train_data[:5]

[{'user': 'cqz', 'item': 'jbigham'},
 {'user': 'cqz', 'item': 'ryanatkn'},
 {'user': 'cqz', 'item': 'axz'},
 {'user': 'cqz', 'item': 'msbernst'},
 {'user': 'cqz', 'item': 'qli'}]

In [243]:
test_data[:5]

[{'user': 'cqz', 'item': 'kentrellowens'},
 {'user': 'cqz', 'item': 'ruotongw'},
 {'user': 'cqz', 'item': 'schaferj'},
 {'user': 'Gillian', 'item': 'kgajos'},
 {'user': 'Gillian', 'item': 'andreaforte'}]

### This is needed to put the data in FM format

This constructs the full set of candidate edges: one entry for every ordered pair of distinct users in the training data.
The result is a list of dictionaries, each holding a user (a person) and an item (a person they could follow).

In [244]:
from itertools import permutations

# Every ordered (user, item) pair of two distinct training users is a candidate
# edge. Self-edges (e.g. MattNicholson --> MattNicholson) cannot occur here:
# train_users is a set, and permutations never pairs an element with itself.
# Sorting first fixes the pair order, which would otherwise follow the set's
# arbitrary iteration order and can differ from one kernel session to the next.
X_train_data_extended = [
    {'user': follower, 'item': followee}
    for follower, followee in permutations(sorted(train_users), 2)
]


In [245]:
len(X_train_data_extended) #this contains all the possible edge options

759512

In [246]:
#check if value in train_data
{'user': 'cqz', 'item': 'jbigham'} in X_train_data_extended

True

### Produce our training examples 
The labels are all 1s or 0s, one for each permutation in `X_train_data_extended`: 1 if the pair appears in the training data, 0 otherwise.

In [16]:
# Label every candidate pair: 1 if it is an observed training edge, else 0.
# The observed edges are collected into a set of (user, item) tuples so each
# lookup is a hash probe instead of a linear scan over the list of dicts.
observed_edges = {(row["user"], row["item"]) for row in train_data}

y_train_extended_list = [
    1 if (pair["user"], pair["item"]) in observed_edges else 0
    for pair in X_train_data_extended
]

In [187]:
len(y_train_extended_list)

759512

In [188]:
# Tally how many candidate pairs carry each label value and print the
# (label, count) rows.

labels_and_counts = np.unique(y_train_extended_list, return_counts=True)
unique, counts = labels_and_counts
print(np.asarray(labels_and_counts).transpose())

[[     0 737032]
 [     1  22480]]


In [189]:
# Compare the number of rows read from the training CSV with the number of
# positive labels; matching values suggest every training row was found among
# the candidate pairs.
print(len(y_train))
print(y_train_extended_list.count(1))

22480
22480


In [190]:
# Features and labels are passed to the model together, so these two lengths
# should be equal.
print(len(X_train_data_extended))
print(len(y_train_extended_list))

759512
759512


In [191]:
# pylibfm is handed a float64 label vector rather than the Python list of ints
y_train_data_extended = np.array(y_train_extended_list, dtype=np.float64)

In [192]:
# Write the label list and the candidate-pair list to disk as pickles.
for out_path, payload in (
    ('train_test/y_train_extended_list.pkl', y_train_extended_list),
    ('train_test/X_train_data_extended.pkl', X_train_data_extended),
):
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)

#### Prepare the data

In [193]:
# Turn the {'user', 'item'} dicts into the feature matrices the model consumes.
v = DictVectorizer()
# Fit the feature vocabulary on the training candidates...
X_train = v.fit_transform(X_train_data_extended)
# ...and reuse that same fitted vectorizer for the test pairs.
X_test = v.transform(test_data)

In [194]:
# Configure the factorization machine: 10 factors, 5 iterations, regression task.
# Note that the labels are 0/1 but the model is trained as a regression.
fm = pylibfm.FM (num_factors=10, 
                 num_iter=5, 
                 verbose=True, 
                 task="regression", 
                 initial_learning_rate=0.001, 
                 learning_rate_schedule="optimal")

# Train on the full candidate-pair matrix and its 0/1 labels.
fm.fit(X_train, y_train_data_extended)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.01335
-- Epoch 2
Training MSE: 0.01264
-- Epoch 3
Training MSE: 0.01252
-- Epoch 4
Training MSE: 0.01247
-- Epoch 5
Training MSE: 0.01243


In [195]:
# Score the test pairs and report the mean squared error against their labels.
preds = fm.predict(X_test)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, preds)

0.37498284809685056

### Evaluation

In [196]:
import sys
# Install the `surprise` package into the interpreter that runs this kernel
# (no version is pinned here).
!{sys.executable} -m pip install surprise



In [197]:
from abc import ABC, abstractmethod
from collections import defaultdict
from surprise import SVD, Reader, Dataset, Prediction

In [203]:
# Settings handed to eval_utils_fact.create_test_recommendations.
list_len = 5 # number of recommendations to return
predict_list_len = 100  # presumably the size of the candidate pool that is scored — TODO confirm
frac = .1  # presumably a sampling fraction — TODO confirm against eval_utils_fact

In [204]:
# Build the recommendation lists that the evaluators score, using the fitted
# model's predict function and the fitted vectorizer.
test_recs = eval_utils_fact.create_test_recommendations(fm.predict, v, test_data, list_len, train_items, predict_list_len, frac)

In [205]:
# What does the test_recs look like?
list(test_recs.iter_recs())[:3]

[('shamsi',
  [('shamsi', 'cfiesler', 0.4727124816512766),
   ('shamsi', 'sigchi', 0.4303673166166387),
   ('shamsi', 'bkeegan', 0.39614376614411334),
   ('shamsi', 'Heycori', 0.2297964418145225),
   ('shamsi', 'asb', 0.22318965331158053)]),
 ('karthik',
  [('karthik', 'sigchi', 0.3375504959488091),
   ('karthik', 'drmaxlwilson', 0.2211408876784548),
   ('karthik', 'barik', 0.1881697207880393),
   ('karthik', 'Heycori', 0.13247840525846277),
   ('karthik', 'asb', 0.11952407530533524)]),
 ('panciera',
  [('panciera', 'jbigham', 0.5338622942015655),
   ('panciera', 'cfiesler', 0.4419223970472083),
   ('panciera', 'andresmh', 0.4314979784328827),
   ('panciera', 'msbernst', 0.3453838369952485),
   ('panciera', 'sigchi', 0.34097296224737034)])]

In [206]:
# NOTE(review): NDCGEvaluator is not defined or imported in the cells shown
# here — confirm where it comes from so the notebook runs on a fresh kernel.
# The argument is presumably the cutoff k (the cross-validation cell passes
# k=k); note that only list_len = 5 recommendations are generated per user.
ndcg = NDCGEvaluator(10)

In [207]:
# Hand the evaluator the training and test records.
ndcg.setup(train_data, test_data)

In [208]:
# Score the recommendation lists; the result is read from ndcg.score afterwards.
ndcg.evaluate(test_recs)

In [209]:
ndcg.score

0.11497133369753697

In [210]:
# NOTE(review): PrecisionEvaluator is not defined or imported in the cells
# shown here — confirm where it comes from so the notebook runs on a fresh kernel.
precision = PrecisionEvaluator()

In [211]:
# Hand the evaluator the training and test records.
precision.setup(train_data, test_data)

In [212]:
list(test_recs.iter_recs())[:3]

[('shamsi',
  [('shamsi', 'cfiesler', 0.4727124816512766),
   ('shamsi', 'sigchi', 0.4303673166166387),
   ('shamsi', 'bkeegan', 0.39614376614411334),
   ('shamsi', 'Heycori', 0.2297964418145225),
   ('shamsi', 'asb', 0.22318965331158053)]),
 ('karthik',
  [('karthik', 'sigchi', 0.3375504959488091),
   ('karthik', 'drmaxlwilson', 0.2211408876784548),
   ('karthik', 'barik', 0.1881697207880393),
   ('karthik', 'Heycori', 0.13247840525846277),
   ('karthik', 'asb', 0.11952407530533524)]),
 ('panciera',
  [('panciera', 'jbigham', 0.5338622942015655),
   ('panciera', 'cfiesler', 0.4419223970472083),
   ('panciera', 'andresmh', 0.4314979784328827),
   ('panciera', 'msbernst', 0.3453838369952485),
   ('panciera', 'sigchi', 0.34097296224737034)])]

In [213]:
# Score the recommendation lists; the result is read from precision.score afterwards.
precision.evaluate(test_recs)

In [214]:
precision.score

0.48505747126436793

### Cross-validation

In [247]:
# Cross-validation over the pre-built folds stored in train_test/new_folds.
# For each fold: load the splits, rebuild the candidate-pair training set,
# fit a factorization machine, generate recommendations, and record the
# NDCG and precision scores.
hci_ndcg_scores = []
hci_precision_scores = []
k = 5  # cutoff passed to NDCGEvaluator

list_len = 5 # number of recommendations to return
predict_list_len = 5
frac = 1

n_folds = 5  # fold files carry the suffixes "-0" .. "-4"

for fold in ["-{}".format(fold_idx) for fold_idx in range(n_folds)]:

    print("Starting fold{}".format(fold))

    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open("train_test/new_folds/train{}.pkl".format(fold), "rb") as pfile:
        this_fold_train = pickle.load(pfile)

    with open("train_test/new_folds/test{}.pkl".format(fold), "rb") as pfile:
        this_fold_test = pickle.load(pfile)

    with open("train_test/new_folds/anti_test{}.pkl".format(fold), "rb") as pfile:
        this_fold_anti_test = pickle.load(pfile)

    print("done reading the data")

    # Prepare the data. The intermediate results get their own names so that
    # train_data / test_data always refer to the record lists built below.
    fold_train_df, fold_test_df = eval_utils_fact.create_train_test_df(
        this_fold_train, this_fold_test, this_fold_anti_test
    )

    # Load data
    (train_data, y_train, train_users, train_items) = eval_utils_fact.loadData(fold_train_df)
    (test_data, y_test, test_users, test_items) = eval_utils_fact.loadData(fold_test_df)

    # Extend the data to the full candidate set.
    # NOTE(review): extend_data also receives test_data — confirm the test
    # pairs are not used when building the training labels.
    X_train_data_extended, y_train_data_extended = eval_utils_fact.extend_data(
        train_data, test_data, train_users
    )

    print("done extending the data")

    # Vectorize: fit the vocabulary on this fold's training candidates only.
    v = DictVectorizer()
    X_train = v.fit_transform(X_train_data_extended)
    X_test = v.transform(test_data)

    # A fresh model per fold: no learned state can carry over between folds,
    # whatever fit() does or does not reset internally.
    fm = pylibfm.FM(num_factors=10,
                    num_iter=5,
                    verbose=True,
                    task="regression",
                    initial_learning_rate=0.001,
                    learning_rate_schedule="optimal")

    # Train the model
    fm.fit(X_train, y_train_data_extended)

    # Kept for inspection after the loop; the evaluators below only consume
    # test_recs.
    preds = fm.predict(X_test)
    test_recs = eval_utils_fact.create_test_recommendations(
        fm.predict, v, test_data, list_len, train_items, predict_list_len, frac
    )

    print("done making matrix factorization recs")

    # --- evaluate ----

    # NDCG
    this_fold_ndcg_hci = NDCGEvaluator(k=k)
    this_fold_ndcg_hci.setup(trainset=train_data, testset=test_data)
    this_fold_ndcg_hci.evaluate(test_recs)
    hci_ndcg_scores.append(this_fold_ndcg_hci.score)

    # Precision
    this_fold_precision_hci = PrecisionEvaluator()
    this_fold_precision_hci.setup(trainset=train_data, testset=test_data)
    this_fold_precision_hci.evaluate(test_recs)
    hci_precision_scores.append(this_fold_precision_hci.score)

    print("NDCG: {} \t Precision: {}".format(this_fold_ndcg_hci.score, this_fold_precision_hci.score))

Starting fold-0
done reading the data
done extending the data
Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.02204
-- Epoch 2
Training MSE: 0.02080
-- Epoch 3
Training MSE: 0.02058
-- Epoch 4
Training MSE: 0.02048
-- Epoch 5
Training MSE: 0.02043
done making matrix factorization recs
NDCG: 0.13955004586180644 	 Precision: 0.7534013605442177
Starting fold-1
done reading the data
done extending the data
Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.02204
-- Epoch 2
Training MSE: 0.02080
-- Epoch 3
Training MSE: 0.02057
-- Epoch 4
Training MSE: 0.02048
-- Epoch 5
Training MSE: 0.02043
done making matrix factorization recs
NDCG: 0.14015018989515615 	 Precision: 0.7387755102040816
Starting fold-2
done reading the data
done extending the data
Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.02200
-- Epoch 2
Training MSE: 0.0207

In [248]:
hci_ndcg_scores

[0.13955004586180644,
 0.14015018989515615,
 0.1399990318175022,
 0.14004698291633721,
 0.14271369316591886]

In [249]:
hci_precision_scores

[0.7534013605442177,
 0.7387755102040816,
 0.7418367346938775,
 0.7608843537414965,
 0.7401360544217687]