## Reading files

In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

import pickle
import random

In [107]:
def read_g_obj(file="adj_matrices/G_hci.pkl"):
    with open(file, "rb") as pfile: 
        G = pickle.load(pfile)
    
    follows_at_least_10 = [person for person, out_degree in G.out_degree() if out_degree >= 10] 
    
    subgraph_hci = nx.subgraph(G, follows_at_least_10)
    
    return subgraph_hci

In [108]:
subgraph_hci = read_g_obj()

In [109]:
with open("train_test/test.pkl", "rb") as pfile:
    test = pickle.load(pfile)

In [110]:
with open("train_test/train.pkl", "rb") as pfile:
    train = pickle.load(pfile)

In [111]:
with open("train_test/anti_test.pkl", "rb") as pfile:
    anti_test = pickle.load(pfile)

In [112]:
# adding 1 to all the user-user edges
def create_train_test_df(train, test, anti_test):  
    train_data_extended = [(follower, following, 1) for follower, following in train]
    test_data_extended = [(follower, following, 1) for follower, following in test]
    anti_test_data_extended = [(follower, following, 0) for follower, following in anti_test]
    
    train_df = pd.DataFrame(
        train_data_extended, columns=["user", "item", "rating"]
    )
    # train_df.to_csv('train_df.csv')
    
    test_data_extended.extend(anti_test_data_extended)
    test_df = pd.DataFrame(test_data_extended, columns=["user", "item", "rating"])
    # test_df.to_csv('test_df.csv')
    
    return train_df, test_df

In [100]:
train_df, test_df = create_train_test_df(train, test, anti_test)

### is there a way to include features???

In [114]:
# users_data = pd.read_csv("data_csv/profiles/extended_df_complete.csv")

In [127]:
test_df[test_df['user'] == 'Anas']

Unnamed: 0,user,item,rating
7337,Anas,axz,1
7338,Anas,evanmpeck,1
7339,Anas,johannagunawan,1
7340,Anas,cfiesler,1
7341,Anas,karger,1
7342,Anas,infinimatt,1
7343,Anas,jbigham,1
16409,Anas,carla,0
16410,Anas,annetropy,0
16411,Anas,ThomasMildner,0


In [66]:
train_df.head()

Unnamed: 0,user,item,rating
0,cqz,jbigham,1
1,cqz,ryanatkn,1
2,cqz,axz,1
3,cqz,msbernst,1
4,cqz,qli,1


In [67]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22480 entries, 0 to 22479
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   user    22480 non-null  object
 1   item    22480 non-null  object
 2   rating  22480 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 527.0+ KB


In [68]:
test_df.head()

Unnamed: 0,user,item,rating
0,cqz,kentrellowens,1
1,cqz,ruotongw,1
2,cqz,schaferj,1
3,Gillian,kgajos,1
4,Gillian,andreaforte,1


In [69]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18144 entries, 0 to 18143
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   user    18144 non-null  object
 1   item    18144 non-null  object
 2   rating  18144 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 425.4+ KB


## Model Implementation

In [70]:
import sys
sys.path.append("/LightGCN")

In [71]:
from LightGCN_modules import LightGCN
from LightGCN_modules import ImplicitCF
from LightGCN_modules import deeprec_utils
from LightGCN_modules import eval_utils

In [72]:
# Model implementation

In [73]:
data = ImplicitCF.ImplicitCF(
    train=train_df,
    test=test_df,
    seed=2023,
    col_user="user",
    col_item="item",
    col_rating="rating",
)

In [74]:
### Setup the model
hparam32 = deeprec_utils.prepare_hparams(
    "LightGCN_modules/lightgcn.yaml",
    n_layers=3,
    batch_size=512,
    epochs=50,
    embed_size=32,
    learning_rate=0.01,
    eval_epoch=5,
    top_k=10,
)

In [75]:
model32 = LightGCN.LightGCN(hparam32, data, seed=2023)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [76]:
model32.fit()

Epoch 1 (train)0.6s: train loss = 0.50526 = (mf)0.50509 + (embed)0.00017
Epoch 2 (train)0.5s: train loss = 0.33958 = (mf)0.33909 + (embed)0.00049
Epoch 3 (train)0.5s: train loss = 0.33267 = (mf)0.33215 + (embed)0.00052
Epoch 4 (train)0.5s: train loss = 0.30751 = (mf)0.30691 + (embed)0.00060
Epoch 5 (train)0.5s + (eval)0.2s: train loss = 0.28195 = (mf)0.28122 + (embed)0.00073, recall = 0.29465, ndcg = 0.36278, precision = 0.21411, map = 0.19721
Epoch 6 (train)0.5s: train loss = 0.25836 = (mf)0.25746 + (embed)0.00090
Epoch 7 (train)0.5s: train loss = 0.24671 = (mf)0.24567 + (embed)0.00104
Epoch 8 (train)0.5s: train loss = 0.23615 = (mf)0.23497 + (embed)0.00118
Epoch 9 (train)1.0s: train loss = 0.23025 = (mf)0.22893 + (embed)0.00131
Epoch 10 (train)0.5s + (eval)0.1s: train loss = 0.21641 = (mf)0.21495 + (embed)0.00146, recall = 0.30826, ndcg = 0.37844, precision = 0.22844, map = 0.20586
Epoch 11 (train)0.5s: train loss = 0.20403 = (mf)0.20240 + (embed)0.00163
Epoch 12 (train)0.5s: train l

In [77]:
topk_scores32 = model32.recommend_k_items(test_df, top_k=10, remove_seen=True)
topk_scores32.head()

Unnamed: 0,user,item,prediction
0,cqz,imjane,7.804013
1,cqz,ruotongw,7.30416
2,cqz,tisjune,6.941763
3,cqz,joonspk,6.544664
4,cqz,andresmh,6.186705


### Evaluation

In [78]:
eval_ndcg32 = eval_utils.ndcg_at_k(test_df, topk_scores32, k=10)
eval_ndcg32

0.3106120875424586

In [79]:
eval_map = eval_utils.map_at_k(test_df, topk_scores32, k=10)
eval_ndcg = eval_utils.ndcg_at_k(test_df, topk_scores32, k=10)
eval_precision = eval_utils.precision_at_k(test_df, topk_scores32, k=10)
eval_recall = eval_utils.recall_at_k(test_df, topk_scores32, k=10)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.102879
NDCG:	0.310612
Precision@K:	0.249541
Recall@K:	0.169446


In [80]:
# wants me to follow myself lol
topk_scores32[topk_scores32['user'] == 'Anas']

Unnamed: 0,user,item,prediction
5880,Anas,Anas,7.0665
5881,Anas,evijitghosh,6.258918
5882,Anas,davidthewid,5.826617
5883,Anas,sukrit,5.373965
5884,Anas,sara,5.261525
5885,Anas,andrewkuznet,5.050897
5886,Anas,samanthadalal,4.810711
5887,Anas,skairam,4.790081
5888,Anas,jr,4.752659
5889,Anas,jordant,4.650952


In [81]:
def check_predictions(predictions_for_group_members, test):
    """
    a function for checking 
        1- how many it got right -- this is precision -- and 
        2- the next 3 people you should follow
    
    Params:
        predictions_for_group_members (this is a dict):
            a dictionary with our names as keys, and rec list as the values
            for me, a rec list is (person, score) tuples
        test (list):
            an edge list of our test set
    
    Returns:
        output_dict (dict):
            our names as keys, {"predicted" : [list of preds], "correct" : [list of correctly predicted users], 
            "you should follow" : [list of ppl]}
    """
    
    output_dict = {}
    
    for person, recs in predictions_for_group_members.items():
        predicted = []
        correct_prediction = []
        people_I_should_follow = []
        
        
        for rec in recs:
            # my list of recs is a (person, score) tuples
            predicted.append(rec[0])
            
            if (person, rec[0]) in test:
                correct_prediction.append(rec[0])
            else:
                people_I_should_follow.append(rec[0])
            
        
        output_dict[person] = dict(predicted=predicted, correct=correct_prediction, should_follow=people_I_should_follow)
    
    return output_dict

In [82]:
# a dictionary with our names as keys, and rec list as the values 
# for me, a rec list is (person, score) tuples

In [83]:
our_recs = dict()
user_accounts = ['Anas', 'MattNicholson', 'joshua_paup']

for user in user_accounts:
    accts = topk_scores32[topk_scores32['user'] == user]['item'].values
    preds = topk_scores32[topk_scores32['user'] == user]['prediction'].values
    our_recs[user] = list(zip(accts, preds))

In [84]:
our_recs

{'Anas': [('Anas', 7.0665),
  ('evijitghosh', 6.258918),
  ('davidthewid', 5.826617),
  ('sukrit', 5.373965),
  ('sara', 5.261525),
  ('andrewkuznet', 5.0508966),
  ('samanthadalal', 4.810711),
  ('skairam', 4.790081),
  ('jr', 4.7526593),
  ('jordant', 4.650952)],
 'MattNicholson': [('msbernst', 8.212967),
  ('Heycori', 7.760834),
  ('MattNicholson', 7.4578395),
  ('skairam', 7.087228),
  ('jbigham', 6.846466),
  ('andresmh', 6.826126),
  ('bkeegan', 6.6153903),
  ('aaroniidx', 6.5041976),
  ('carl', 6.3311195),
  ('emma_lurie', 6.040155)],
 'joshua_paup': [('axz', 9.071303),
  ('jbigham', 7.8865385),
  ('msbernst', 7.7097316),
  ('MattNicholson', 6.5180087),
  ('sigchi', 6.3359466),
  ('Heycori', 6.3343062),
  ('blakeley', 6.3238444),
  ('sara', 6.288517),
  ('princess', 6.0874386),
  ('jordant', 5.904695)]}

In [87]:
test[:5]

[('cqz', 'kentrellowens'),
 ('cqz', 'ruotongw'),
 ('cqz', 'schaferj'),
 ('Gillian', 'kgajos'),
 ('Gillian', 'andreaforte')]

In [89]:
output = check_predictions(our_recs, test)

In [98]:
for user in output.keys():
    print("\nUsername:", user, "\n")
    print("Predicted list:\n",output[user]['predicted'])
    print("Correct predictions:\n",output[user]['correct'])
    print("People to follow:\n",output[user]['should_follow'])
    print("=====")


Username: Anas 

Predicted list:
 ['Anas', 'evijitghosh', 'davidthewid', 'sukrit', 'sara', 'andrewkuznet', 'samanthadalal', 'skairam', 'jr', 'jordant']
Correct predictions:
 []
People to follow:
 ['Anas', 'evijitghosh', 'davidthewid', 'sukrit', 'sara', 'andrewkuznet', 'samanthadalal', 'skairam', 'jr', 'jordant']
=====

Username: MattNicholson 

Predicted list:
 ['msbernst', 'Heycori', 'MattNicholson', 'skairam', 'jbigham', 'andresmh', 'bkeegan', 'aaroniidx', 'carl', 'emma_lurie']
Correct predictions:
 ['msbernst']
People to follow:
 ['Heycori', 'MattNicholson', 'skairam', 'jbigham', 'andresmh', 'bkeegan', 'aaroniidx', 'carl', 'emma_lurie']
=====

Username: joshua_paup 

Predicted list:
 ['axz', 'jbigham', 'msbernst', 'MattNicholson', 'sigchi', 'Heycori', 'blakeley', 'sara', 'princess', 'jordant']
Correct predictions:
 ['axz', 'MattNicholson']
People to follow:
 ['jbigham', 'msbernst', 'sigchi', 'Heycori', 'blakeley', 'sara', 'princess', 'jordant']
=====


### Cross validation for ndcg and pprecision

In [121]:
hci_ndcg_scores = []
hci_precision_scores = []
top_k = 5

hparam32 = deeprec_utils.prepare_hparams(
    "LightGCN_modules/lightgcn.yaml",
    n_layers=3,
    batch_size=512,
    epochs=10,
    embed_size=32,
    learning_rate=0.01,
    eval_epoch=5,
    top_k=top_k,
)

subgraph_hci = read_g_obj(file="adj_matrices/G_hci.pkl")

for fold in ["-0", "-1", "-2", "-3", "-4"]:
    
    print("Starting fold{}".format(fold))
    
    # reading test_train folds
    with open("train_test/new_folds/train{}.pkl".format(fold), "rb") as pfile: 
        this_fold_train = pickle.load(pfile)
    
    with open("train_test/new_folds/test{}.pkl".format(fold), "rb") as pfile: 
        this_fold_test = pickle.load(pfile)
    
    with open("train_test/new_folds/anti_test{}.pkl".format(fold), "rb") as pfile: 
        this_fold_anti_test = pickle.load(pfile)
        
    # prepare the data
    this_train_df, this_test_df = create_train_test_df(this_fold_train, this_fold_test, this_fold_anti_test)
    
    print("done preparing the data")
    
    # train the model
    data = ImplicitCF.ImplicitCF(
        train=this_train_df,
        test=this_test_df,
        seed=2023,
        col_user="user",
        col_item="item",
        col_rating="rating",
    )
    
    
    model32 = LightGCN.LightGCN(hparam32, data, seed=2023)
    model32.fit()
    
    print("done training the model")
    
    # make the test recs
    topk_scores32 = model32.recommend_k_items(this_test_df, top_k=top_k, remove_seen=True)
    
    # --- evaluate ----
    
    # NDCG
    eval_ndcg32 = eval_utils.ndcg_at_k(this_test_df, topk_scores32, k=top_k)
    hci_ndcg_scores.append(eval_ndcg32)
    
    # Precision
    eval_precision = eval_utils.precision_at_k(this_test_df, topk_scores32, k=10)
    hci_precision_scores.append(eval_precision)
    
    print("NDCG: {} \t Precision: {}".format(eval_ndcg32, eval_precision)) 

    
    

Starting fold-0
done preparing the data
Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)0.5s: train loss = 0.59471 = (mf)0.59461 + (embed)0.00010
Epoch 2 (train)0.3s: train loss = 0.41842 = (mf)0.41798 + (embed)0.00045
Epoch 3 (train)0.3s: train loss = 0.40863 = (mf)0.40817 + (embed)0.00046
Epoch 4 (train)0.3s: train loss = 0.39791 = (mf)0.39739 + (embed)0.00051
Epoch 5 (train)0.3s + (eval)0.2s: train loss = 0.36872 = (mf)0.36812 + (embed)0.00060, recall = 0.19415, ndcg = 0.46198, precision = 0.40748, map = 0.16136
Epoch 6 (train)0.3s: train loss = 0.34992 = (mf)0.34918 + (embed)0.00074
Epoch 7 (train)0.3s: train loss = 0.32424 = (mf)0.32334 + (embed)0.00090
Epoch 8 (train)0.3s: train loss = 0.30576 = (mf)0.30468 + (embed)0.00108
Epoch 9 (train)0.3s: train loss = 0.28940 = (mf)0.28814 + (embed)0.00126
Epoch 10 (train)0.3s + (eval)0.1s: train loss = 0.27894 = (mf)0.27752 + (embed)0.00143, recall = 0.19835, ndcg = 0.46818, 

In [122]:
hci_ndcg_scores

[0.4791227357526531,
 0.4659676800231786,
 0.4612619375453949,
 0.4745345567961225,
 0.4839142710186345]

In [123]:
hci_precision_scores

[0.21649659863945578,
 0.21037414965986398,
 0.2078231292517007,
 0.2151360544217687,
 0.2193877551020408]