In [1]:
import pandas as pd
import numpy as np
import argparse
import random
from model import KGCN
from data_loader import DataLoader
import torch
import torch.optim as optim
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# prepare arguments (hyperparameters)
parser = argparse.ArgumentParser()

parser.add_argument('--dataset', type=str, default='movie', help='which dataset to use')
parser.add_argument('--aggregator', type=str, default='sum', help='which aggregator to use')
parser.add_argument('--n_epochs', type=int, default=20, help='the number of epochs')
parser.add_argument('--neighbor_sample_size', type=int, default=8, help='the number of neighbors to be sampled')
parser.add_argument('--dim', type=int, default=16, help='dimension of user and entity embeddings')
parser.add_argument('--n_iter', type=int, default=1, help='number of iterations when computing entity representation')
parser.add_argument('--batch_size', type=int, default=32, help='batch size')
parser.add_argument('--l2_weight', type=float, default=1e-4, help='weight of l2 regularization')
parser.add_argument('--lr', type=float, default=5e-4, help='learning rate')
parser.add_argument('--ratio', type=float, default=0.8, help='size of training dataset')

args = parser.parse_args(['--l2_weight', '1e-4'])

In [4]:
# build dataset and knowledge graph
data_loader = DataLoader(args.dataset)
kg = data_loader.load_kg()
df_dataset = data_loader.load_dataset()
df_dataset

Construct knowledge graph ... Done
Build dataset dataframe ... Done


Unnamed: 0,userID,itemID,label
0,1015,2,1
1,2194,4,1
2,1248,68276,0
3,3486,2,1
4,1864,2,1
...,...,...,...
88547,43,4,1
88548,1504,2,1
88549,2168,3,1
88550,4573,50792,0


In [5]:
# Dataset class
class KGCNDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        user_id = np.array(self.df.iloc[idx]['userID'])
        item_id = np.array(self.df.iloc[idx]['itemID'])
        label = np.array(self.df.iloc[idx]['label'], dtype=np.float32)
        return user_id, item_id, label

In [6]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(df_dataset, df_dataset['label'], test_size=1 - args.ratio, shuffle=True, random_state=999)
train_dataset = KGCNDataset(x_train)
test_dataset = KGCNDataset(x_test)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size)

In [7]:
# prepare network, loss function, optimizer
num_user, num_entity, num_relation = data_loader.get_num()
user_encoder, entity_encoder, relation_encoder = data_loader.get_encoders()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = KGCN(num_user, num_entity, num_relation, kg, args, device).to(device)
criterion = torch.nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.l2_weight)
print('device: ', device)

device:  cpu


In [11]:
# train
loss_list = []
test_loss_list = []
auc_score_list = []

for epoch in range(args.n_epochs):
    running_loss = 0.0
    for i, (user_ids, item_ids, labels) in enumerate(train_loader):
        user_ids, item_ids, labels = user_ids.to(device), item_ids.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(user_ids, item_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        
        optimizer.step()

        running_loss += loss.item()
    
    # print train loss per every epoch
    print('[Epoch {}]train_loss: '.format(epoch+1), running_loss / len(train_loader))
    loss_list.append(running_loss / len(train_loader))
        
    # evaluate per every epoch
    with torch.no_grad():
        test_loss = 0
        total_roc = 0
        for user_ids, item_ids, labels in test_loader:
            user_ids, item_ids, labels = user_ids.to(device), item_ids.to(device), labels.to(device)
            outputs = net(user_ids, item_ids)
            test_loss += criterion(outputs, labels).item()
            total_roc += roc_auc_score(labels.cpu().detach().numpy(), outputs.cpu().detach().numpy())
        print('[Epoch {}]test_loss: '.format(epoch+1), test_loss / len(test_loader))
        test_loss_list.append(test_loss / len(test_loader))
        auc_score_list.append(total_roc / len(test_loader))

[Epoch 1]train_loss:  0.5125686307694315
[Epoch 1]test_loss:  0.3432384261908514
[Epoch 2]train_loss:  0.30384114414938096
[Epoch 2]test_loss:  0.2820826152655622
[Epoch 3]train_loss:  0.26631807393874796
[Epoch 3]test_loss:  0.2711692385456192
[Epoch 4]train_loss:  0.25044519835855905
[Epoch 4]test_loss:  0.26230619939225674
[Epoch 5]train_loss:  0.23267110805982066
[Epoch 5]test_loss:  0.24483411821002135
[Epoch 6]train_loss:  0.20891536752732898
[Epoch 6]test_loss:  0.22239821896441148
[Epoch 7]train_loss:  0.18500171523699377
[Epoch 7]test_loss:  0.20101895309742607
[Epoch 8]train_loss:  0.1653116535035306
[Epoch 8]test_loss:  0.18303895584351318
[Epoch 9]train_loss:  0.15000045078772087
[Epoch 9]test_loss:  0.16799944588100868
[Epoch 10]train_loss:  0.1386503142817072
[Epoch 10]test_loss:  0.15696805271753766
[Epoch 11]train_loss:  0.13034712173576385
[Epoch 11]test_loss:  0.1483807197842572
[Epoch 12]train_loss:  0.12386178748232886
[Epoch 12]test_loss:  0.14200975300764349
[Epoc

In [12]:
print(auc_score_list)

[0.8996362600562804, 0.965251654293615, 0.9746923142170515, 0.978140902637488, 0.9848136223693847, 0.9914464552903257, 0.9961339248169335, 0.9981084449020331, 0.9990982287632634, 0.9995400775710478, 0.9996580286589211, 0.9997330048259554, 0.9997417248943208, 0.9997417248943208, 0.9997401376578506, 0.9997385504213808, 0.9997291490976743, 0.9997479517450872, 0.9997479517450872, 0.9997479517450872]


## Compute precision@K

In [15]:
all_user_ids = x_test['userID'].to_list()
all_item_ids = x_test['itemID'].to_list()
all_labels = x_test['label'].to_list()

items_dictionary = {}
mask = []
for i, u in enumerate(all_user_ids):
    items_dictionary[u] = set(np.array(all_item_ids)[np.array(all_user_ids) == u])

item_ids = torch.tensor(all_item_ids)
labels = torch.tensor(all_labels)

k=10

num_hits = 0

for u, i in items_dictionary.items():
    user_ids = torch.tensor([u]*len(all_user_ids))
    predictions = net(user_ids, item_ids)
    _, indices = predictions.topk(k=10, largest=True)
    num_hits += len([x for x in i if x in indices.tolist()])

In [14]:
print(num_hits/len(all_user_ids))

0.10439839647676585


## Build dataset from other KG

In [10]:
entities_index = pd.read_csv("data/kg/entities_index.csv", sep='\t', names=['item', 'id'], header=0)
mappings_df = pd.read_csv("data/kg/mappings.tsv", sep="\t", names=("item_id", "name", "item_uri"), header=0)
kg_df = pd.read_csv("data/kg/kg_no_feedback.csv", sep="::", names=['head', 'rel', 'tail'], header=0)
relations_index = pd.read_csv("data/kg/relations_index.csv", sep='\t', names=['relation', 'relation_id'], header=0)
temp = entities_index.merge(kg_df, left_on="item", right_on="head")
temp = temp.merge(relations_index, left_on="rel", right_on="relation")
temp = temp.merge(entities_index, left_on="tail", right_on="item")
kg_indexed = temp[['id_x', 'relation_id', 'id_y']]

ratings_df = pd.read_csv("data/kg/ratings.csv")
temp = ratings_df.merge(mappings_df, left_on="movieId", right_on="item_id")
temp = temp.merge(entities_index, left_on='item_uri', right_on='item')[['userId', 'rating', 'id']]
# Conversion for comparison.
temp['userId'] = temp['userId'].astype(str)
temp = entities_index.merge(temp, left_on='item', right_on='userId')[['id_x', 'id_y', 'rating']]
temp['label'] = temp['rating'].apply(lambda x: 1 if x >= 4 else 0)
ratings_df = temp[['id_x', 'id_y', 'label']]
ratings_df.columns = ['userID', 'itemID', 'label']

# Reindexing.
from sklearn import preprocessing
le_items = preprocessing.LabelEncoder()
le_items.fit(np.append(np.append(kg_indexed['id_x'].to_numpy(), kg_indexed['id_y'].to_numpy()), ratings_df['itemID'].to_numpy()))
kg_indexed['id_x'] = le_items.transform(kg_indexed['id_x'].to_numpy())
kg_indexed['id_y'] = le_items.transform(kg_indexed['id_y'].to_numpy())
ratings_df['itemID'] = le_items.transform(ratings_df['itemID'].to_numpy())

le_users = preprocessing.LabelEncoder()
le_users.fit(ratings_df['userID'].to_numpy())
ratings_df['userID'] = le_users.transform(ratings_df['userID'].to_numpy())

kg_indexed['relitem'] = list(zip(kg_indexed.relation_id, kg_indexed.id_y))
kg_indexed = kg_indexed[['id_x', 'relitem']]
kg_indexed = kg_indexed.groupby('id_x').agg({'relitem':lambda x: list(x)})
kg_indexed.head()

kg_dict = {}
for row in kg_indexed.itertuples():
    kg_dict[row.Index] = row.relitem
    
kg = kg_dict
df_dataset = ratings_df

  kg_df = pd.read_csv("data/kg/kg_no_feedback.csv", sep="::", names=['head', 'rel', 'tail'], header=0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['itemID'] = le_items.transform(ratings_df['itemID'].to_numpy())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['userID'] = le_users.transform(ratings_df['userID'].to_numpy())
