In [1]:
import csv
from collections import namedtuple

anime_data = []
with open("data/processed/processed_animes.csv", encoding="utf8", newline="") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"')
    fields = next(reader)
    AnimeInfo = namedtuple("AnimeInfo", fields)
    for item_id, title, synopsis, genre, score, tokenized_synopsis in reader:
        item_id = int(item_id)
        genre = eval(genre)
        score = float(score)
        tokenized_synopsis = eval(tokenized_synopsis)
        anime_data.append(AnimeInfo(item_id, title, synopsis, genre, score, tokenized_synopsis))

review_data = []
with open("data/processed/processed_reviews.csv", encoding="utf8", newline="") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"')
    fields = next(reader)
    ReviewInfo = namedtuple("ReviewInfo", fields)
    for review_id, user_id, item_id, text, rating, tokenized_text in reader:
        review_id = int(review_id)
        user_id = int(user_id)
        item_id = int(item_id)
        rating = int(rating)
        tokenized_text = eval(tokenized_text)
        review_data.append(ReviewInfo(review_id, user_id, item_id - 1, text, rating, tokenized_text))

In [2]:
print("# of anime:", len(set(a.item_id for a in anime_data)))
print("# of users:", len(set(r.user_id for r in review_data)))

# of anime: 2258
# of users: 1529


In [70]:
user_to_review_idxs = {r.user_id : [] for r in review_data}
for i,r in enumerate(review_data):
    user_to_review_idxs[r.user_id].append(i)

In [71]:
# do hold-one-out for evaluation

import random

train_split = []
val_split = []
test_split = []

for _, reviews in user_to_review_idxs.items():
    test_holdout = random.choice(reviews)
    reviews.remove(test_holdout)
    val_holdout = random.choice(reviews)
    reviews.remove(val_holdout)
    
    test_split.append(test_holdout)
    val_split.append(val_holdout)
    train_split.extend(list(reviews))

print(len(train_split), len(val_split), len(test_split))

32554 1529 1529


In [72]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [91]:
class NCF(nn.Module):
    def __init__(self, add_anime_embedding=False, add_user_embedding=False):
        super(NCF, self).__init__()
        self.add_anime_embedding = add_anime_embedding
        self.add_user_embedding = add_user_embedding
        
        user_emb_dim = 128 # 768
        anime_emb_dim = 128 # 768
        
        if add_anime_embedding:
            self.anime_embedding = nn.Embedding(2258, anime_emb_dim)
        
        if add_user_embedding:
            self.user_embedding = nn.Embedding(1529, user_emb_dim)
        
        self.fc1 = nn.Linear(user_emb_dim + anime_emb_dim, 128)
        self.fc2 = nn.Linear(128, 10)
        self.fc3 = nn.Linear(10, 1)
        self.relu = nn.LeakyReLU(0.05)
        #self.dropout = nn.Dropout(0.2)
    
    def forward(self, user, anime):
        # second dim only present if not adding embeddings
        # user: (batch_size, [user_emb_dim])
        # anime: (batch_size, [anime_emb_dim])
        
        if self.add_anime_embedding:
            anime = self.anime_embedding(anime)
        
        if self.add_user_embedding:
            user = self.user_embedding(user)
        
        x = torch.cat((user, anime), dim=1)
        x = self.fc1(x)
        #x = self.dropout(x)
        x = self.relu(x)
        x = self.fc2(x)
        #x = self.dropout(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

In [95]:
## First try basic NCF

model = NCF(True, True).cuda()
opt = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.3)

training_idxs = train_split

for epoch in range(50):
    print(f"Epoch {epoch+1}")
    
    model.train()
    
    random.shuffle(training_idxs)
    
    total_epoch_loss = 0
    minibatch_count = 0
    
    for b_idx in range(0, 64*(len(training_idxs)//64), 64):
        opt.zero_grad()
        users = torch.LongTensor([review_data[training_idxs[i]].user_id for i in range(b_idx, b_idx + 64)]).cuda()
        anime = torch.LongTensor([review_data[training_idxs[i]].item_id for i in range(b_idx, b_idx + 64)]).cuda()
        ratings = torch.FloatTensor([review_data[training_idxs[i]].rating for i in range(b_idx, b_idx + 64)]).cuda()
        
        pred = model(users, anime)[:,0]
        loss = ((ratings - pred)**2).mean()
        loss.backward()
        opt.step()
        
        total_epoch_loss += loss.item()
        minibatch_count += 1
    
    with torch.no_grad():
        model.eval()
        test_users = torch.LongTensor([review_data[idx].user_id for idx in test_split]).cuda()
        test_anime = torch.LongTensor([review_data[idx].item_id for idx in test_split]).cuda()
        test_ratings = torch.FloatTensor([review_data[idx].rating for idx in test_split]).cuda()
        pred = model(test_users, test_anime)
        test_loss = ((test_ratings - pred) ** 2).mean().item()
        
    print(f"train loss {total_epoch_loss/minibatch_count:.2f}")
    print(f"test loss {test_loss:.2f}")

Epoch 1
train loss 9.84
test loss 5.57
Epoch 2
train loss 4.78
test loss 5.16
Epoch 3
train loss 4.71
test loss 4.97
Epoch 4
train loss 4.57
test loss 4.85
Epoch 5
train loss 4.49
test loss 4.75
Epoch 6
train loss 4.46
test loss 4.76
Epoch 7
train loss 4.46
test loss 4.62
Epoch 8
train loss 4.53
test loss 4.61
Epoch 9
train loss 4.62
test loss 4.59
Epoch 10
train loss 4.70
test loss 4.57
Epoch 11
train loss 4.74
test loss 4.71
Epoch 12
train loss 4.75
test loss 4.62
Epoch 13
train loss 4.76
test loss 4.62
Epoch 14
train loss 4.76
test loss 4.56
Epoch 15
train loss 4.76
test loss 4.61
Epoch 16
train loss 4.76
test loss 4.67
Epoch 17
train loss 4.76
test loss 4.57
Epoch 18
train loss 4.75
test loss 4.58
Epoch 19
train loss 4.76
test loss 4.59
Epoch 20
train loss 4.75
test loss 4.61
Epoch 21
train loss 4.75
test loss 4.65
Epoch 22
train loss 4.76
test loss 4.67
Epoch 23
train loss 4.75
test loss 4.56
Epoch 24
train loss 4.75
test loss 4.69
Epoch 25
train loss 4.76
test loss 4.63
Epoch 26


In [57]:
model(torch.LongTensor([523, 522]).cuda(), torch.LongTensor([1223, 1222]).cuda())

tensor([[6.3063],
        [4.3376]], device='cuda:0', grad_fn=<AddmmBackward>)

In [26]:
user_totals = {r.user_id : [0, 0] for r in review_data}

for i in train_split:
    user_totals[review_data[i].user_id][0] += review_data[i].rating
    user_totals[review_data[i].user_id][1] += 1

user_avgs = {k : (a / b) for k,(a,b) in user_totals.items()}

r = 0

for j in test_split:
    pred = user_avgs[review_data[j].user_id]
    r += (pred - review_data[j].rating)**2

print(r / len(test_split))

3.713802968624789


In [27]:
anime_avgs = {a.item_id : a.score for a in anime_data}

r = 0
for j in test_split:
    pred = anime_avgs[review_data[j].item_id]
    r += (pred - review_data[j].rating)**2

print(r / len(test_split))

4.242937017658601


In [96]:
counts = {r.item_id : 0 for r in review_data}
for r in review_data:
    counts[r.item_id] += 1
counts

{0: 6,
 1: 94,
 2: 23,
 3: 19,
 4: 73,
 5: 116,
 6: 120,
 7: 13,
 8: 9,
 9: 68,
 10: 17,
 11: 16,
 12: 17,
 13: 4,
 14: 5,
 15: 15,
 16: 10,
 17: 18,
 18: 8,
 19: 7,
 20: 7,
 21: 25,
 22: 56,
 23: 27,
 24: 37,
 25: 40,
 26: 4,
 27: 37,
 28: 92,
 29: 22,
 30: 10,
 31: 15,
 32: 7,
 33: 5,
 34: 6,
 35: 10,
 36: 15,
 37: 53,
 38: 20,
 39: 7,
 40: 2,
 41: 4,
 42: 2,
 43: 37,
 44: 30,
 45: 5,
 46: 11,
 47: 10,
 48: 1,
 49: 2,
 50: 47,
 51: 43,
 52: 129,
 53: 67,
 54: 46,
 55: 17,
 56: 34,
 57: 6,
 58: 24,
 59: 28,
 60: 18,
 61: 15,
 62: 7,
 63: 6,
 64: 88,
 65: 71,
 66: 18,
 67: 18,
 68: 14,
 69: 13,
 70: 3,
 71: 59,
 72: 20,
 73: 14,
 74: 18,
 75: 2,
 76: 6,
 77: 2,
 78: 3,
 79: 7,
 80: 64,
 81: 16,
 82: 18,
 83: 15,
 84: 4,
 85: 5,
 86: 47,
 87: 7,
 88: 7,
 89: 7,
 90: 10,
 91: 4,
 92: 25,
 93: 16,
 94: 3,
 95: 2,
 96: 5,
 97: 12,
 98: 11,
 99: 34,
 100: 72,
 101: 5,
 102: 5,
 103: 21,
 104: 37,
 105: 14,
 106: 7,
 107: 3,
 108: 8,
 109: 8,
 110: 56,
 111: 12,
 112: 14,
 113: 39,
 114: 44,

In [100]:
sum(1 for x,v in counts.items() if v <= 1)

51