In [300]:
import csv
from random import random, shuffle
from collections import defaultdict

# --- Load the comment log ---------------------------------------------------
# post_comments holds one [post_id, user_name] row per comment; the first line
# of the file is treated as data (no header row is skipped here).
# posts collects the distinct post ids that appear in that log.
post_comments = []
posts = set()
# newline='' hands newline handling to the csv module, so a quoted field that
# contains a line break is still parsed as a single field.
with open("/Users/mtrencseni/Downloads/post_comments_1000.csv", newline='') as csv_file:
    for row in csv.reader(csv_file, delimiter=','):
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue
        post_comments.append(row)
        posts.add(row[0])

# --- Load post titles -------------------------------------------------------
# Column 0 is used as the post id and column 2 as the title; both directions
# of the mapping are kept so posts can be looked up by title later.
id_to_title = {}
title_to_id = {}
with open("/Users/mtrencseni/Downloads/top_1000_posts.csv", newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Skip the first row (presumably a header); the default keeps an empty
    # file from raising StopIteration.
    next(csv_reader, None)
    for row in csv_reader:
        if not row:
            continue
        id_to_title[row[0]] = row[2]
        title_to_id[row[2]] = row[0]

# --- Summary statistics -----------------------------------------------------
print('Posts: %d' % len(posts))
# Sorted so that the user -> index assignment below does not depend on set
# iteration order (which varies between interpreter runs for strings).
users = sorted(set(row[1] for row in post_comments))
print('Users: %d' % len(users))
num_vectors = len(posts) + len(users)
print('Total vectors: %d' % num_vectors)
print('Comments: %d' % len(post_comments))

# --- Index assignment -------------------------------------------------------
# Posts and users share one embedding table: posts occupy indices
# [0, len(posts)) and users occupy [min_user_idx, num_vectors).
post_lookup = {post: index for index, post in enumerate(sorted(posts))}
min_user_idx = len(post_lookup)
user_lookup = {user: index for index, user in enumerate(users, start=min_user_idx)}

# Every comment as a [post_index, user_index] pair. These must stay lists:
# build_minibatch appends the label with list concatenation.
idx_list = [[post_lookup[post], user_lookup[user]] for post, user in post_comments]

# user_index -> set of post indices that user commented on; used to reject
# sampled "negative" pairs that are actually positives.
idx_user_posts = defaultdict(set)
for post_idx, user_idx in idx_list:
    idx_user_posts[user_idx].add(post_idx)

Posts: 989
Users: 17684
Total vectors: 18673
Comments: 170360


In [303]:
import torch

def build_minibatch(num_positives, num_negatives):
    """Assemble a shuffled training batch of labelled (post, user) index pairs.

    Positive examples are drawn uniformly, with replacement, from the observed
    comment pairs in ``idx_list`` and carry label ``1``. Negative examples are
    random (post, user) pairs for which the user has no recorded comment on
    the post (per ``idx_user_posts``) and carry label ``-1``.

    Args:
        num_positives: number of observed pairs to sample.
        num_negatives: number of unobserved pairs to sample.

    Returns:
        A list of ``[post_index, user_index, label]`` lists in random order.
    """
    def draw_negative():
        # Rejection sampling: keep drawing until the pair is not an observed
        # comment. The post is drawn before the user on every attempt.
        while True:
            post = int(len(posts) * random())
            user = min_user_idx + int(len(users) * random())
            if post not in idx_user_posts[user]:
                return [post, user] + [-1]

    positives = [
        idx_list[int(len(idx_list) * random())] + [1]
        for _ in range(num_positives)
    ]
    negatives = [draw_negative() for _ in range(num_negatives)]

    minibatch = positives + negatives
    shuffle(minibatch)
    return minibatch

class Model(torch.nn.Module):
    """Single shared embedding table scored by pairwise dot product.

    Every entity (the training code uses both posts and users) owns one row of
    the table. Rows are created with ``max_norm=1.0``, so a row that is looked
    up is renormalised to have norm at most 1.
    """

    def __init__(self, num_vectors, embedding_dim):
        """Create an embedding table of ``num_vectors`` rows of size ``embedding_dim``."""
        super().__init__()
        self.embedding = torch.nn.Embedding(num_vectors, embedding_dim, max_norm=1.0)

    def forward(self, input):
        """Return the dot product of the two embeddings named by each row.

        Args:
            input: sequence of rows whose first two items are embedding
                indices; any further items (e.g. a label) are ignored.

        Returns:
            A 1-D tensor with one dot product per row of ``input``.
        """
        batch_size = len(input)
        left = self.embedding(torch.LongTensor([row[0] for row in input]))
        right = self.embedding(torch.LongTensor([row[1] for row in input]))
        # Batched (1 x dim) @ (dim x 1) products: one scalar per pair.
        products = torch.bmm(left.unsqueeze(1), right.unsqueeze(2))
        return products.reshape(batch_size)

# --- Hyperparameters --------------------------------------------------------
embedding_dim = 50
num_epochs = 50
num_positives = 500
num_negatives = 500
minibatch_size = num_positives + num_negatives
# One "epoch" draws roughly as many positive samples as there are comments.
# max(1, ...) guarantees at least one step so `loss` is always defined when
# the per-epoch summary is printed, even for very small datasets.
num_steps_per_epoch = max(1, len(post_comments) // num_positives)

# --- Model, optimizer, loss -------------------------------------------------
model = Model(num_vectors, embedding_dim)
optimizer = torch.optim.Adam(model.parameters())
# Regress the dot product of each (post, user) pair onto its label:
# +1 for observed comments, -1 for sampled non-comments.
loss_function = torch.nn.MSELoss(reduction='mean')

# --- Training loop ----------------------------------------------------------
for i in range(num_epochs):
    for j in range(num_steps_per_epoch):
        optimizer.zero_grad()
        minibatch = build_minibatch(num_positives, num_negatives)
        # Call the module itself rather than .forward() so that any
        # registered hooks run.
        y = model(minibatch)
        target = torch.tensor([v[2] for v in minibatch], dtype=torch.float)
        loss = loss_function(y, target)
        if i == 0 and j == 0:
            # Loss of the freshly initialised model, before any update.
            print('r: loss = %.3f' % float(loss))
        # A new graph is built on every step, so there is nothing to retain.
        loss.backward()
        optimizer.step()
    # Loss of the last minibatch of the epoch (not an epoch average).
    print('%s: loss = %.3f' % (i, float(loss)))

# print out some samples to see how good the fit is
minibatch = build_minibatch(5, 5)
with torch.no_grad():  # evaluation only; no gradients needed
    y = model(minibatch)
target = torch.tensor([v[2] for v in minibatch], dtype=torch.float)
for i in range(5+5):
    print('%.3f vs %.3f' % (float(y[i]), float(target[i])))

r: loss = 1.016
0: loss = 1.009
1: loss = 0.967
2: loss = 0.939
3: loss = 0.916
4: loss = 0.891
5: loss = 0.850
6: loss = 0.825
7: loss = 0.791
8: loss = 0.770
9: loss = 0.740
10: loss = 0.701
11: loss = 0.709
12: loss = 0.709
13: loss = 0.701
14: loss = 0.668
15: loss = 0.671
16: loss = 0.670
17: loss = 0.686
18: loss = 0.666
19: loss = 0.668
20: loss = 0.643
21: loss = 0.664
22: loss = 0.644
23: loss = 0.644
24: loss = 0.612
25: loss = 0.624
26: loss = 0.652
27: loss = 0.641
28: loss = 0.655
29: loss = 0.606
30: loss = 0.639
31: loss = 0.642
32: loss = 0.619
33: loss = 0.598
34: loss = 0.618
35: loss = 0.604
36: loss = 0.635
37: loss = 0.639
38: loss = 0.626
39: loss = 0.624
40: loss = 0.630
41: loss = 0.634
42: loss = 0.631
43: loss = 0.617
44: loss = 0.620
45: loss = 0.618
46: loss = 0.602
47: loss = 0.610
48: loss = 0.619
49: loss = 0.633
0.319 vs -1.000
0.395 vs 1.000
0.226 vs -1.000
-0.232 vs -1.000
0.537 vs 1.000
0.179 vs -1.000
-0.020 vs 1.000
0.392 vs 1.000
0.141 vs 1.000
-0.

In [309]:
import numpy as np

def get_post_vector(post_id):
    """Return the embedding row that ``post_lookup`` assigns to ``post_id``.

    Raises:
        KeyError: if ``post_id`` is not a key of ``post_lookup``.
    """
    row_index = post_lookup[post_id]
    embedding_table = model.embedding.weight
    return embedding_table[row_index]

def similar_posts_by_title(title, top_n=3):
    """Find the posts whose embeddings best match the post with this title.

    Similarity is the dot product between the reference post's embedding and
    each other post's embedding (larger means more similar).

    Args:
        title: exact title of the reference post, as stored in ``title_to_id``.
        top_n: maximum number of results to return (default 3).

    Returns:
        A list of ``[score, url, title]`` lists ordered from most to least
        similar, where ``score`` is a Python float. The reference post itself
        is never included.

    Raises:
        KeyError: if ``title`` is not a key of ``title_to_id``.
        ValueError: if ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')
    post_id = title_to_id[title]
    dists = []
    # Pure lookup: skip autograd bookkeeping for the dot products.
    with torch.no_grad():
        pv = get_post_vector(post_id)
        for other_post in posts:
            if other_post == post_id:
                continue
            dist = torch.dot(pv, get_post_vector(other_post))
            dists.append([
                float(dist),
                'https://news.ycombinator.com/item?id=' + other_post,
                id_to_title[other_post],
            ])
    # Rows compare by score first; ties fall back to the (unique) URL.
    return sorted(dists, reverse=True)[:top_n]

def get_user_vector(user):
    """Return the embedding row that ``user_lookup`` assigns to ``user``.

    Raises:
        KeyError: if ``user`` is not a key of ``user_lookup``.
    """
    row_index = user_lookup[user]
    embedding_table = model.embedding.weight
    return embedding_table[row_index]

def similar_users_by_name(user, top_n=3):
    """Find the users whose embeddings best match the given user's embedding.

    Similarity is the dot product between the two users' embedding vectors
    (larger means more similar).

    Args:
        user: name of the reference user.
        top_n: maximum number of results to return (default 3).

    Returns:
        A list of ``[score, url, user_name]`` lists ordered from most to least
        similar, where ``score`` is a Python float. The reference user is
        never included.

    Raises:
        ValueError: if ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')
    dists = []
    # Pure lookup: skip autograd bookkeeping for the dot products.
    with torch.no_grad():
        uv = get_user_vector(user)
        for other_user in users:
            if other_user == user:
                continue
            dist = torch.dot(uv, get_user_vector(other_user))
            dists.append([
                float(dist),
                'https://news.ycombinator.com/user?id=' + other_user,
                other_user,
            ])
    # Rows compare by score first; ties fall back to the URL string.
    return sorted(dists, reverse=True)[:top_n]

def posts_recommended_for_user(user, top_n=10):
    """Rank the posts a user has not commented on by embedding similarity.

    Each candidate post is scored by the dot product between the user's
    embedding and the post's embedding; posts the user already commented on
    (according to ``idx_user_posts``) are excluded.

    Args:
        user: name of the user to recommend for.
        top_n: maximum number of results to return (default 10).

    Returns:
        A list of ``[score, url, title]`` lists ordered from highest to lowest
        score, where ``score`` is a Python float.

    Raises:
        KeyError: if ``user`` is not a key of ``user_lookup``.
        ValueError: if ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')
    # The user's history does not change during the loop; look it up once.
    already_commented = idx_user_posts[user_lookup[user]]
    dists = []
    # Pure lookup: skip autograd bookkeeping for the dot products.
    with torch.no_grad():
        uv = get_user_vector(user)
        for post in posts:
            if post_lookup[post] in already_commented:
                continue
            dist = torch.dot(uv, get_post_vector(post))
            dists.append([
                float(dist),
                'https://news.ycombinator.com/item?id=' + post,
                id_to_title[post],
            ])
    # Rows compare by score first; ties fall back to the (unique) URL.
    return sorted(dists, reverse=True)[:top_n]

# Titles used to probe the post-to-post similarity lookup.
test_posts = [
    """Self-driving Uber car kills Arizona woman crossing street""",
    """Ask HN: Who is hiring? (May 2018)""",
    """Conversations with a six-year-old on functional programming""",
    """You probably don't need AI/ML. You can make do with well written SQL scripts""",
    """Bitcoin has little shot at ever being a major global currency""",
    """2018 MacBook Pro Review""",
]


def _print_rows(rows):
    """Print each result row on its own line, followed by one blank line."""
    for row in rows:
        print(row)
    print()


# Nearest neighbours for each probe post.
for post_title in test_posts:
    print('Posts similar to: ' + post_title)
    similars = similar_posts_by_title(post_title)
    _print_rows(similars)

# Recommendations and nearest neighbours for a single user.
user = 'Maro'

print('Posts recommended for: ' + user)
similars = posts_recommended_for_user(user)
_print_rows(similars)

print('Users similar to: ' + user)
similars = similar_users_by_name(user)
_print_rows(similars)

Posts similar to: Self-driving Uber car kills Arizona woman crossing street
[0.890119731426239, 'https://news.ycombinator.com/item?id=16643056', 'Tempe Police Release Video of Uber Accident']
[0.6932501792907715, 'https://news.ycombinator.com/item?id=16629673', 'Police Say Video Shows Woman Stepped Suddenly in Front of Self-Driving Uber']
[0.6842777729034424, 'https://news.ycombinator.com/item?id=16761602', 'Tesla crash in September showed similarities to fatal Mountain View accident']

Posts similar to: Ask HN: Who is hiring? (May 2018)
[0.9896665215492249, 'https://news.ycombinator.com/item?id=16735011', 'Ask HN: Who is hiring? (April 2018)']
[0.9825029373168945, 'https://news.ycombinator.com/item?id=17205865', 'Ask HN: Who is hiring? (June 2018)']
[0.9820041656494141, 'https://news.ycombinator.com/item?id=18113144', 'Ask HN: Who is hiring? (October 2018)']

Posts similar to: Conversations with a six-year-old on functional programming
[0.7625867128372192, 'https://news.ycombinator.co