## Yelp dataset in the paper:

The data provided with the paper is the Yelp dataset, which consists of the following files:
- `item_vector.npy`/`user_vector.npy`: the feature vectors of the items/users, produced by word2vec (using Gensim)
- `yelp.links`: the social graph between the users
- `yelp.test.rating`/`yelp.train.rating`/`yelp.val.rating`: the user-item ratings, split into test/train/validation sets (10%/80%/10%, respectively)

In [2]:
from collections import defaultdict
import numpy as np

In [31]:
filename = "citation/cit-HepTh.txt"

# Parse the tab-separated edge list into:
#   total_user_list - distinct IDs seen in column 0 (rows of `graph`)
#   total_item_list - distinct IDs seen in column 1 (columns of `graph`)
#   hash_data       - {(col0_id, col1_id): 1} for every pair in the file
total_user_list = set()
total_item_list = set()
hash_data = defaultdict(int)

# `with` guarantees the input handle is closed even if a line fails to parse.
with open(filename) as f:
    for line in f:
        stripped = line.strip()
        # Blank lines and '#' comment lines cannot be parsed as an ID pair.
        if not stripped or stripped.startswith("#"):
            continue
        arr = stripped.split("\t")
        user, item = int(arr[0]), int(arr[1])
        hash_data[(user, item)] = 1
        total_user_list.add(user)
        total_item_list.add(item)

# Freeze the ID sets into lists; a position in these lists is the matrix index.
# NOTE: the order is whatever set iteration yields, not sorted by ID.
total_user_list = list(total_user_list)
total_item_list = list(total_item_list)

mapping_user_idx = {user: idx for idx, user in enumerate(total_user_list)}
mapping_item_idx = {item: idx for idx, item in enumerate(total_item_list)}

# Dense 0/1 adjacency matrix: graph[u, i] == 1 iff the pair (u, i) was in the file.
graph = np.zeros((len(total_user_list), len(total_item_list)))
for user, item in hash_data:
    graph[mapping_user_idx[user], mapping_item_idx[item]] = 1

In [33]:
# Learn "free" embeddings by factorizing the adjacency matrix with
# non-negative matrix factorization: graph ~= W @ H.
import numpy as np
from sklearn.decomposition import NMF

model = NMF(init='nndsvd', n_components=150)

# W: one 150-dim row per row of `graph`       -> (graph.shape[0], 150)
# H: one 150-dim column per column of `graph` -> (150, graph.shape[1])
# The tuple is evaluated left to right, so the fit happens before
# `components_` is read.
W, H = model.fit_transform(graph), model.components_



In [40]:
# Persist the NMF factors as embedding tables with one row per entity.
# W (from fit_transform) is already (graph.shape[0], n_components).
np.save("./citation/user_vector.npy", W)
# H is `model.components_`, i.e. (n_components, graph.shape[1]). Transpose it so
# each row is one item's vector, matching the row-per-entity layout of
# user_vector.npy; saving H as-is would store the table sideways.
np.save("./citation/item_vector.npy", H.T)

In [41]:
# Inspect the (rows, columns) dimensions of the dense `graph` matrix.
graph.shape

(25059, 23180)

In [45]:
filename = "citation/cit-HepTh.txt"

# Rewrite the two-column edge list as "<col0>\t<col1>\t1" rows in
# citation/citation.links, and keep every parsed row in `total_data`
# (a list of [col0, col1] string pairs) for later cells.
total_data = []

# Mode "w" (not "a") so re-running this cell regenerates the file instead of
# appending a duplicate copy; `with` closes both handles and flushes the output.
with open(filename) as f, open("citation/citation.links", "w") as f_to:
    for line in f:
        stripped = line.strip()
        # Blank lines and '#' comment lines do not describe a link.
        if not stripped or stripped.startswith("#"):
            continue
        arr = stripped.split("\t")
        total_data.append(arr)
        f_to.write(arr[0] + "\t" + arr[1] + "\t" + "1" + "\n")

In [51]:
# Number of rows collected in `total_data`.
len(total_data)

352807

In [57]:
import random

# Shuffle the rows in place, then cut them into train / validation / test
# partitions and write each one as "<col0>\t<col1>\t1" lines.
# NOTE(review): the cut points are hard-coded; they give roughly an
# 80%/10%/10% split for a 352807-row input and must be recomputed if the
# input size changes. The shuffle is unseeded, so each run yields a
# different split.
random.shuffle(total_data)
train_data = total_data[:282296]
val_data = total_data[282296:317583]
test_data = total_data[317583:]

# Mode "w" (not "a") so a re-run replaces the previous split rather than
# appending a second, differently shuffled copy; `with` closes and flushes
# each file.
with open("citation/citation.train.rating", "w") as f_train:
    f_train.writelines(f"{arr[0]}\t{arr[1]}\t1\n" for arr in train_data)
with open("citation/citation.val.rating", "w") as f_val:
    f_val.writelines(f"{arr[0]}\t{arr[1]}\t1\n" for arr in val_data)
with open("citation/citation.test.rating", "w") as f_test:
    f_test.writelines(f"{arr[0]}\t{arr[1]}\t1\n" for arr in test_data)