-
Notifications
You must be signed in to change notification settings - Fork 0
/
main_best.py
98 lines (64 loc) · 2.97 KB
/
main_best.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import numpy as np
import sys
# Fix the RNG seed so factor initialization (and hence output) is reproducible.
np.random.seed(42)
# argv[1]: training ratings CSV; argv[2]: target (user, item) pairs to score.
df = pd.read_csv(sys.argv[1])
targets = pd.read_csv(sys.argv[2])
# Both files carry a combined "UserId:ItemId" key column; split it in two.
df[['User', 'Item']] = df['UserId:ItemId'].str.split(':', expand=True)
targets[['User', 'Item']] = targets['UserId:ItemId'].str.split(':', expand=True)
# Dense integer indices for every user/item seen in the TRAINING data.
# NOTE(review): a target user/item absent from df would raise KeyError in the
# prediction lookups below — presumably the inputs guarantee full coverage.
unique_users = df['User'].unique()
unique_items = df['Item'].unique()
user_to_index = {user: idx for idx, user in enumerate(unique_users)}
item_to_index = {item: idx for idx, item in enumerate(unique_items)}
def mini_batch_funkSVD_with_bias(df=df, k=20, learning_rate=0.02, regularization=0.4, epochs=12, batch_size=128):
    """Train a biased FunkSVD model with mini-batch SGD.

    Model: r_hat(u, i) = mu + bu[u] + bi[i] + P[u, :] . Q[i, :]

    Relies on the module-level ``unique_users``, ``unique_items``,
    ``user_to_index`` and ``item_to_index`` built from the training frame.

    Parameters
    ----------
    df : DataFrame with 'User', 'Item' and 'Rating' columns. NOTE: the
        default is bound to the module-level frame at def time.
    k : number of latent factors per user/item.
    learning_rate : initial SGD step size, decayed 5% per epoch.
    regularization : initial L2 penalty, decayed 5% per epoch.
    epochs : full passes over the training data.
    batch_size : rows per mini-batch.

    Returns
    -------
    (P, Q, bu, bi, mu) : user factors (m, k), item factors (n, k),
        user biases (m,), item biases (n,), and the global mean rating.
    """
    m, n = len(unique_users), len(unique_items)
    # Small Gaussian init; uniform [0, 1) initialization was found to work worse.
    P = np.random.normal(scale=1./k, size=(m, k))
    Q = np.random.normal(scale=1./k, size=(n, k))
    # Zero-initialized biases performed better than random ones.
    bu = np.zeros(m)
    bi = np.zeros(n)
    # Global mean rating — the baseline every prediction is built on.
    mu = df['Rating'].mean()
    ratings = df[['User', 'Item', 'Rating']].values
    # Ceiling division: one extra (short) batch for any leftover rows.
    num_batches = len(ratings) // batch_size + (len(ratings) % batch_size != 0)
    for epoch in range(epochs):
        # BUG FIX: the original shuffled each mini-batch slice in place, which
        # never mixes examples across batch boundaries — and the order inside
        # a batch is irrelevant here because updates are applied row by row.
        # Shuffle the whole training set once per epoch instead.
        np.random.shuffle(ratings)
        for batch_num in range(num_batches):
            start = batch_num * batch_size
            end = start + batch_size
            mini_batch = ratings[start:end]
            for user, item, rating in mini_batch:
                i = user_to_index[user]
                j = item_to_index[item]
                error = rating - (mu + bu[i] + bi[j] + P[i, :].dot(Q[j, :].T))
                bu[i] += learning_rate * (error - regularization * bu[i])
                bi[j] += learning_rate * (error - regularization * bi[j])
                # Whole-row (vectorized) updates are faster than looping over k.
                P[i, :] += learning_rate * (error * Q[j, :] - regularization * P[i, :])
                Q[j, :] += learning_rate * (error * P[i, :] - regularization * Q[j, :])
        # Anneal both the step size and the penalty after each epoch.
        learning_rate *= 0.95
        regularization *= 0.95
    return P, Q, bu, bi, mu
def predict_rating_with_bias(user_id, item_id, P, Q, bu, bi, mu, user_to_index, item_to_index):
    """Predict a single rating: global mean + user bias + item bias + factor interaction."""
    u = user_to_index[user_id]
    i = item_to_index[item_id]
    interaction = np.dot(P[u], Q[i])
    return mu + bu[u] + bi[i] + interaction
def get_predictions_with_bias(targets, P, Q, bu, bi, mu, user_to_index, item_to_index):
    """Score every (user, item) pair in *targets*.

    Parameters
    ----------
    targets : DataFrame with 'User' and 'Item' columns.
    P, Q, bu, bi, mu : trained model parameters from mini_batch_funkSVD_with_bias.
    user_to_index, item_to_index : id -> row-index mappings into P/bu and Q/bi.

    Returns
    -------
    list of predicted ratings, aligned with the rows of *targets*.
    """
    # Iterate the two columns directly instead of DataFrame.iterrows():
    # iterrows builds a Series per row (slow) and can coerce dtypes.
    return [
        predict_rating_with_bias(user, item, P, Q, bu, bi, mu, user_to_index, item_to_index)
        for user, item in zip(targets['User'], targets['Item'])
    ]
# Train the model on the module-level training frame (all hyper-parameters at
# their defaults); a, b are the user/item factor matrices.
a, b, bu, bi, mu = mini_batch_funkSVD_with_bias()
# Score every target pair and clamp predictions into the [0, 5] rating range.
targets['Rating'] = np.clip(get_predictions_with_bias(targets, a, b, bu, bi, mu, user_to_index, item_to_index), 0, 5)
# Emit the submission CSV (original key column + predicted rating) on stdout.
print(targets[['UserId:ItemId', 'Rating']].to_csv(index=False))