In [None]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

In [None]:
# Column names assigned to the tab-separated ratings file (the file path
# suggests MovieLens 100K `u.data` -- confirm against the dataset README).
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '/content/drive/MyDrive/CP2/ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

In [None]:
# --- Feature index space ----------------------------------------------------
# Every distinct user and every distinct movie gets its own column index:
# users occupy [0, n_user) and movies follow at [n_user, n_user + n_item).
user_dict = {user_id: idx for idx, user_id in enumerate(set(ratings['user_id']))}
n_user = len(user_dict)

item_dict = {movie_id: n_user + idx for idx, movie_id in enumerate(set(ratings['movie_id']))}
n_item = len(item_dict)

# Total number of feature columns (users + movies).
start_point = n_user + n_item
num_x = start_point

# NOTE: this rebinds `ratings`, so re-running the cell without reloading the
# file shuffles the already-shuffled frame and yields a different row order.
ratings = shuffle(ratings, random_state=1)

# --- Sparse samples -----------------------------------------------------------
# Each sample is [[user_index, movie_index], [1, 1]]: the indices of the two
# active features and their values. The target is the rating centred on the
# global mean rating w0.
data = []
y = []
w0 = np.mean(ratings['rating'])

# Iterate the three needed columns directly; building a row Series with
# `ratings.iloc[i]` for every row is far slower and yields the same values.
rows = zip(ratings['user_id'], ratings['movie_id'], ratings['rating'])
for i, (user_id, movie_id, rating) in enumerate(rows):
  data.append([[user_dict[user_id], item_dict[movie_id]], [1, 1]])
  y.append(rating - w0)
  if (i%10000) ==0:
    print('Encoding', i, 'cases...')

Encoding 0 cases...
Encoding 10000 cases...
Encoding 20000 cases...
Encoding 30000 cases...
Encoding 40000 cases...
Encoding 50000 cases...
Encoding 60000 cases...
Encoding 70000 cases...
Encoding 80000 cases...
Encoding 90000 cases...


In [None]:
#np.array(data[1]).reshape(-1,1)

In [None]:
#w = np.random.normal(scale=1./num_x, size=(num_x))

In [None]:
#type(w)

In [None]:
#cutoff = int(0.75* len(data))
# train_x = data[:cutoff]
# test_x = data[cutoff:]
# train_y = y[:cutoff]
# test_y = y[cutoff:]

In [None]:
# for data, y in zip(train_x, train_y):
#       x_idx = data[0]
#       x_0 = np.array(data[1])
#       x_1 = x_0.reshape(-1,1)

In [None]:
def RMSE(y_true, y_pred):
  """Return the root-mean-square error between `y_true` and `y_pred`.

  Both arguments are converted to NumPy arrays, subtracted elementwise,
  squared, averaged, and the square root of that mean is returned.
  """
  residuals = np.array(y_true) - np.array(y_pred)
  mean_squared = np.mean(residuals ** 2)
  return np.sqrt(mean_squared)

class FM():
  """Second-order factorization machine trained with per-sample SGD.

  Each sample is a pair ``[indices, values]`` listing the active feature
  columns and their values. The prediction is the sum of the linear weights
  of the active features plus the pairwise interaction term

      0.5 * sum_f ((sum_i v[i, f] * x[i])**2 - sum_i (v[i, f] * x[i])**2)

  There is no global bias term inside the model; callers are expected to
  centre the targets themselves.
  """

  def __init__(self, N, K, data, y, alpha, beta, train_ratio=0.75, iterations=100, tolerance=0.005, l2_reg=True, verbose=True, random_state=None):
    """Initialise the parameters and split the samples into train / test.

    Args:
      N: number of feature columns (size of ``w``, rows of ``v``).
      K: number of latent factors (columns of ``v``).
      data: sequence of ``[indices, values]`` samples.
      y: sequence of targets, one per sample.
      alpha: SGD learning rate.
      beta: L2 regularisation strength (used only when ``l2_reg`` is true).
      train_ratio: leading fraction of ``data`` used for training; the
        remainder is the test split. No shuffling is done here.
      iterations: maximum number of epochs run by ``test()``.
      tolerance: early-stopping threshold on test-RMSE degradation.
      l2_reg: whether to apply the L2 penalty in the SGD step.
      verbose: print progress every 10 epochs when true.
      random_state: optional seed for the parameter initialisation. When
        ``None`` the global ``np.random`` state is used, as before.

    Raises:
      ValueError: if ``data`` and ``y`` have different lengths.
    """
    if len(data) != len(y):
      raise ValueError(
          "data and y must have the same length (got %d and %d)" % (len(data), len(y)))

    self.K = K    # number of latent factors
    self.N = N    # number of x variables (feature columns)
    self.n_cases = len(data)
    self.alpha = alpha
    self.beta = beta
    self.iterations = iterations
    self.l2_reg = l2_reg
    self.tolerance = tolerance
    self.verbose = verbose

    # A private RandomState makes the initialisation reproducible on request
    # without touching the global NumPy random state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Linear weight of every feature column.
    self.w = rng.normal(scale=1./self.N, size=(self.N))
    # Latent factor matrix: one K-dimensional vector per feature column.
    self.v = rng.normal(scale=1./self.K, size=(self.N, self.K))

    # Train / test split: the first `train_ratio` share of samples is training.
    cutoff = int(train_ratio * len(data))
    self.train_x = data[:cutoff]
    self.test_x = data[cutoff:]
    self.train_y = y[:cutoff]
    self.test_y = y[cutoff:]

  def test(self):
    """Train the model and return the per-epoch history.

    Despite its name this is the training loop: each epoch runs one SGD pass
    over the training split and then scores the test split. Training stops
    early once the test RMSE exceeds the best value seen so far by more than
    ``tolerance``.

    Returns:
      List of ``(epoch_index, train_rmse, test_rmse)`` tuples, where
      ``epoch_index`` is 0-based.
    """
    best_RMSE = 10000
    best_iteration = 0
    training_process = []  # history of the training run
    for i in range(self.iterations):
      rmse1 = self.sgd(self.train_x, self.train_y)
      rmse2 = self.test_rmse(self.test_x, self.test_y)
      training_process.append((i, rmse1, rmse2))
      if self.verbose and (i+1) % 10 == 0:
        print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
      # Record the new best test RMSE and the epoch it occurred in; otherwise
      # stop once the test RMSE has degraded by more than the tolerance.
      if best_RMSE > rmse2:
        best_RMSE = rmse2
        best_iteration = i
      elif (rmse2 - best_RMSE) > self.tolerance:
        break
    # best_iteration is the 0-based epoch index of the best test RMSE.
    print(best_iteration, best_RMSE)
    return training_process

  def _score(self, idx, x):
    """Return the prediction for one sample plus the terms the SGD step reuses.

    Returns:
      ``(y_hat, x_0, x_1, vx, sum_vx)`` where ``x_0`` is the value array,
      ``x_1`` is the same as a column vector, ``vx`` is ``v[idx] * x_1`` and
      ``sum_vx`` is ``vx`` summed over the active features.
    """
    x_0 = np.array(x)
    x_1 = x_0.reshape(-1, 1)
    bias_score = np.sum(self.w[idx] * x_0)

    vx = self.v[idx] * x_1
    sum_vx = np.sum(vx, axis=0)
    sum_vx_2 = np.sum(vx * vx, axis=0)
    latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

    return bias_score + latent_score, x_0, x_1, vx, sum_vx

  def sgd(self, x_data, y_data):
    """Run one SGD pass over the samples, updating ``w`` and ``v`` in place.

    Returns:
      RMSE of the predictions made during the pass; each prediction is taken
      before the update for that sample is applied.
    """
    y_pred = []
    for data, y in zip(x_data, y_data):
      x_idx = data[0]
      y_hat, x_0, x_1, vx, sum_vx = self._score(x_idx, data[1])
      y_pred.append(y_hat)
      error = y - y_hat

      # Descent direction of the squared error w.r.t. the active parameters.
      step_w = error * x_0
      step_v = error * (x_1 * sum_vx - vx * x_1)
      if self.l2_reg:
        # The L2 penalty must not be scaled by the error: weight decay always
        # pulls towards zero, whatever the sign or size of the residual.
        step_w = step_w - self.beta * self.w[x_idx]
        step_v = step_v - self.beta * self.v[x_idx]

      self.w[x_idx] += self.alpha * step_w
      self.v[x_idx] += self.alpha * step_v
    return RMSE(y_data, y_pred)

  def test_rmse(self, x_data, y_data):
    """Return the RMSE of the current model on the given samples."""
    y_pred = [self.predict(data[0], data[1]) for data in x_data]
    return RMSE(y_data, y_pred)

  def predict(self, idx, x):
    """Return the model output for one sample given as indices and values."""
    return self._score(idx, x)[0]

# Fit a factorization machine with K latent factors on the encoded samples.
# FM.test() runs the training epochs and returns the per-epoch RMSE history.
K = 350
fm1 = FM(
    num_x,
    K,
    data,
    y,
    alpha=0.0014,
    beta=0.075,
    train_ratio=0.75,
    iterations=600,
    tolerance=0.0005,
    l2_reg=True,
    verbose=True,
)
result = fm1.test()






Iteration: 10 ; Train RMSE = 0.958511 ; Test RMSE = 0.965309
Iteration: 20 ; Train RMSE = 0.936597 ; Test RMSE = 0.949987
Iteration: 30 ; Train RMSE = 0.927652 ; Test RMSE = 0.944258
Iteration: 40 ; Train RMSE = 0.922805 ; Test RMSE = 0.941442
Iteration: 50 ; Train RMSE = 0.919652 ; Test RMSE = 0.939835
Iteration: 60 ; Train RMSE = 0.917090 ; Test RMSE = 0.938739
Iteration: 70 ; Train RMSE = 0.914255 ; Test RMSE = 0.937713
Iteration: 80 ; Train RMSE = 0.909994 ; Test RMSE = 0.936253
Iteration: 90 ; Train RMSE = 0.902500 ; Test RMSE = 0.933620
Iteration: 100 ; Train RMSE = 0.889812 ; Test RMSE = 0.929166
Iteration: 110 ; Train RMSE = 0.871740 ; Test RMSE = 0.923433
Iteration: 120 ; Train RMSE = 0.849695 ; Test RMSE = 0.917914
Iteration: 130 ; Train RMSE = 0.823825 ; Test RMSE = 0.913204
Iteration: 140 ; Train RMSE = 0.793240 ; Test RMSE = 0.909307
Iteration: 150 ; Train RMSE = 0.757716 ; Test RMSE = 0.906462
Iteration: 160 ; Train RMSE = 0.718162 ; Test RMSE = 0.905007
Iteration: 170 ; 