<a href="https://colab.research.google.com/github/stebechoi/CP2/blob/YJ/FM%E1%84%87%E1%85%A7%E1%86%AB%E1%84%89%E1%85%AE%E1%84%8E%E1%85%AE%E1%84%80%E1%85%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data_path = '/content/drive/MyDrive/CP2/ml-100k/'
r_cols =['userId','movieId','rating','timestamp']
ratings = pd.read_csv(data_path + 'u.data', sep='\t',names = r_cols, encoding ='latin-1')

item_df = pd.read_csv(data_path + 'u.item', sep='|', encoding='latin-1', header=None,
                        names=['movieId', 'movie_title', 'release_date', 'video_release_date',
                               'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
                               'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama',
                               'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                               'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

users_df = pd.read_csv(data_path + 'u.user', sep='|', names = ['userId', 'age', 'gender', 'occupation', 'zip_code'])
users_df = users_df.drop(['zip_code','age','occupation'],axis=1)
item_df = item_df.drop(['movie_title','release_date', 'video_release_date', 'IMDb_URL'], axis=1)
ratings = ratings.drop('timestamp',axis=1)
# item_df = item_df.iloc[:,:2]

In [4]:
#user encoding
user_dict = {}
for i in set(ratings['userId']):
  user_dict[i] = len(user_dict)
n_user = len(user_dict)

#item encoding
item_dict = {}
start_point = n_user
for i in set(ratings['movieId']):
    item_dict[i] = start_point + len(item_dict)
n_item = len(item_dict)
start_point += n_item




#gender
gender_dict = {}
for i in set(users_df['gender']):
  gender_dict[i] = start_point + len(gender_dict)
n_gender = len(gender_dict)
start_point += n_gender

#genre
genre_dict = {}
genre = ['unknown', 'Action', 'Adventure', 'Animation',
         'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama',
         'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
         'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
for i in genre:
  genre_dict[i] = start_point + len(genre_dict)
n_genre = len(genre_dict)
start_point += n_genre
num_x = start_point

x = pd.merge(ratings, item_df, how='outer', on='movieId')
x = pd.merge(x, users_df, how='outer', on='userId')
x = shuffle(x,random_state=1)

#generate x data
#[user_index, movie_index], [user]
data = []
y = []
w0 = np.mean(ratings['rating'])
for i in range(len(x)):
  case = x.iloc[i]
  x_index = []
  x_value = []
  x_index.append(user_dict[case['userId']])
  x_value.append(1)
  x_index.append(item_dict[case['movieId']])
  x_value.append(1)
  x_index.append(gender_dict[case['gender']])
  x_value.append(1)
  for j in genre:
    if case[j] == 1:
      x_index.append(genre_dict[j])
      x_value.append(1)
  data.append([x_index, x_value])
  y.append(case['rating'] - w0)

In [5]:
def RMSE(y_true, y_pred):
  return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

class FM():
  def __init__(self, N, K, data, y, alpha, beta,n_user,n_item, train_ratio=0.75, iterations=100, tolerance=0.005, l2_reg=True, verbose=True):
    self.K = K    #no of latent factors
    self.N = N    #no of x variables
    self.n_cases = len(data)
    self.alpha = alpha
    self.beta = beta
    self.n_user = n_user
    self.n_item = n_item
    self.iterations = iterations
    self.l2_reg = l2_reg
    self.tolerance = tolerance
    self.verbose = verbose

    # w 초기화  변수의 편향
    self.w = np.random.normal(scale=1./self.N, size=(self.N))
    # v 초기화  잠재요인행렬
    self.v = np.random.normal(scale=1./self.K, size=(self.N, self.K))
    #Train/ Test 분리
    cutoff = int(train_ratio* len(data))
    self.train_x = data[:cutoff]
    self.test_x = data[cutoff:]
    self.train_y = y[:cutoff]
    self.test_y = y[cutoff:]

  #학습 함수
  def test(self):
    best_RMSE = 10000
    best_iteration = 0
    training_process = []  #학습과정을 기록
    for i in range(self.iterations):
      rmse1 = self.sgd(self.train_x, self.train_y)
      rmse2 = self.test_rmse(self.test_x, self.test_y)
      training_process.append((i,rmse1,rmse2))
      if self.verbose:
        if(i+1) % 10 == 0:
          print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
      #기존의 RMSE보다 향상되었으면 새로운 RMSE 와 iteration기록
      if best_RMSE> rmse2:
        best_RMSE = rmse2
        best_iteration = i
      elif(rmse2 - best_RMSE) >self.tolerance:

        break
    print(best_iteration, best_RMSE)
    return training_process
  #sgd 실행
  def sgd(self, x_data,  y_data):
    y_pred = []
    for data, y in zip(x_data, y_data):
      x_idx = data[0]
      x_0 = np.array(data[1])
      x_1 = x_0.reshape(-1,1)

      bias_score = np.sum(self.w[x_idx]*x_0)
      vx = self.v[x_idx]*(x_1)
      sum_vx = np.sum(vx, axis=0)
      sum_vx_2 = np.sum(vx*vx, axis=0)
      latent_score = 0.5*np.sum(np.square(sum_vx) - sum_vx_2)

      y_hat = bias_score + latent_score
      y_pred.append(y_hat)
      error = y - y_hat

      if self.l2_reg:
        self.w[x_idx] += error*self.alpha*(x_0 - self.beta*self.w[x_idx])
        self.v[x_idx] += error*self.alpha*((x_1)*sum(vx) - (vx*x_1) - self.beta*self.v[x_idx])
      else:
        self.w[x_idx] += error*self.alpha*x_0
        self.v[x_idx] += error*self.alpha*((x_1)*sum(vx) - (vx*x_1))
    return RMSE(y_data, y_pred)

  def test_rmse(self, x_data, y_data):
    y_pred =[]
    for data, y in zip(x_data, y_data):
      y_hat = self.predict(data[0], data[1])
      y_pred.append(y_hat)
    return RMSE(y_data, y_pred)

  def full_predict(self,x_data,y_data):
    y_pred=[]
    for data,y in zip(x_data, y_data):
      y_hat = self.predict(data[0], data[1])
      y_pred.append(y_hat)
    return y_pred

  
  def predict(self, idx, x):
    x_0 = np.array(x)
    x_1 = x_0.reshape(-1,1)
    bias_score =  np.sum(self.w[idx]*x_0)

    vx = self.v[idx]*(x_1)
    sum_vx = np.sum(vx, axis=0)
    sum_vx_2 = np.sum(vx*vx,axis=0)
    latent_score = 0.5*np.sum(np.square(sum_vx)-sum_vx_2)

    y_hat = bias_score + latent_score
    return y_hat


In [6]:
K=350
fm1 = FM(num_x, K, data, y, alpha=0.0014, beta=0.075,n_user=n_user,n_item=n_item, train_ratio=0.75, iterations=100, tolerance=0.0005, l2_reg=True, verbose=True)
result = fm1.test()

Iteration: 10 ; Train RMSE = 0.910834 ; Test RMSE = 0.943315
Iteration: 20 ; Train RMSE = 0.882020 ; Test RMSE = 0.939918
Iteration: 30 ; Train RMSE = 0.859613 ; Test RMSE = 0.940279
23 0.9397833270599084


장르 젠더 직업 나이 rmse 0.93  
장르 젠더 rmse 0.939 더올라감
