<a href="https://colab.research.google.com/github/minshyee/RecoSyS/blob/main/Hybrid_RecomSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 하이브리드 추천시스템의 원리

In [1]:
from sklearn.model_selection import train_test_split
import random
import numpy as np
import pandas as pd

In [None]:
r_cols = ['user_id','movie_id','rating','timestamp']
data_src = '/content/drive/MyDrive/Recosys/Data/u.data'
ratings = pd.read_csv(data_src, 
                      names=r_cols,
                      sep='\t',
                      encoding='latin-1')

ratings_train, ratings_test = train_test_split(ratings,
                                               test_size=0.2,
                                               shuffle=True,
                                               random_state=2021)

def RMSE(y_true, y_pred):
  return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

### dummy recosys

In [None]:
def recommender_1(recom_list):
  recommandations = []
  for pair in recom_list:
    recommandations.append(random.random() * 4 + 1) # 1-5 사이의 난수(임의의) 평점
  return np.array(recommandations)

def recommender_2(recom_list):
  recommandations = []
  for pair in recom_list:
    recommandations.append(random.random() * 4 + 1) # 1-5 사이의 난수(임의의) 평점
  return np.array(recommandations)

### Hybrid

In [None]:
weight = [0.8, 0.2]
recom_list = np.array(ratings_test)
predictions_1 = recommender_1(recom_list)
predictions_2 = recommender_2(recom_list)

predictions = predictions_1 * weight[0] + predictions_2 * weight[1]
RMSE(recom_list[:,2], predictions)

1.5590157722579836

## CF 와 MF의 결합 추천 시스템

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import os

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
class NEW_MF():
  def __init__(self, ratings, hyper_params):
    self.R = np.array(ratings)
    self.num_users, self.num_items = np.shape(self.R)

    self.K = hyper_params['K']
    self.alpha = hyper_params['alpha']
    self.beta = hyper_params['beta']
    self.iterations = hyper_params['iterations']
    self.verbose = hyper_params['verbose']

    # index 재정렬 + 맵핑
    item_id_index = []
    index_item_id = []
    for i, one_id in enumerate(ratings):
      item_id_index.append([one_id, i])
      index_item_id.append([i, one_id])
    self.item_id_index = dict(item_id_index)
    self.index_item_id = dict(index_item_id)

    user_id_index = []
    index_user_id = []
    for i, one_id in enumerate(ratings.T):
      user_id_index.append([one_id, i])
      index_user_id.append([i, one_id])
    self.user_id_index = dict(user_id_index)
    self.index_user_id = dict(index_user_id)

  def rmse(self):
    xs, ys = self.R.nonzero() # 좌표값 받기
    self.predictions = []
    self.errors = []
    for x,y in zip(xs,ys):
      prediction = self.get_predict(x,y)
      self.predictions.append(prediction)
      self.errors.append(self.R[x,y] - prediction)
    self.predictions = np.array(self.predictions)
    self.errors = np.array(self.errors)
    return np.sqrt(np.mean(self.errors**2))

  def sgd(self):
    for i,j,r in self.samples: # x,y = (i,j) | 평가데이터 r
      prediction = self.get_predict(i,j)
      e = (r - prediction)

      self.b_u[i] += self.alpha * (e - (self.beta * self.b_u[i]))
      self.b_d[j] += self.alpha * (e - (self.beta * self.b_d[j]))

      self.P[i,:] += self.alpha * ((e * self.Q[j,:]) - (self.beta * self.P[i,:]))
      self.Q[j,:] += self.alpha * ((e * self.P[i,:]) - (self.beta * self.Q[j,:]))

  def get_predict(self,i,j):
    prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i,:].dot(self.Q[j,:].T)
    return prediction

  def set_test(self,ratings_test):
    test_set = []
    for i in range(len(ratings_test)):
      x = self.user_id_index[ratings_test.iloc[i,0]]
      y = self.item_id_index[ratings_test.iloc[i,1]]
      z = ratings_test.iloc[i,2]
      test_set.append([x,y,z])
      self.R[x,y] = 0
    self.test_set = test_set
    return test_set

  def test_rmse(self):
    error = 0
    for one_set in self.test_set:
      predicted = self.get_predict(one_set[0], one_set[1])
      error += pow(one_set[2] - predicted, 2)
    return np.sqrt(error/len(self.test_set))

  def test(self):
    self.P = np.random.normal(scale=1./self.K,
                              size=(self.num_users,self.K))
    self.Q = np.random.normal(scale=1./self.K,
                              size=(self.num_items,self.K))
    self.b_u = np.zeros(self.num_users)
    self.b_d = np.zeros(self.num_items)
    self.b = np.mean(self.R[self.R.nonzero()])

    rows, columns = self.R.nonzero()
    self.samples = [(i,j,self.R[i,j]) for i, j in zip(rows, columns)]

    training_process = []
    for i in range(self.iterations):
      np.random.shuffle(self.samples)
      self.sgd()
      rmse_train = self.rmse()
      rmse_test = self.test_rmse()
      training_process.append((i+1, rmse_train, rmse_test))
      if self.verbose:
        if (i+1) % 10 == 0:
          print(f"Iteration : {i+1} | Train RMSE : {rmse_train} | Test RMSE : {rmse_test}")
    
    return training_process

  def get_one_predict(self,user_id,item_id):
    return self.get_predict(self.user_id_index[user_id],
                            self.item_id_index[item_id])
    
  def full_predict(self):
    return self.b + self.b_u[:, np.newaxis] + self.b_d[np.newaxis, :] + self.P.dot(self.Q.T)

In [5]:
base_src = '/content/drive/MyDrive/Recosys/Data'
u_data_src = os.path.join(base_src,'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src,
                      sep='\t',
                      names=r_cols,
                      encoding='latin-1')

R_temp = ratings.pivot(index='user_id',
                       columns='movie_id',
                       values='rating').fillna(0)

ratings_train, ratings_test = train_test_split(ratings, test_size=0.2,
                                               shuffle=True,
                                               random_state=2021)

hyper_params = {'K':30,
                'alpha':0.001,
                'beta':0.02,
                'iterations':100,
                'verbose':True
                }

mf = NEW_MF(R_temp,hyper_params)
test_set = mf.set_test(ratings_test)
result = mf.test()

Iteration : 10 | Train RMSE : 0.9675487915022118 | Test RMSE : 0.9686949463988526
Iteration : 20 | Train RMSE : 0.9436343816665653 | Test RMSE : 0.9488925519125235
Iteration : 30 | Train RMSE : 0.9328495568752477 | Test RMSE : 0.9410437364972402
Iteration : 40 | Train RMSE : 0.926396227884399 | Test RMSE : 0.9369473782623232
Iteration : 50 | Train RMSE : 0.9218466443997693 | Test RMSE : 0.93448556535977
Iteration : 60 | Train RMSE : 0.9181089521251821 | Test RMSE : 0.9328534261191967
Iteration : 70 | Train RMSE : 0.9145319107724595 | Test RMSE : 0.931580155553979
Iteration : 80 | Train RMSE : 0.9105811065864007 | Test RMSE : 0.9304330121867795
Iteration : 90 | Train RMSE : 0.9056923871078628 | Test RMSE : 0.9291106497599435
Iteration : 100 | Train RMSE : 0.8992331733616449 | Test RMSE : 0.9273652390083715


In [6]:
rating_matrix = ratings_train.pivot(index='user_id', columns='movie_id', values='rating')

rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                               index=rating_matrix.index,
                               columns=rating_matrix.index)

In [7]:
def CF_knn_bias(user_id, movie_id, neighbor_size=0):
  if movie_id in rating_bias.columns:
    sim_scores = user_similarity[user_id].copy()
    movie_ratings = rating_bias[movie_id].copy()
    # 평가하지 않은 것 제외
    none_rating_idx = movie_ratings[movie_ratings.isnull()].index
    movie_ratings = movie_ratings.drop(none_rating_idx)
    sim_scores = sim_scores.drop(none_rating_idx)

    if neighbor_size == 0:
      prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
      prediction = prediction + rating_mean[user_id]

    else:
      if len(sim_scores) > 1:
        # 지정한 이웃 수보다 유사도 수가 작은지 검사
        neighbor_size = min(neighbor_size, len(sim_scores))
        # 연산준비
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        # 작은 유사도 index부터 sorting
        user_idx = np.argsort(sim_scores)
        # 유사도 큰 애들을 이웃 사이즈 만큼 가져오기
        sim_scores = sim_scores[user_idx][-neighbor_size:]
        movie_ratings = movie_ratings[user_idx][-neighbor_size:]
        #예측
        prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        prediction = prediction + rating_mean[user_id]
      else:
        prediction = rating_mean[user_id]
  else:
    prediction = rating_mean[user_id]
  
  return prediction

def RMSE(y_true, y_pred):
  return np.sqrt(np.mean((np.array(y_true - y_pred))**2))


## Hybrid Recom

In [11]:
def recommender_mf(recom_list, mf):
  recommendations = np.array([mf.get_one_predict(user, movie) for (user, movie) in recom_list]) # recom list에 있는 순서대로 mf로 예측
  return recommendations
def recommender_cf(recom_list, neighbor_size=0):
  recommendations = np.array([CF_knn_bias(user, movie, neighbor_size) for (user,movie) in recom_list])
  return recommendations

recom_list = np.array(ratings_test.iloc[:, [0,1]]) # 전체에 대해 유저id, 무비 id를 받아오겠다.

predictions_mf = recommender_mf(recom_list, mf)
predictions_cf = recommender_cf(recom_list, 37)

print('reco_mf : ', RMSE(ratings_test.iloc[:,2], predictions_mf))
print('reco_cf : ', RMSE(ratings_test.iloc[:,2], predictions_cf))

weight = [0.8, 0.2]
predictions = predictions_mf * weight[0] + predictions_cf * weight[1]
print('hybrid : ', RMSE(ratings_test.iloc[:,2],predictions))

reco_mf :  0.9273652390083715
reco_cf :  0.929036632035968
hybrid :  0.9237143527699725


## Hybrid Weight Fine tuning

In [22]:
result = []
weight_rate = []
for i in np.arange(0,0.1,0.01):
  weight = [i, 1.0 - i]
  predictions = predictions_mf * weight[0] + predictions_cf * weight[1]
  print("weights - %.2f : %.2f RMSE = %.7f"%(weight[0], weight[1], RMSE(ratings_test.iloc[:,2],predictions)))
  result.append(RMSE(ratings_test.iloc[:,2], predictions))
  weight_rate.append(weight)
print(min(result))
index_min = result.index(min(result))
print(weight_rate[index_min])

weights - 0.00 : 1.00 RMSE = 0.9290366
weights - 0.01 : 0.99 RMSE = 0.9287742
weights - 0.02 : 0.98 RMSE = 0.9285166
weights - 0.03 : 0.97 RMSE = 0.9282640
weights - 0.04 : 0.96 RMSE = 0.9280162
weights - 0.05 : 0.95 RMSE = 0.9277734
weights - 0.06 : 0.94 RMSE = 0.9275354
weights - 0.07 : 0.93 RMSE = 0.9273024
weights - 0.08 : 0.92 RMSE = 0.9270743
weights - 0.09 : 0.91 RMSE = 0.9268511
0.9268510528644012
[0.09, 0.91]
