<a href="https://colab.research.google.com/github/stebechoi/CP2/blob/YJ/MF_%EC%B5%9C%EC%A2%85.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 데이터 불러오기

In [4]:
data_path = '/content/drive/MyDrive/CP2/ml-100k/'
r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(data_path + "u.data",names=r_cols, sep='\t',encoding='latin-1')
ratings = ratings[['user_id','movie_id','rating']].astype(int) #timestamp 제거

u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv('/content/drive/MyDrive/CP2/ml-100k/u.user',sep='\|',names=u_cols,encoding='latin-1')
users = users[['user_id','occupation']]

item_df = pd.read_csv(data_path + 'u.item', sep='|', encoding='latin-1', header=None,
                        names=['movie_id', 'movie_title', 'release_date', 'video_release_date',
                               'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
                               'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama',
                               'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                               'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

item_df = item_df.iloc[:,:2]

  users = pd.read_csv('/content/drive/MyDrive/CP2/ml-100k/u.user',sep='\|',names=u_cols,encoding='latin-1')


## 데이터 병합

In [5]:
ratingcount = ratings.groupby(['movie_id'])['rating'].count().reset_index().rename(columns = {'rating': 'TotalRatingCount'})
n_ratings = pd.merge(ratings,ratingcount,how='left',on='movie_id')
n_ratings = pd.merge(n_ratings,item_df,how='left', on='movie_id')

df_movie_100 = n_ratings[n_ratings['TotalRatingCount']>=100]
df_movie_100 = df_movie_100.reset_index(drop=True)
ratings = df_movie_100.iloc[:,:3]

## Train / Test

In [6]:
train_size = 0.75
ratings = shuffle(ratings, random_state=1)
cutoff = int(train_size*len(ratings))
ratings_train = ratings.iloc[:cutoff]
ratings_test = ratings.iloc[cutoff:]

## MF 모델

In [7]:
class MF():
  def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
    self.R = np.array(ratings)
    self.num_users, self.num_items = np.shape(self.R)
    self.K = K
    self.alpha = alpha
    self.beta = beta
    self.iterations = iterations
    self.verbose = verbose
    #사용자와 아이템의 인덱스를 매핑해줌. pivot으로 만든 데이터프레임을 변수로 넣어줄예정
    item_id_index = []
    index_item_id = []
    for i, one_id in enumerate(ratings):
      item_id_index.append([one_id, i])
      index_item_id.append([i, one_id])
    self.item_id_index = dict(item_id_index)
    self.index_item_id = dict(index_item_id)
    user_id_index = []
    index_user_id = []
    for i, one_id in enumerate(ratings.T):
      user_id_index.append([one_id, i])
      index_user_id.append([i, one_id])
    self.user_id_index = dict(user_id_index)
    self.index_user_id = dict(index_user_id)

    # self.test_matrix = np.zeros((self.num_users, self.num_items))
    # for row in ratings_test.itertuples():
    #     self.test_matrix[row[1]-1, row[2]-1] = row[3]

  def rmse(self):
    xs, ys = self.R.nonzero()
    #평점이 있는(0이 아닌) 요소의 인덱스를 가져온다
    self.predictions = []
    self.errors = []
    for x,y in zip(xs,ys):
      prediction = self.get_prediction(x,y)
      self.predictions.append(prediction)
      self.errors.append(self.R[x,y]-prediction)
    self.predictions = np.array(self.predictions)
    self.errors = np.array(self.errors)
    
    return np.sqrt(np.mean(self.errors**2))
  
  def set_test(self, ratings_test):
    test_set = []
    for i in range(len(ratings_test)):
      x = self.user_id_index[ratings_test.iloc[i,0]]
      y = self.item_id_index[ratings_test.iloc[i,1]]
      z = ratings_test.iloc[i,2]
      test_set.append([x,y,z])
      self.R[x,y] = 0
    self.test_set = test_set
    return test_set

  def test_rmse(self):
    error = 0
    for one_set in self.test_set:
      predicted = self.get_prediction(one_set[0], one_set[1])
      error += pow(one_set[2] - predicted, 2)
    return np.sqrt(error/len(self.test_set))

  def test(self):
    self.P = np.random.normal(scale=1./self.K, size=(self.num_users,self.K))
    self.Q = np.random.normal(scale=1./self.K, size=(self.num_items,self.K))

    self.b_u = np.zeros(self.num_users)
    self.b_d = np.zeros(self.num_items)
    self.b = np.mean(self.R[self.R.nonzero()])

    rows, columns = self.R.nonzero()
    self.samples = [(i,j,self.R[i,j]) for i,j in zip(rows, columns)]

    training_process=[]
    for i in range(self.iterations):
      np.random.shuffle(self.samples)
      self.sgd()
      rmse1 = self.rmse()
      rmse2 = self.test_rmse()
      training_process.append((i+1, rmse1, rmse2))
      if self.verbose:
        if (i+1)%10 ==0:
          print("Iteration: %d ; Train RMSE = %.4f; Test RMSE = %.4f" % (i+1,rmse1, rmse2))
    return training_process


  def get_one_prediction(self, user_id, item_id):
    return self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id])

  def full_prediction(self):
    return self.b + self.b_u[:, np.newaxis] + self.b_d[np.newaxis, :] + self.P.dot(self.Q.T)

  def sgd(self):
    for i,j,r in self.samples:
      prediction = self.get_prediction(i,j)
      e = (r - prediction)

      self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
      self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

      self.P[i,:] += self.alpha * (e*self.Q[j,:] - self.beta*self.P[i,:])
      self.Q[j,:] += self.alpha * (e*self.P[i,:] - self.beta*self.Q[j,:])

  def get_prediction(self,i,j):
    prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i,:].dot(self.Q[j,:].T)
    return prediction

  def recommendation(self, user_idx,N):
    user_ratings = self.full_prediction()[user_idx-1]
    top_N_indices = np.argsort(user_ratings)[-N:]
    rec_list = []
    for i, r in item_df.iterrows():
      if i in top_N_indices:
        rec_list.append(r['movie_title'])
    return rec_list


  def precision_and_recall(self, k=10):
        user_precision_at_k = []
        user_recall_at_k = []
        
        for user_idx in range(self.num_users):
            user_ratings = self.R[user_idx]
            user_predicted_ratings = self.full_prediction()[user_idx]

            # 관심있는 아이템 (실제 평점이 있는 아이템) 인덱스 추출
            relevant_items = np.nonzero(user_ratings > 0)[0]

            if len(relevant_items) == 0:
                continue

            # 예측 평점 상위 k개 아이템 인덱스 추출
            top_k_items = np.argsort(user_predicted_ratings)[-k:]

            # 추천된 상위 k개 아이템 중 관심있는 아이템의 비율 계산
            hits = len(np.intersect1d(relevant_items, top_k_items))
            precision = hits / k
            recall = hits / len(relevant_items)
            user_precision_at_k.append(precision)
            user_recall_at_k.append(recall)

        # 전체 사용자의 precision@k 평균 계산
        mean_precision_at_k = np.mean(user_precision_at_k)
        mean_recall_at_k = np.mean(user_recall_at_k)
        print('Precision@{}: {:.4f},  Recall@{}: {:.4f}'.format(k, mean_precision_at_k,k,mean_recall_at_k))

In [8]:
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

mf = MF(R_temp, K=30, alpha=0.001, beta=0.02, iterations=20, verbose=True)
test_set = mf.set_test(ratings_test)
result = mf.test()

Iteration: 10 ; Train RMSE = 0.9335; Test RMSE = 0.9410
Iteration: 20 ; Train RMSE = 0.9138; Test RMSE = 0.9261


## 추천 리스트

In [9]:
a = mf.recommendation(1,3)

recommend_df = pd.DataFrame (a, columns = ['recommendation'])
recommend_df

Unnamed: 0,recommendation
0,Belle de jour (1967)
1,Unforgiven (1992)
2,Steel (1997)


## Precision@k / Recall@k

In [10]:
mf.precision_and_recall(k=5)

Precision@5: 0.2467,  Recall@5: 0.0225
