<a href="https://colab.research.google.com/github/minshyee/RecoSyS/blob/main/Basic_RecomSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic RecomSys

## Data
- MovieLens 100k
  - u.user : user data
  - u.item : movie info data
  - u.data : rating data



In [1]:
cd /content/

/content


In [8]:
# set environment
import os
import pandas as pd
import numpy as np

### Data 

In [3]:
# Load the user table; column names are supplied explicitly through `names`.
base_src = 'drive/MyDrive/Recosys/Data'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation','zip_code']

# Read the pipe-delimited file and key the frame by user_id in one chained expression.
users = pd.read_csv(
    u_user_src, names=u_cols, sep='|', encoding='latin-1'
).set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [4]:
# Load the movie table; column names are supplied explicitly through `names`.
u_item_src = os.path.join(base_src, 'u.item')

# Labels carry no leading whitespace and 'Mystery' is spelled correctly, so the
# genre flags can be selected by their natural names (e.g. movies['Horror']).
item_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items = pd.read_csv(u_item_src,
                    sep='|',
                    names=item_cols,
                    encoding='latin-1')

# `items` keeps movie_id as a regular column; `movies` is the same data keyed by movie_id.
movies = items.set_index('movie_id')
movies.head()

Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mysterty,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [16]:
# Load the rating log (tab-separated); column names come from r_cols.
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
datas = pd.read_csv(u_data_src, names=r_cols, sep='\t', encoding='latin-1')

# Index by user_id so a user's rows can be fetched with ratings.loc[user_id].
ratings = datas.set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


### 인기 제품 방식 (Best Seller Method)

- 개별 사용자 정보 X
- 간단한 추천을 제공할 때

모든 사용자에게 best-seller 상품을 추천

In [7]:
# Best-seller recommender

def recom_movie(n_items):
  """Return the titles of the `n_items` movies with the highest mean rating.

  Reads the module-level `ratings` and `movies` frames; `movies` is looked up
  by label, so it must be indexed by movie_id.

  Returns:
    The 'title' column of `movies` for the selected movie ids, ordered from
    the highest mean rating downwards.
  """
  # Mean rating per movie, best first, truncated to the requested count.
  mean_by_movie = ratings.groupby(['movie_id'])['rating'].mean()
  top_ids = mean_by_movie.sort_values(ascending=False)[:n_items].index
  return movies.loc[top_ids, 'title']

recom_movie(5)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

### 추천 시스템의 정확도 측정
- 추천 시스템의 성능 = "정확성"

$$RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}$$

In [9]:
# Accuracy metric: RMSE

def RMSE(y_true, y_pred):
  """Root-mean-square error between observed and predicted values.

  Both arguments are converted with np.array before subtraction, so lists,
  Series and arrays are accepted.
  """
  residuals = np.array(y_true) - np.array(y_pred)
  mean_squared_error = np.mean(residuals ** 2)
  return np.sqrt(mean_squared_error)

# Per-movie mean rating, used as the best-seller prediction for every user.
movie_mean = ratings.groupby(['movie_id'])['rating'].mean()

# Collect one RMSE per user, computed over all ratings that user gave.
rmse = []
for user_id in set(ratings.index):
  user_rows = ratings.loc[user_id]              # look the user's rows up once
  observed = user_rows['rating']
  predicted = movie_mean[user_rows['movie_id']]  # best-seller: the movie's mean rating
  rmse.append(RMSE(observed, predicted))

# Report the average of the per-user RMSE values.
print(np.mean(rmse))

0.996007224010567


### 사용자 집단별 추천
- 집단 간 평가 경향 반영


In [17]:
# Drop the timestamp column from the ratings.
# Derived from the raw `datas` frame (user_id, movie_id, rating, timestamp) rather
# than from `ratings` itself, so re-running this cell does not fail on a column
# that has already been removed. user_id stays a regular column.
ratings = datas.drop(columns='timestamp')

# Keep only the title for each movie. movie_id is the index of `movies`, so it is
# retained implicitly and label lookups such as movies.loc[ids] keep working.
# (The previous chained assignment replaced the DataFrame with a plain list.)
movies = movies[['title']]

In [20]:
# Train/test split
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
# stratify=y: stratified sampling, so each user's share of rows is similar in both splits.
# random_state pins the shuffle so the split, and every RMSE computed from it,
# is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=42)

# Per-model evaluation, built on the RMSE function defined above.
def score(model):
  """Return the RMSE of `model` over the rows of the module-level `x_test`.

  `model` is called as model(user_id, movie_id) once per test row and must
  return the predicted rating for that pair.
  """
  predictions = np.array([
      model(user_id, movie_id)
      for user_id, movie_id in zip(x_test['user_id'], x_test['movie_id'])
  ])
  observed = np.array(x_test['rating'])
  return RMSE(observed, predictions)

# Best-seller baseline: predict each movie's mean rating in the training split.
train_mean = x_train.groupby(['movie_id'])['rating'].mean()
def best_seller(user_id, movie_id):
  """Predict a rating as the movie's mean rating in the training split.

  `user_id` is accepted only to match the model(user_id, movie_id) signature
  expected by `score`; it is not used.

  Returns 3.0 when the movie has no entry in `train_mean`.
  """
  try:
    rating = train_mean[movie_id]
  except KeyError:
    # Only a missing movie triggers the fallback; any other error
    # (including KeyboardInterrupt) now propagates instead of being hidden.
    rating = 3.0  # presumably the midpoint of the rating scale - confirm
  return rating

score(best_seller)

1.0286880214497196

In [30]:
# Predictions by gender: attach user attributes to each training rating.
merged_ratings = x_train.merge(users.reset_index())

# Mean training rating for every (movie_id, sex) pair.
g_mean = merged_ratings.groupby(['movie_id', 'sex'])['rating'].mean()

# user x movie matrix of training ratings; pairs with no rating are NaN.
rating_matrix = x_train.pivot(values='rating',
                              index='user_id',
                              columns='movie_id')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1672,1673,1674,1676,1677,1678,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [31]:
# Gender-based recommendation
def cf_gender(user_id, movie_id):
  """Predict a rating as the mean training rating given by the user's gender.

  Reads the module-level `rating_matrix`, `users` and `g_mean`. Returns 3.0
  when the movie is not a column of `rating_matrix`, or when `g_mean` has no
  entry for this movie and the user's gender.
  """
  # Movie is not among the rating_matrix columns: fall back to the default.
  if movie_id not in rating_matrix.columns:
    return 3.0

  gender = users.loc[user_id, 'sex']
  by_gender = g_mean[movie_id]  # mean ratings for this movie, indexed by sex

  # No mean rating recorded for this gender on this movie.
  if gender not in by_gender.index:
    return 3.0
  return by_gender[gender]

score(cf_gender)


1.0355716086779276