# 2.1 데이터 읽기

In [80]:
import os

# List the entries of the data directory to see which files are available
os.listdir('../data2/')

['movies_metadata.csv', 'ratings-20m.csv', 'u.data', 'u.item', 'u.user']

In [81]:
import pandas as pd

# Column labels supplied explicitly when reading the pipe-delimited u.user file
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
# Read the user table and key it by user_id in one chained expression
users = (
    pd.read_csv('../data2/u.user', sep='|', names=u_cols, encoding='latin-1')
    .set_index('user_id')
)
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [82]:
# Column labels for the pipe-delimited u.item file: five descriptive fields
# followed by one 0/1 flag column per genre.
# The raw string keeps the existing label (a literal backslash followed by "s")
# without relying on the invalid escape sequence "\s", which newer Python
# versions warn about.
# NOTE(review): the label is presumably a typo for "Children's" - confirm before renaming.
i_cols = ['movie_id', 'title', 'release_date', 'video release date', 'IMDB URL', 'unknown',
          'Action', 'Adventure', 'Animation', r'children\s', 'Comedy', 'Crime', 'Documentary', 'Drama',
          'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
          'Western']
movies = pd.read_csv('../data2/u.item', sep='|', names=i_cols, encoding='latin-1')
# Key the table by movie_id so titles can be looked up with movies.loc[...]
movies.set_index('movie_id', inplace=True)
movies.head()

Unnamed: 0_level_0,title,release_date,video release date,IMDB URL,unknown,Action,Adventure,Animation,children\s,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [83]:
# Column labels supplied explicitly when reading the tab-delimited u.data file
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# Read the ratings and key them by user_id; the index is not unique because
# each user contributes one row per rated movie.
ratings = (
    pd.read_csv('../data2/u.data', sep='\t', names=r_cols, encoding='latin-1')
    .set_index('user_id')
)
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


# 2.2 인기제품 방식 

개별 사용자에 대한 정보가 없는 경우나, 정확도에 관계없이 가장 간단한 추천을 제공해야 하는 상황  
-> 가장 인기있는 제품을 추천하는 것이 가장 합리적

In [84]:
def get_recomm_movie_1(n_items: int) -> "pd.Series[str]":
  """Return the titles of the n_items movies with the highest mean rating.

  Reads the module-level ratings frame (needs movie_id and rating columns)
  and the module-level movies frame (indexed by movie_id, with a title column).
  The result is ordered from the highest mean rating downwards.
  """
  # Movie ids ordered by their mean rating, best first
  ranked_ids = (
      ratings.groupby(['movie_id'])['rating']
      .mean()
      .sort_values(ascending=False)
      .index
  )
  return movies.loc[ranked_ids[:n_items]]['title']

In [85]:
# Show the five titles with the highest mean rating
get_recomm_movie_1(5)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

In [86]:
def get_recomm_movie_2(n_items: int) -> "pd.Series[str]":
  """Return the titles of the n_items best-rated movies, best first.

  Uses the module-level ratings frame to compute each movie's mean rating
  and the module-level movies frame (indexed by movie_id) to map ids to titles.
  """
  mean_rating = ratings.groupby(['movie_id'])['rating'].mean()
  best_first = mean_rating.sort_values(ascending=False)
  top_ids = best_first.index[:n_items]
  return movies.loc[top_ids, 'title']

# 2.3 추천 시스템의 정확도 측정

추천시스템은 기술적으로 이야기하면 각 아이템에 대한 사용자의 선호도를 예측하는 것  
정확도 측정에 사용되는 지표 중 하나가 RMSE(Root Mean Squared Error)이다.  
RMSE = $\sqrt {\frac{1}{N}\sum_{i=1}^N (y_i-\hat y_i)^2}$

In [87]:
import numpy as np

def RMSE(y_true:"pd.Series", y_pred:"pd.Series") -> float:
  """Return the root-mean-squared error between actual and predicted values.

  Both arguments are converted with np.array, so any array-like of matching
  length works.
  """
  residuals = np.array(y_true) - np.array(y_pred)
  mean_squared_error = np.mean(residuals ** 2)
  return np.sqrt(mean_squared_error)

In [88]:
# Mean rating of every movie; used as the predicted rating for that movie
movie_mean = ratings.groupby(['movie_id'])['rating'].mean()

rmse = []

# One RMSE value per user: the user's actual ratings against the movie means
for user in set(ratings.index):
  user_ratings = ratings.loc[user]  # all rows of this user (index is user_id)
  y_true = user_ratings['rating']
  y_pred = movie_mean[user_ratings['movie_id']]
  accuracy = RMSE(y_true, y_pred)
  rmse.append(accuracy)

# Per-user errors first, then their average
print(rmse)
print(np.mean(rmse))

[0.9871739413758743, 0.8219422910732068, 1.2339723339608757, 1.2918964277456093, 1.2046670551586538, 0.8888721484619234, 1.0491049435867215, 0.925457378244275, 1.0714273973536004, 0.6563475075478296, 0.8647213856595464, 0.9734133550650361, 1.1956220934186141, 1.0633605424586277, 1.3152141045097092, 1.0717856405511164, 1.1026401621920408, 0.7736177002957276, 0.9095862832504693, 1.3203202362893989, 1.0345029928785174, 1.1155476238882485, 0.8204994526335332, 0.924963783904287, 0.647250029758992, 0.7564970553541654, 0.8092982805806483, 0.8264691238381775, 0.6702295646009502, 0.860031404200849, 0.9246176979692791, 0.8031823880989896, 0.6562472831748011, 1.497398029739686, 0.912302608707388, 1.6345699241761138, 0.8907408053367615, 1.657187237481364, 1.1470717491085198, 1.132272528219413, 0.7438374514309243, 0.9903055210506982, 0.8988429934758936, 0.8842041087906171, 0.815774909806917, 0.9657495341526146, 0.7720186761074518, 0.9090988016488181, 1.2988090805761887, 1.4302004692099255, 1.198907

# 2.4 사용자 집단별 추천

In [89]:
import pandas as pd

def get_dataset_1(data_dir: str = '../data2/') -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
  """Load the user, movie and rating tables from data_dir.

  Args:
    data_dir: Directory containing the files u.user, u.item and u.data.
      Defaults to the directory used throughout this notebook.

  Returns:
    A (users, movies, ratings) tuple of DataFrames:
      users   - columns user_id, age, sex, occupation, zip_code
      movies  - columns movie_id, title (all other u.item columns are dropped)
      ratings - columns user_id, movie_id, rating (timestamp is dropped)
    None of the frames has an index set; they keep the default RangeIndex.
  """
  import os  # local import keeps the function usable on its own

  u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
  # The raw string keeps the original label (literal backslash + "s") without
  # the invalid "\s" escape; the genre columns are dropped below in any case.
  i_cols = ['movie_id', 'title', 'release_date', 'video release date', 'IMDB URL', 'unknown',
            'Action', 'Adventure', 'Animation', r'children\s', 'Comedy', 'Crime', 'Documentary', 'Drama',
            'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
            'Western']
  r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

  users = pd.read_csv(os.path.join(data_dir, 'u.user'), sep='|', names=u_cols, encoding='latin-1')

  movies = pd.read_csv(os.path.join(data_dir, 'u.item'), sep='|', names=i_cols, encoding='latin-1')
  movies = movies[['movie_id', 'title']]

  ratings = pd.read_csv(os.path.join(data_dir, 'u.data'), sep='\t', names=r_cols, encoding='latin-1')
  ratings = ratings.drop(columns='timestamp')

  return users, movies, ratings

In [90]:
# Reload the three tables, rebinding the notebook-level users, movies and ratings
users, movies, ratings = get_dataset_1()

In [91]:
from sklearn.model_selection import train_test_split

# Feature frame: a copy of the ratings table
X = ratings.copy()
# Stratification labels: the user id of each rating row
y = ratings['user_id']

# Hold out 25% of the rows as the test set, stratified on user_id.
# NOTE(review): no random_state is passed, so the split (and every score
# computed from it) changes between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [92]:
import numpy as np

def RMSE(y_true:"pd.Series | np.ndarray", y_pred:"pd.Series | np.ndarray") -> float:
  """Return the root-mean-squared error between actual and predicted values.

  Args:
    y_true: Observed values (Series, ndarray or any array-like).
    y_pred: Predicted values, same length as y_true.

  Returns:
    sqrt(mean((y_true - y_pred) ** 2)). Inputs are converted with np.array,
    so the difference is positional and ignores any pandas index.
  """
  return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

In [93]:
# Build the user x movie rating matrix from the training data
rating_matrix = X_train.pivot(index="user_id", columns='movie_id', values='rating')
# Show the first rows of the training frame and of the pivoted matrix
display(X_train.head(), rating_matrix.head())

Unnamed: 0,user_id,movie_id,rating
83657,836,210,4
74565,192,255,2
78935,574,242,5
69504,586,195,4
47152,734,82,4


movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1671,1672,1673,1675,1676,1678,1679,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,4.0,,3.0,,4.0,,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [94]:
# Baseline model: predict a rating from the movie's overall mean rating
def best_seller(user_id: int, movie_id: int) -> float:
  """Predict a rating as the mean training rating of movie_id.

  Args:
    user_id: Id of the user; unused, present so every model shares the
      (user_id, movie_id) call signature expected by score().
    movie_id: Id of the movie whose rating is predicted.

  Returns:
    The mean rating of movie_id in the module-level X_train frame, or 3.0
    when the movie has no rating in the training data.
  """
  # Select this movie's training ratings by the movie_id *argument*
  # (the previous code looked up the literal string 'movie_id', which never
  # matched, so every prediction silently fell back to 3.0).
  movie_ratings = X_train.loc[X_train['movie_id'] == movie_id, 'rating']
  if movie_ratings.empty:
    # Movie unseen in the training data: fall back to a neutral rating
    return 3.0
  return movie_ratings.mean()

# Compute the RMSE of a prediction model
def score(model):
  """Return the RMSE of model on the module-level X_test frame.

  model is called as model(user_id, movie_id) once per test row and must
  return the predicted rating for that pair.
  """
  predictions = np.array([
      model(uid, mid)
      for uid, mid in zip(X_test['user_id'], X_test['movie_id'])
  ])
  actual = np.array(X_test['rating'])
  return RMSE(actual, predictions)

In [95]:
# score(best_seller)

In [96]:
# Join the training ratings with the user attributes on user_id
merged_ratings = pd.merge(X_train, users, how='inner', on='user_id')
# Re-key users by user_id so a user's attributes can be read with users.loc[user_id].
# NOTE(review): this rebinds users, so re-running the cell is expected to fail
# here because the user_id column no longer exists - reload the data first.
users = users.set_index('user_id')
display(merged_ratings.head(), users.head())

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,836,210,4,44,M,artist,10018
1,836,42,3,44,M,artist,10018
2,836,657,5,44,M,artist,10018
3,836,663,5,44,M,artist,10018
4,836,258,4,44,M,artist,10018


Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [97]:
# Mean training rating for every (movie_id, sex) pair, as a MultiIndex Series
g_mean = merged_ratings.groupby(['movie_id', 'sex'])['rating'].mean()
g_mean

movie_id  sex
1         F      3.866667
          M      3.912863
2         F      3.428571
          M      3.218391
3         F      2.416667
                   ...   
1676      M      2.000000
1678      M      1.000000
1679      M      3.000000
1681      M      3.000000
1682      M      3.000000
Name: rating, Length: 3039, dtype: float64

In [172]:
# Gender-based recommendation:
# predict with the mean rating given to the movie by users of the same sex
def cf_gender(user_id: int, movie_id: int) -> float:
  """Predict a rating from the mean rating of the user's gender group.

  Args:
    user_id: Id of the user; must be a label of the module-level users frame.
    movie_id: Id of the movie whose rating is predicted.

  Returns:
    g_mean[(movie_id, sex of the user)] when that pair exists in the
    module-level g_mean Series, otherwise 3.0.
  """
  gender = users.loc[user_id, 'sex']
  group = (movie_id, gender)
  # One tuple lookup on the MultiIndex replaces building and scanning a list
  # of every index entry on each call.
  if group in g_mean.index:
    return g_mean.loc[group]
  # Movie unseen in training, or never rated by this gender: neutral rating
  return 3.0
    
    

In [173]:
# RMSE of the gender-group model on the test split
score(cf_gender)

1.035597889831803

### 연습문제

  
2-1) 위의 성별('gender') 추천 코드를 수정해서 사용자의 직업('occupation')에 따라 집단을 나누어서 예측값을 구하는 함수를 만들고 이의 정확도를 계산하는 코드를 작성하시오

In [130]:
# Mean training rating for every (movie_id, occupation) pair
o_mean = merged_ratings.groupby(['movie_id', 'occupation'])['rating'].mean()

In [174]:
# Occupation-based recommendation:
# predict with the mean rating given to the movie by users of the same occupation
def cf_occupation(user_id: int, movie_id: int) -> float:
  """Predict a rating from the mean rating of the user's occupation group.

  Args:
    user_id: Id of the user; must be a label of the module-level users frame.
    movie_id: Id of the movie whose rating is predicted.

  Returns:
    o_mean[(movie_id, occupation of the user)] when that pair exists in the
    module-level o_mean Series, otherwise 3.0.
  """
  occupation = users.loc[user_id, 'occupation']
  group = (movie_id, occupation)
  # One tuple lookup on the MultiIndex replaces building and scanning a list
  # of every index entry on each call.
  if group in o_mean.index:
    return o_mean.loc[group]
  # Movie unseen in training, or never rated by this occupation: neutral rating
  return 3.0
    
    

In [175]:
# RMSE of the occupation-group model on the test split
score(cf_occupation)

1.124649826168198

2-2) 사용자의 성별과 직업('occupation')을 동시에 고려한 집단을 나누어서 예측값을 구하는 함수를 만들고 이의 정확도를 계산하는 코드를 작성하시오.

In [134]:
# Mean training rating for every (movie_id, sex, occupation) combination
g_and_o_mean = merged_ratings.groupby(['movie_id', 'sex', 'occupation'])['rating'].mean()

In [203]:
# Gender + occupation based recommendation:
# predict with the mean rating given to the movie by users who share both
# the user's sex and occupation
def cf_gender_occupation(user_id: int, movie_id: int) -> float:
  """Predict a rating from the mean of the user's (gender, occupation) group.

  Args:
    user_id: Id of the user; must be a label of the module-level users frame.
    movie_id: Id of the movie whose rating is predicted.

  Returns:
    g_and_o_mean[(movie_id, sex, occupation)] for the user's sex and
    occupation when that combination exists in the module-level
    g_and_o_mean Series, otherwise 3.0.
  """
  profile = users.loc[user_id]  # read the user's row once
  group = (movie_id, profile['sex'], profile['occupation'])
  # One tuple lookup on the three-level MultiIndex replaces building and
  # scanning a list of every index entry on each call.
  if group in g_and_o_mean.index:
    return g_and_o_mean.loc[group]
  # No training rating for this movie from this group: neutral rating
  return 3.0
    

In [204]:
# RMSE of the combined gender-and-occupation model on the test split
score(cf_gender_occupation)