In [1]:
import numpy as np
import pandas as pd

In [89]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../dataset/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('../dataset/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../dataset/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

In [90]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [91]:
#학습, 평가 데이터로 분리
from sklearn.model_selection import train_test_split
x= ratings.copy()
y= ratings['user_id']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.8,stratify=y)


In [92]:
def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

In [93]:
#model을 통해 구한 값을 실제값과 비교하여 RMSE값을 산출하는 함수
def score(model):
    id_pairs = zip(x_test['user_id'],x_test['movie_id'])
    y_pred = np.array([model(user,movie) for (user,movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true,y_pred)

In [94]:
rating_matrix = x_train.pivot(index='user_id',columns='movie_id',values='rating')

In [95]:
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1643,1646,1647,1664,1665,1671,1673,1674,1678,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,3.0,3.0,,4.0,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,,,,,5.0,,,...,,,,,,,,,,
941,5.0,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [96]:
def best_seller(user_id,movie_id):
    try:
        rating = train_mean[movie_id]
    except:
        rating = 3.0
    return rating

In [97]:
train_mean = x_train.groupby(['movie_id'])['rating'].mean()
score(best_seller)


1.0495903687896544

In [98]:
merged_ratings = pd.merge(x_train,users)
users = users.set_index('user_id')

In [99]:
users

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


In [100]:
# 성별에 따라 영화의 평균평점을 계산
g_mean = merged_ratings[['movie_id','sex','rating']].groupby(['movie_id','sex'])['rating'].mean()

In [101]:
#성별에 따라 평균을 예측치로 돌려주는 함수
def cf_gender(user_id,movie_id):
    if movie_id in rating_matrix:
        gender = users.loc[user_id]['sex']
        if gender in g_mean[movie_id]:
            gender_rating = g_mean[movie_id][gender]
        else:
            gender_rating = 3.0
    else:
        gender_rating= 3.0
    return gender_rating

        

In [102]:
score(cf_gender)

1.0774471225219988

In [103]:
merged_ratings

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,26,864,2,49,M,engineer,21044
1,26,678,2,49,M,engineer,21044
2,26,24,3,49,M,engineer,21044
3,26,1016,3,49,M,engineer,21044
4,26,1008,3,49,M,engineer,21044
...,...,...,...,...,...,...,...
19995,306,25,3,45,M,other,73132
19996,306,303,3,45,M,other,73132
19997,306,756,3,45,M,other,73132
19998,306,744,4,45,M,other,73132


In [107]:
#직업에 따라 영화들의 평균을 계산
o_mean = merged_ratings[['movie_id','occupation','rating']].groupby(['movie_id','occupation'])['rating'].mean()

In [108]:
o_mean

movie_id  occupation   
1         administrator    3.571429
          artist           4.500000
          educator         3.875000
          engineer         3.857143
          entertainment    3.250000
                             ...   
1671      student          1.000000
1673      executive        3.000000
1674      artist           4.000000
1678      student          1.000000
1682      engineer         3.000000
Name: rating, Length: 9291, dtype: float64

In [109]:
#직업에 따라 평균을 예측치로 돌려주는 함수
def cf_occupation(user_id,movie_id):
    if movie_id in rating_matrix:
        occupation = users.loc[user_id]['occupation']
        if occupation in o_mean[movie_id]:
            occupation_rating = o_mean[movie_id][occupation]
        else:
            occupation_rating = 3.0
    else:
        occupation_rating = 3.0
    return occupation_rating


In [111]:
score(cf_occupation)

1.2008123005303888

In [114]:
g_o_mean = merged_ratings[['movie_id','occupation','sex','rating']].groupby(['movie_id','occupation','sex'])['rating'].mean()

In [115]:
g_o_mean

movie_id  occupation     sex
1         administrator  F      3.666667
                         M      3.500000
          artist         M      4.500000
          educator       F      3.000000
                         M      4.000000
                                  ...   
1671      student        F      1.000000
1673      executive      F      3.000000
1674      artist         M      4.000000
1678      student        M      1.000000
1682      engineer       M      3.000000
Name: rating, Length: 11175, dtype: float64

In [140]:
#성별과 직업을 고려하여 평균으로 집단을 나누어 예측하는 함수
def cf_gender_occupation(user_id,movie_id):
    if movie_id in rating_matrix:
        gender = users.loc[user_id]['sex']
        occupation = users.loc[user_id]['occupation']
        if gender in g_o_mean[movie_id] and occupation in g_o_mean[movie_id]:
            g_o_rating = g_o_mean[movie_id][occupation][gender]
        else:
            g_o_rating=3.0
    else:
        g_o_rating=3.0
    return g_o_rating
        

In [141]:
score(cf_gender_occupation)

1.2435584023277717