# 2. 기본적인 추천 시스템

## 2.1 데이터 불러오기

In [1]:
import pandas as pd
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./dataset/u.user', sep='|', names=u_cols, encoding='latin-1')
users = users.set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [2]:
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
          'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s',
          'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
          'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
          'War', 'western']

movies = pd.read_csv('./dataset/u.item', sep='|', names=i_cols, encoding='latin-1')
movies = movies.set_index('movie_id')
movies.head()

Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./dataset/u.data', sep='\t', names=r_cols, encoding='latin-1')
ratings = ratings.set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


## 2.2 인기제품 방식

In [4]:
# best-seller 추천
# 전체 사용자의 평점평균을 사용
def recom_movie1(n_items):
    movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
    movie_sort = movie_mean.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie1(5)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

In [5]:
def recom_movie2(n_items):
    movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
    return movies.loc[movie_mean.sort_values(ascending=False)[:n_items].index]['title']

recom_movie2(5)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

## 2.3 추천 시스템의 정확도 측정

In [6]:
# 정확도 계산
import numpy as np
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

In [7]:
# 실제 영화 평점 값과
# best-seller 방식으로 구한 예측 값의
# rmse 도출
rmse = []
movie_mean = ratings.groupby(['movie_id'])['rating'].mean()

for user in set(ratings.index):
    y_true = ratings.loc[user]['rating']
    y_pred = movie_mean[ratings.loc[user]['movie_id']]
    accuracy = RMSE(y_true, y_pred)
    rmse.append(accuracy)

print(np.mean(rmse))

0.996007224010567


## 2.4 사용자 집단별 추천

In [8]:
# remove tiemstamp
ratins = ratings.drop('timestamp', axis=1)

movies = movies.reset_index()
movies = movies[['movie_id', 'title']]

In [9]:
# train, test 분리
from sklearn.model_selection import train_test_split

ratings = ratings.reset_index()
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

In [10]:
# 모델 별 rmse 계산
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

# train 데이터로 full matrix 구하기
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

In [11]:
# 전체 평균으로 예측치 계산하는 기본 모델
def best_seller(user_id, movie_id):
    train_mean = x_train.groupby('movie_id')['rating'].mean()
    try:
        rating = train_mean[movie_id]
    except:
        rating = 3.0
    return rating

score(best_seller)

1.0183212615208475

In [12]:
# full matrix를 사용자 데이터와 merge
users = users.reset_index()
merged_ratings = pd.merge(x_train, users)
users = users.set_index('user_id')

# gender별 평점평균 계산
g_mean = merged_ratings[['movie_id', 'sex', 'rating']].groupby(['movie_id', 'sex'])['rating'].mean()

In [13]:
# gender 기준 추천
# gender별 평균을 예측치로 돌려주는 함수
def cf_gender(user_id, movie_id):
    if movie_id in rating_matrix:
        gender = users.loc[user_id]['sex']
        if gender in g_mean[movie_id]:
            gender_rating = g_mean[movie_id][gender]
        else:
            gender_rating = 3.0

    else:
        gender_rating = 3.0

    return gender_rating

score(cf_gender)

1.0279422793656428

# 3. 협업 필터링 추천 시스템

## 3.1 기본 CF 알고리즘

In [14]:
# train set의 모든 가능한 사용자 pair의 Cosine similarities 계산
from sklearn.metrics.pairwise import cosine_similarity

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

user_similarity.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.144346,0.036129,0.043403,0.318359,0.325078,0.344004,0.272873,0.068581,0.296157,...,0.270601,0.079108,0.181219,0.150836,0.138943,0.079842,0.300533,0.065506,0.10546,0.354329
2,0.144346,1.0,0.099198,0.166062,0.084732,0.228585,0.095276,0.070908,0.141939,0.112498,...,0.153135,0.228609,0.239359,0.292195,0.223533,0.094753,0.20159,0.009019,0.096729,0.113454
3,0.036129,0.099198,1.0,0.250203,0.027794,0.080979,0.045962,0.05652,0.021862,0.055277,...,0.041612,0.056944,0.073478,0.027548,0.098986,0.0,0.156882,0.08387,0.047995,0.034592
4,0.043403,0.166062,0.250203,1.0,0.04196,0.078766,0.062578,0.145155,0.128038,0.081,...,0.068598,0.049407,0.052761,0.182522,0.089087,0.040326,0.212837,0.124747,0.099942,0.041161
5,0.318359,0.084732,0.027794,0.04196,1.0,0.186238,0.275327,0.192513,0.035273,0.13653,...,0.247415,0.089323,0.068815,0.102362,0.127224,0.065615,0.20113,0.093434,0.145714,0.257795


In [15]:
# 주어진 영화의 movie_id 가중평균 rating을 계산
# 가중치는 주어진 사용자와 다른 사용자 간의 유사도(user_similarity)
def CF_simple(user_id, movie_id):
    if movie_id in rating_matrix:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()

        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()

        sim_scores = sim_scores.drop(none_rating_idx)
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()

    else:
        mean_rating = 3.0
    
    return mean_rating

# 정확도 계산
score(CF_simple)

1.0119731769678217

## 3.2 이웃을 고려한 CF

In [16]:
# 모델 별 rmse 계산 => KNN 모델 평가용
def score_KNN(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

In [17]:
# neighbor size를 정해서 예측치 계산하는 함수
def cf_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_matrix:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()

        none_rating_idx = movie_ratings[movie_ratings.isnull()].index

        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)

        # neighbor size가 지정되지 않은 경우
        if neighbor_size == 0:
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()

        # 지정된 경우
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))

                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)

                user_idx = np.argsort(sim_scores)

                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]

                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()

            else:
                mean_rating = 3.0

    else:
        mean_rating = 3.0

    return mean_rating

score_KNN(cf_knn, neighbor_size=30)

1.0036476622750867

In [18]:
# 주어진 사용자에 대해 추천 받기
# 전체 데이터로 full matrix와 cosine similarity 구하기
rating_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')

from sklearn.metrics.pairwise import cosine_similarity

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

def recom_movie(user_id, n_items, neighbor_size=30):
    user_movie = rating_matrix.loc[user_id].copy()
    for movie in rating_matrix:
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0
        else:
            user_movie.loc[movie] = cf_knn(user_id, movie, neighbor_size)
    
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']

    return recommendations

recom_movie(user_id=2, n_items=5, neighbor_size=30)

movie_id
1293                     Ayn Rand: A Sense of Life (1997)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
1189                              That Old Feeling (1997)
1467                                     Cure, The (1995)
318                       Everyone Says I Love You (1996)
Name: title, dtype: object

## 3.3 최적의 이웃 크기 결정

In [19]:
# 최적의 neighbor size 구하기
# train set으로 full matrix와 cosine similarity 구하기
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

from sklearn.metrics.pairwise import cosine_similarity

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

for neighbor_size in [10, 20, 30, 40, 50, 60]:
    print("Neighbor size = %d : RMSE = %.4f"%(neighbor_size, score_KNN(cf_knn, neighbor_size)))

Neighbor size = 10 : RMSE = 1.0191
Neighbor size = 20 : RMSE = 1.0061
Neighbor size = 30 : RMSE = 1.0036
Neighbor size = 40 : RMSE = 1.0038
Neighbor size = 50 : RMSE = 1.0042
Neighbor size = 60 : RMSE = 1.0043


## 3.4 사용자의 평가경향을 고려한 CF

In [20]:
# 모든 유저의 rating 평균과 영화의 평점편차 계산
rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T

def CF_knn_bias(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()

        none_rating_idx = movie_ratings[movie_ratings.isnull()].index

        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)

        # neighbor size 지정되지 않은 경우
        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = prediction + rating_mean[user_id]

        # 지정된 경우
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))

                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)

                user_idx = np.argsort(sim_scores)
                
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]

                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = prediction + rating_mean[user_id]
            else:
                prediction = rating_mean[user_id]

    else:
        prediction = rating_mean[user_id]

    return prediction

score_KNN(CF_knn_bias, 30)

3.651466001497454

## 3.5 그 외의 CF 정확도 개선 방법

In [22]:
# 사용자별 공통 평가 수 계산
rating_binary1 = np.array((rating_matrix > 0).astype(float))
rating_binary2 = rating_binary1.T

counts = np.dot(rating_binary1, rating_binary2)
counts = pd.DataFrame(counts, index=rating_matrix.index, columns=rating_matrix.index).fillna(0)

def CF_knn_bias_sig(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias:

        # 현 user와 다른 사용자 간의 유사도 가져오기
        sim_scores = user_similarity[user_id]

        # 현 movie의 평점편차 가져오기
        movie_ratings = rating_bias[movie_id]

        # 현 movie에 대한 rating이 없는 사용자 표시
        no_rating = movie_ratings.isnull()

        # 현 사용자와 다른 사용자간 공통 평가 아이템 수 가져오기 
        common_counts = counts[user_id]

        # 공통으로 평가한 영화의 수가 SIG_LEVEL보다 낮은 사용자 표시
        low_significance = common_counts < SIG_LEVEL

        # 평가를 안 하였거나, SIG_LEVEL이 기준 이하인 user 제거
        none_rating_idx = movie_ratings[no_rating | low_significance].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)

        if neighbor_size == 0:
            # 편차로 예측값(편차 예측값) 계산
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            # 편차 예측값에 현 사용자의 평균 더하기
            prediction = prediction + rating_mean[user_id]

        else:
            # 해당 영화를 평가한 사용자가 최소 MIN_RATINGS 이상인 경우에만 계산            
            if len(sim_scores) > MIN_RATINGS:

                # 지정된 neighbor size 값과 해당 영화를 평가한 총사용자 수 중 작은 것으로 결정
                neighbor_size = min(neighbor_size, len(sim_scores))

                # array로 바꾸기 (argsort를 사용하기 위함)
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)

                # 유사도를 순서대로 정렬
                user_idx = np.argsort(sim_scores)

                # 유사도와 rating을 neighbor size만큼 받기
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]

                # 편차로 예측치 계산
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()

                # 예측값에 현 사용자의 평균 더하기
                prediction = prediction + rating_mean[user_id]
            else:
                prediction = rating_mean[user_id]

    else:
        prediction = rating_mean[user_id]
    
    return prediction

SIG_LEVEL = 3
MIN_RATINGS = 2
score_KNN(CF_knn_bias_sig, 30)

0.9409155263045955

## 3.6 사용자 기반 CF와 아이템 기반 CF

In [23]:
# train 데이터로 Full matrix 구하기  
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

In [24]:
# train set의 모든 가능한 아이템 pair의 Cosine similarities 계산
from sklearn.metrics.pairwise import cosine_similarity

rating_matrix_t = np.transpose(rating_matrix)
matrix_dummy = rating_matrix_t.copy().fillna(0)
item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
item_similarity = pd.DataFrame(item_similarity, index=rating_matrix_t.index, 
                               columns=rating_matrix_t.index)

In [25]:
# 주어진 영화의 (movie_id) 가중평균 rating을 계산하는 함수, 
# 가중치는 주어진 아이템과 다른 아이템 간의 유사도(item_similarity)
def CF_IBCF(user_id, movie_id):
    if movie_id in item_similarity:      # 현재 영화가 train set에 있는지 확인

        # 현재 영화와 다른 영화의 similarity 값 가져오기
        sim_scores = item_similarity[movie_id]

        # 현 사용자의 모든 rating 값 가져오기
        user_rating = rating_matrix_t[user_id]

        # 사용자가 평가하지 않은 영화 index 가져오기
        non_rating_idx = user_rating[user_rating.isnull()].index

        # 사용자가 평가하지 않은 영화 제거
        user_rating = user_rating.dropna()

        # 사용자가 평가하지 않은 영화의 similarity 값 제거
        sim_scores = sim_scores.drop(non_rating_idx)

        # 현 영화에 대한 예상 rating 계산, 가중치는 현 영화와 사용자가 평가한 영화의 유사도
        mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
    else:
        mean_rating = 3.0
        
    return mean_rating

# 정확도 계산
score(CF_IBCF)

1.015044393297824