# 3.3 기본 CF 알고리즘

In [11]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#### Load the data and define the helper functions ####

# User data: pipe-separated file with no header row, so column names are
# supplied explicitly; the resulting frame is indexed by user_id.
base_src = './Data'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    u_user_src, sep='|', names=u_cols, encoding='latin-1'
).set_index('user_id')


# Movie data: pipe-separated file with no header row; the columns after
# 'IMDB URL' are one column per genre. The frame is indexed by movie_id.
u_item_src = os.path.join(base_src, 'u.item')
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    u_item_src, sep='|', names=i_cols, encoding='latin-1'
).set_index('movie_id')

# Rating data: tab-separated file with no header row, one row per
# (user_id, movie_id, rating, timestamp) record.
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src, sep='\t', names=r_cols, encoding='latin-1')

# Root-mean-square error
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between `y_true` and `y_pred`.

    Both arguments are converted to NumPy arrays, subtracted element-wise,
    squared, averaged, and the square root of that mean is returned.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

# Evaluate a prediction model by its RMSE on the test set
def score(model):
    """Return the RMSE of `model` over the module-level `x_test` frame.

    `model` is called as ``model(user_id, movie_id)`` once for every row of
    `x_test`, pairing the 'user_id' and 'movie_id' columns in row order; the
    resulting predictions are compared against the 'rating' column.
    """
    # One prediction per (user, movie) pair of the test set, in row order.
    predictions = list(map(model, x_test['user_id'], x_test['movie_id']))
    y_pred = np.array(predictions)
    # Actual ratings for the same rows.
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)


###### Build the train/test sets ######
x = ratings.copy()
y = ratings['user_id']

# Hold out 25% of the ratings for testing, stratified on user_id.
# NOTE(review): no random_state is passed, so the split (and any score
# computed from it) differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
)

# Training ratings as a user_id x movie_id table; pairs without a training
# rating are NaN.
ratings_matrix = x_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)

##### Cosine similarity between users #####
from sklearn.metrics.pairwise import cosine_similarity

# Missing ratings are replaced by 0 before the similarity is computed.
matrix_dummy = ratings_matrix.copy().fillna(0)

# Pairwise similarity between the rows of matrix_dummy, labelled on both
# axes with the row index of ratings_matrix.
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=ratings_matrix.index,
    columns=ratings_matrix.index,
)


#### Similarity-weighted average rating of a given movie (movie_id) ####
def CF_simple(user_id, movie_id):
    """Predict the rating `user_id` would give `movie_id`.

    The prediction is the average of the training ratings of `movie_id`
    (from the module-level `ratings_matrix`), weighted by each rater's
    similarity to `user_id` (from the module-level `user_similarity`).

    A default of 3.0 is returned when no weighted average can be formed:
    the movie has no column in `ratings_matrix`, the user has no column in
    `user_similarity`, or the similarities to all raters sum to zero
    (dividing by that sum would otherwise yield NaN and poison any
    aggregate metric computed over the predictions).
    """
    default_rating = 3.0

    if movie_id not in ratings_matrix.columns:
        return default_rating
    if user_id not in user_similarity.columns:
        return default_rating

    # Keep only the users who actually rated this movie, and pick the
    # target user's similarity to exactly those users (same order).
    movie_ratings = ratings_matrix[movie_id].dropna()
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    sim_total = sim_scores.sum()
    if sim_total == 0:
        # No rater with non-zero similarity: weighted mean is undefined.
        return default_rating

    return np.dot(sim_scores, movie_ratings) / sim_total

#### Compute accuracy: RMSE of CF_simple over the test set ####
score(CF_simple)

1.0169396412312266

In [7]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.167475,0.020191,0.039798,0.297028,0.316073,0.351824,0.186010,0.072262,0.266069,...,0.270688,0.110942,0.221324,0.200087,0.182838,0.084611,0.305987,0.125474,0.142787,0.309173
2,0.167475,1.000000,0.049921,0.136059,0.054034,0.199332,0.098998,0.097644,0.149016,0.142020,...,0.134538,0.287440,0.278580,0.414659,0.245380,0.226573,0.228027,0.140147,0.120413,0.095338
3,0.020191,0.049921,1.000000,0.331186,0.000000,0.084230,0.040335,0.073655,0.079531,0.044186,...,0.009252,0.000000,0.080859,0.077540,0.079100,0.000000,0.085910,0.046812,0.153801,0.000000
4,0.039798,0.136059,0.331186,1.000000,0.013009,0.027840,0.063617,0.129836,0.000000,0.000000,...,0.014678,0.000000,0.104229,0.177691,0.117125,0.000000,0.069387,0.136157,0.130458,0.036406
5,0.297028,0.054034,0.000000,0.013009,1.000000,0.163537,0.304846,0.205443,0.044580,0.130982,...,0.282285,0.012708,0.050599,0.033572,0.082193,0.045358,0.212185,0.069671,0.177189,0.244168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.084611,0.226573,0.000000,0.000000,0.045358,0.081536,0.068369,0.097502,0.000000,0.048143,...,0.037531,0.349827,0.090386,0.209692,0.341866,1.000000,0.069122,0.153208,0.039309,0.092480
940,0.305987,0.228027,0.085910,0.069387,0.212185,0.293066,0.264376,0.197424,0.133528,0.246606,...,0.295447,0.039216,0.125821,0.198247,0.132105,0.069122,1.000000,0.039815,0.206281,0.161205
941,0.125474,0.140147,0.046812,0.136157,0.069671,0.126866,0.075428,0.164109,0.000000,0.070974,...,0.040198,0.171394,0.218108,0.155721,0.331758,0.153208,0.039815,1.000000,0.035286,0.085458
942,0.142787,0.120413,0.153801,0.130458,0.177189,0.274914,0.266272,0.185634,0.090780,0.223458,...,0.184115,0.000000,0.067045,0.126248,0.079181,0.039309,0.206281,0.035286,1.000000,0.122316
