# 3.3 기본 CF 알고리즘

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#### 데이터 불러오기 및 필요한 함수 정의 ####

#  user데이터
base_src = './Data'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_src,
                    sep = '|',
                    names = u_cols,
                    encoding='latin-1')
users = users.set_index('user_id')


# movie 데이터
u_item_src = os.path.join(base_src, 'u.item')
i_cols = ['movie_id', 'title','release date', 'video release date',
            'IMDB URL', 'unknown', 'Action','Adventure','Animation',
            'Children\'s', 'Comedy', 'Crime','Documentary','Drama','Fantasy',
            'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
movies = pd.read_csv(u_item_src,
                    sep='|',
                    names=i_cols,
                    encoding='latin-1')
movies = movies.set_index('movie_id')

# rating 데이터
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src,
                        sep='\t',
                        names=r_cols,
                        encoding='latin-1')

# RMSE 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

# score(RMSE) 계산
def score(model):
    # 테스트 데이터의 user_id와 movie_id간 pair를 맞춰 튜플형 원소 리스트데이터를 만듦.
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # 모든 사용자-영화 짝에서 대해서 주어진 예측 모델에 의해 예측값 계산 및 리스트형 데이터 생성
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    # 실제 평점값
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)


###### 데이터셋 만들기 ######
x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

ratings_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

##### 코사인 유사도 계산 #####
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
# pd.DataFrame(user_similarity)
user_similarity = pd.DataFrame(user_similarity,
                                index=ratings_matrix.index,
                                columns=ratings_matrix.index)


#### 주어진 영화의 (movie_id) 가중 평균 rating을 계산하는 함수 ####
def CF_simple(user_id, movie_id):
    if movie_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = ratings_matrix[movie_id].copy()
        none_ratings_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_ratings_idx)
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()

    else:
        mean_rating = 3.0
    return mean_rating

#### 정확도 계산 ####
score(CF_simple)

1.0173773693331338

# 3.4 이웃을 고려한 CF

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

##### 데이터 불러오기 및 데이터셋 만들기 #####
#  user데이터
base_src = './Data'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_src,
                    sep = '|',
                    names = u_cols,
                    encoding='latin-1')
users = users.set_index('user_id')


# movie 데이터
u_item_src = os.path.join(base_src, 'u.item')
i_cols = ['movie_id', 'title','release date', 'video release date',
            'IMDB URL', 'unknown', 'Action','Adventure','Animation',
            'Children\'s', 'Comedy', 'Crime','Documentary','Drama','Fantasy',
            'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
movies = pd.read_csv(u_item_src,
                    sep='|',
                    names=i_cols,
                    encoding='latin-1')
movies = movies.set_index('movie_id')

# rating 데이터
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src,
                        sep='\t',
                        names=r_cols,
                        encoding='latin-1')

# RMSE 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

########################################################################################
# score(RMSE) 계산
def score(model, neighbor_size=0):      # neighbor_size 지정 
    # 테스트 데이터의 user_id와 movie_id간 pair를 맞춰 튜플형 원소 리스트데이터를 만듦.
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # 모든 사용자-영화 짝에서 대해서 주어진 예측 모델에 의해 예측값 계산 및 리스트형 데이터 생성
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    # 실제 평점값
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

########################################################################################
x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

ratings_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

##### train set의 모든 가능한 사용자의 pair의 cosine similarity 계산 #####
# 코사인 유사도 계산하는 사이킷런의 라이브러리
from sklearn.metrics.pairwise import cosine_similarity
# 코사인 유사도를 구하기 위해 rating값을 복사하고, 계산 시 NaN값 에러 대비를 위해 결측치 0으로 대체
matrix_dummy = ratings_matrix.copy().fillna(0)
# 모든 사용자 간 코사인 유사도를 구함.
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
# 필요한 값 조회를 위해 인덱스 컬럼명 지정
user_similarity = pd.DataFrame(user_similarity,
                                index=ratings_matrix.index,
                                columns=ratings_matrix.index)
########################################################################################
##### Neighbor size를 정해서 예측치를 계산하는 함수 #####
def CF_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = ratings_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)

        if neighbor_size == 0:
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0
    return mean_rating

# 정확도 계산
score(CF_knn, neighbor_size=30)

1.0108871337570486

In [6]:
#### 실제 주어진 사용자에 대해 추천을 받는 기능 구현 ####
ratings_matrix = x_train.pivot(index='user_id',
                                 columns='movie_id',
                                 values='rating')

matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                                index=ratings_matrix.index,
                                columns=ratings_matrix.index)
def recom_movie(user_id, n_items, neighbor_size=30):
    user_movie = ratings_matrix.loc[user_id].copy()
    for movie in ratings_matrix.columns:
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0

        else:
            user_movie.loc[movie] = CF_knn(user_id, movie, neighbor_size)


    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=729, n_items=5, neighbor_size=30)

  mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()


movie_id
1293                         Star Kid (1997)
1467    Saint of Fort Washington, The (1993)
1189                      Prefontaine (1997)
1491                 Tough and Deadly (1995)
1466                Margaret's Museum (1995)
Name: title, dtype: object

# 3.5 최적의 이웃 크기 결정

In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

##### 데이터 불러오기 및 데이터셋 만들기 #####
base_src = './Data'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_src,
                    sep = '|',
                    names = u_cols,
                    encoding='latin-1')
users = users.set_index('user_id')

u_item_src = os.path.join(base_src, 'u.item')
i_cols = ['movie_id', 'title','release date', 'video release date',
            'IMDB URL', 'unknown', 'Action','Adventure','Animation',
            'Children\'s', 'Comedy', 'Crime','Documentary','Drama','Fantasy',
            'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
movies = pd.read_csv(u_item_src,
                    sep='|',
                    names=i_cols,
                    encoding='latin-1')
movies = movies.set_index('movie_id')

u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src,
                        sep='\t',
                        names=r_cols,
                        encoding='latin-1')

# RMSE 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

# score(RMSE) 계산
def score(model, neighbor_size=0):      # neighbor_size 지정 
    # 테스트 데이터의 user_id와 movie_id간 pair를 맞춰 튜플형 원소 리스트데이터를 만듦.
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # 모든 사용자-영화 짝에서 대해서 주어진 예측 모델에 의해 예측값 계산 및 리스트형 데이터 생성
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    # 실제 평점값
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

########################################################################################

ratings_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                                index=ratings_matrix.index,
                                columns=ratings_matrix.index)

def CF_knn(user_id, movie_id, neighbor_size=0):
    # train/test set의 분할에 따라 rating_matrix에 영화가 있는지 확인
    if movie_id in ratings_matrix.columns:
        # 주어진 사용자(user_id)와 다른 사용자의 유사도 추출
        sim_scores = user_similarity[user_id].copy()
        # 주어진 영화(movie_id)와 다른 사용자의 유사도 추출
        movie_ratings = ratings_matrix[movie_id].copy()
        # 주어진 영화에 대해서 평가를 하지 않은 사용자를 가중평균계상에서 제외하기 위해 인덱스 추출
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        # 주어진 영화에 대해서 평가를 하지 않은 사람 제외
        movie_ratings = movie_ratings.dropna()
        # 주어진 영화를 평가하지 않은 사용자와의 유사도를 제거, 가중 평균 계산할때 필요가 없기 때문
        sim_scores = sim_scores.drop(none_rating_idx)


        #### Neighbot size가 지정되지 않은 경우 ####
        if neighbor_size == 0:
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        #### Neighbot size가 지정된 경우 ####
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
    # train/test set의 분할에 따라 rating_matrix에 해당 영화가 없으면 기본값 3.0예측치로 간주
    else:
        mean_rating = 3.0
    return mean_rating
# neighbor_size가 10, 20, 30, 40, 50, 60인 경우에 대해서 RMSE를 계산하고 이를 출력한다.
for neighbor_size in [10,20,30,40,50,60]:
    print('Neighbor Size = %d : RMSE = %.4f'%(neighbor_size, score(CF_knn, neighbor_size)))

Neighbor Size = 10 : RMSE = 1.0314
Neighbor Size = 20 : RMSE = 1.0165
Neighbor Size = 30 : RMSE = 1.0128
Neighbor Size = 40 : RMSE = 1.0124
Neighbor Size = 50 : RMSE = 1.0128
Neighbor Size = 60 : RMSE = 1.0137


# 3.6 사용자의 평가 경향을 고려한 CF

In [16]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

##### 데이터 불러오기 및 데이터셋 만들기 #####
base_src = './Data'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_src,
                    sep = '|',
                    names = u_cols,
                    encoding='latin-1')
users = users.set_index('user_id')

u_item_src = os.path.join(base_src, 'u.item')
i_cols = ['movie_id', 'title','release date', 'video release date',
            'IMDB URL', 'unknown', 'Action','Adventure','Animation',
            'Children\'s', 'Comedy', 'Crime','Documentary','Drama','Fantasy',
            'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
movies = pd.read_csv(u_item_src,
                    sep='|',
                    names=i_cols,
                    encoding='latin-1')
movies = movies.set_index('movie_id')

u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src,
                        sep='\t',
                        names=r_cols,
                        encoding='latin-1')

def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

def score(model, neighbor_size=0):      # neighbor_size 지정 
    # 테스트 데이터의 user_id와 movie_id간 pair를 맞춰 튜플형 원소 리스트데이터를 만듦.
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # 모든 사용자-영화 짝에서 대해서 주어진 예측 모델에 의해 예측값 계산 및 리스트형 데이터 생성
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    # 실제 평점값
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)
ratings_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                                index=ratings_matrix.index,
                                columns=ratings_matrix.index)
#### 사용자 평가 경향을 고려한 함수 ####
rating_mean = ratings_matrix.mean(axis=1)
rating_bias = (ratings_matrix.T - rating_mean).T # T는 transforms

# 사용자 평가 경향을 고려한 함수
def CF_knn_bias(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_bias[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)

        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = prediction + rating_mean[user_id]
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = prediction + rating_mean[user_id]

            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    return prediction

score(CF_knn_bias, 30)

0.9476824815031243

In [14]:
rating_bias

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1671,1672,1674,1675,1677,1678,1679,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.372549,-0.627451,0.372549,-0.627451,-0.627451,,,,1.372549,-0.627451,...,,,,,,,,,,
2,0.255319,,,,,,,,,-1.744681,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.099237,0.099237,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,0.729730,,...,,,,,,,,,,
940,,,,-1.525000,,,0.475,1.475,,,...,,,,,,,,,,
941,1.000000,,,,,,0.000,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


# 3.7 그 외의 CF 정확도 개선 방법

In [28]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

##### 데이터 불러오기 및 데이터셋 만들기 #####
base_src = './Data'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_src,
                    sep = '|',
                    names = u_cols,
                    encoding='latin-1')
users = users.set_index('user_id')

u_item_src = os.path.join(base_src, 'u.item')
i_cols = ['movie_id', 'title','release date', 'video release date',
            'IMDB URL', 'unknown', 'Action','Adventure','Animation',
            'Children\'s', 'Comedy', 'Crime','Documentary','Drama','Fantasy',
            'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
movies = pd.read_csv(u_item_src,
                    sep='|',
                    names=i_cols,
                    encoding='latin-1')
movies = movies.set_index('movie_id')

u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src,
                        sep='\t',
                        names=r_cols,
                        encoding='latin-1')

def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

def score(model, neighbor_size=0):      # neighbor_size 지정 
    # 테스트 데이터의 user_id와 movie_id간 pair를 맞춰 튜플형 원소 리스트데이터를 만듦.
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # 모든 사용자-영화 짝에서 대해서 주어진 예측 모델에 의해 예측값 계산 및 리스트형 데이터 생성
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    # 실제 평점값
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                                index=ratings_matrix.index,
                                columns=ratings_matrix.index)

rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T # T는 transforms

########################################################################################

rating_binary_1 = np.array(rating_matrix > 0).astype(float)
rating_binary_2 = rating_binary_1.T

counts = np.dot(rating_binary_1, rating_binary_2)
counts = pd.DataFrame(counts,
                        index = rating_matrix.index,
                        columns=rating_matrix.index).fillna(0)

def CF_knn_bias_sig(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_bias[movie_id].copy()

        no_rating = movie_ratings.isnull()              # null값인 것들을 True로 설정
        common_counts = counts[user_id]                 # 공통평가 영화수
        low_significance = common_counts < SIG_LEVEL    # 공통평가 영화수가 미리 정해진 숫자보다 작은 사용자를 TRUE로 표시
        none_rating_idx = movie_ratings[no_rating | low_significance].index

        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)

        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = prediction + rating_mean[user_id]
        else:
            if len(sim_scores) > MIN_RATINGS:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = prediction +rating_mean[user_id]
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]

    # 예측 값이 0.6이다. 
    if prediction <= 1:
        prediction = 1
    elif prediction >= 5:
        prediction = 5
    # 5.2 >= 5 계산을 좀더 단순하게.
    return prediction

SIG_LEVEL = 3
MIN_RATINGS = 3

score(CF_knn_bias_sig, 30)

0.9398002913540942

In [24]:
rating_bias

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1672,1674,1677,1678,1679,1680,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,-0.642157,0.357843,-0.642157,-0.642157,1.357843,0.357843,-2.642157,1.357843,-0.642157,...,,,,,,,,,,
2,0.361702,,,,,,,,,-1.638298,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.145038,0.145038,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,0.675676,,...,,,,,,,,,,
940,,,,,,,,1.612500,-0.387500,,...,,,,,,,,,,
941,0.812500,,,,,,-0.187500,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


# 3.8 사용자 기반 CF와 아이템 기반 CF

In [36]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

##### 데이터 불러오기 및 데이터셋 만들기 #####
base_src = './Data'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_src,
                    sep = '|',
                    names = u_cols,
                    encoding='latin-1')
users = users.set_index('user_id')

u_item_src = os.path.join(base_src, 'u.item')
i_cols = ['movie_id', 'title','release date', 'video release date',
            'IMDB URL', 'unknown', 'Action','Adventure','Animation',
            'Children\'s', 'Comedy', 'Crime','Documentary','Drama','Fantasy',
            'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
movies = pd.read_csv(u_item_src,
                    sep='|',
                    names=i_cols,
                    encoding='latin-1')
movies = movies.set_index('movie_id')

u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src,
                        sep='\t',
                        names=r_cols,
                        encoding='latin-1')

def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

def score(model):      # neighbor_size 지정 
    # 테스트 데이터의 user_id와 movie_id간 pair를 맞춰 튜플형 원소 리스트데이터를 만듦.
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # 모든 사용자-영화 짝에서 대해서 주어진 예측 모델에 의해 예측값 계산 및 리스트형 데이터 생성
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    # 실제 평점값
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

########################################################################################
rating_matrix_t = np.transpose(rating_matrix)

matrix_dummy = rating_matrix_t.copy().fillna(0)

item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
item_similarity = pd.DataFrame(item_similarity,
                                index = rating_matrix_t.index,
                                columns = rating_matrix_t.index)

def CF_IBCF(user_id, movie_id):
    if movie_id in item_similarity.columns:
        sim_scores = item_similarity[movie_id]
        user_rating = rating_matrix_t[user_id]
        none_rating_idx = user_rating[user_rating.isnull()].index
        user_rating = user_rating.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
    else:
        mean_rating = 3.0
    
    return mean_rating
score(CF_IBCF)

1.0196572278928668

In [32]:
item_similarity

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1673,1674,1676,1677,1679,1680,1681
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.291130,0.260815,0.387949,0.188804,0.120631,0.455862,0.342607,0.361525,0.155987,...,0.0,0.0,0.055995,0.041996,0.0,0.00000,0.000000,0.0,0.0,0.055995
2,0.291130,1.000000,0.220749,0.393555,0.277135,0.045543,0.253821,0.192784,0.183228,0.119749,...,0.0,0.0,0.089642,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.089642
3,0.260815,0.220749,1.000000,0.240068,0.182427,0.080277,0.263677,0.136441,0.209873,0.112720,...,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.036037,0.0,0.0,0.000000
4,0.387949,0.393555,0.240068,1.000000,0.228883,0.058704,0.384913,0.333897,0.313956,0.169524,...,0.0,0.0,0.065310,0.000000,0.0,0.10885,0.043540,0.0,0.0,0.065310
5,0.188804,0.277135,0.182427,0.228883,1.000000,0.047729,0.233474,0.256426,0.190654,0.020761,...,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1676,0.000000,0.000000,0.000000,0.108850,0.000000,0.000000,0.000000,0.077267,0.064889,0.097745,...,0.0,0.0,0.000000,0.000000,0.0,1.00000,0.000000,0.0,0.0,0.000000
1677,0.000000,0.000000,0.036037,0.043540,0.000000,0.000000,0.058887,0.096583,0.081111,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.00000,1.000000,0.0,0.0,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.000000,1.0,1.0,0.000000
1680,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.000000,1.0,1.0,0.000000


# 3.9 추천 시스템의 성과측정지표