In [40]:
# Created or modified on May 2022
# Author: 임일
# 협업필터링(CF) 추천 - Item-based CF

import numpy as np
import pandas as pd

# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('C:/RecoSys/Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('C:/RecoSys/Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

# train, test 데이터 분리
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# 정확도(RMSE)를 계산하는 함수 
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수 
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    print("예측", y_pred)
    y_true = np.array(x_test['rating'])
    print("실제", y_true)
    return RMSE(y_true, y_pred)

# train 데이터로 Full matrix 구하기  
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

##### (1)

# train set의 모든 가능한 아이템 pair의 Cosine similarities 계산
from sklearn.metrics.pairwise import cosine_similarity
rating_matrix_t = np.transpose(rating_matrix)
rating_matrix_t

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,4.0,,,4.0,,,,,4.0,...,2.0,3.0,,,4.0,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,4.0,,,,,,,
4,3.0,,,,,,5.0,,,4.0,...,,,,,,,2.0,,,
5,3.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,,,,,,,,,,,...,,,,,,,,,,
1678,,,,,,,,,,,...,,,,,,,,,,
1679,,,,,,,,,,,...,,,,,,,,,,
1680,,,,,,,,,,,...,,,,,,,,,,


In [41]:
matrix_dummy = rating_matrix_t.copy().fillna(0)
item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
item_similarity = pd.DataFrame(item_similarity, index=rating_matrix_t.index, columns=rating_matrix_t.index)
item_similarity

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1675,1676,1677,1678,1679,1680,1681
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.251944,0.231045,0.325678,0.198347,0.065793,0.446585,0.359255,0.344677,0.196717,...,0.0,0.056155,0.0,0.0,0.0,0.042116,0.0,0.0,0.0,0.056155
2,0.251944,1.000000,0.169848,0.400007,0.229929,0.063082,0.294967,0.204108,0.188761,0.112918,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
3,0.231045,0.169848,1.000000,0.226514,0.169398,0.120705,0.256107,0.114273,0.209995,0.096161,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
4,0.325678,0.400007,0.226514,1.000000,0.257015,0.072492,0.393013,0.384134,0.309758,0.204237,...,0.0,0.063500,0.0,0.0,0.0,0.042333,0.0,0.0,0.0,0.063500
5,0.198347,0.229929,0.169398,0.257015,1.000000,0.033776,0.261630,0.232430,0.201826,0.055278,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0.042116,0.000000,0.000000,0.042333,0.000000,0.000000,0.058665,0.095416,0.083624,0.000000,...,0.0,0.000000,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.000000
1678,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,1.0,1.0,1.0,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,1.0,1.0,1.0,0.000000
1680,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,1.0,1.0,1.0,0.000000


In [42]:

# 주어진 영화의 (movie_id) 가중평균 rating을 계산하는 함수, 
# 가중치는 주어진 아이템과 다른 아이템 간의 유사도(item_similarity)
def CF_IBCF(user_id, movie_id):
    if movie_id in item_similarity:      # 현재 영화가 train set에 있는지 확인
        # 현재 영화와 다른 영화의 similarity 값 가져오기
        sim_scores = item_similarity[movie_id]
        # 현 사용자의 모든 rating 값 가져오기
        user_rating = rating_matrix_t[user_id]
        # 사용자가 평가하지 않은 영화 index 가져오기
        non_rating_idx = user_rating[user_rating.isnull()].index
        # 사용자가 평가하지 않은 영화 제거
        user_rating = user_rating.dropna()
        # 사용자가 평가하지 않은 영화의 similarity 값 제거
        sim_scores = sim_scores.drop(non_rating_idx)
        # 현 영화에 대한 예상 rating 계산, 가중치는 현 영화와 사용자가 평가한 영화의 유사도
        mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
    else:
        mean_rating = 3.0
    return mean_rating

# 정확도 계산
#score(CF_IBCF)

In [43]:
def recommender(user, n_items):
    # 현재 사용자의 모든 아이템에 대한 예상 평점 계산
    predictions = []
    rated_index = rating_matrix.loc[user][rating_matrix.loc[user] > 0].index    # 이미 평가한 영화 확인
    items = rating_matrix.loc[user].drop(rated_index)
    for item in items.index:
        predictions.append(CF_IBCF(user, item))                   # 예상평점 계산
    recommendations = pd.Series(data=predictions, index=items.index, dtype=float)
    recommendations = recommendations.sort_values(ascending=False)[:n_items]    # 예상평점이 가장 높은 영화 선택
    recommended_items = movies.loc[recommendations.index]['title']
    return recommended_items

recommender(user=2, n_items=50)

movie_id
1593                                       Everest (1998)
1537                                   All Over Me (1997)
1122                    Last Time I Saw Paris, The (1954)
1555                                 Condition Red (1995)
1358                                Boys in Venice (1996)
711                                        Tin Men (1987)
1201          Maybe, Maybe Not (Bewegte Mann, Der) (1994)
1420                  My Crazy Life (Mi vida loca) (1993)
1548                                     Dream Man (1995)
1530    Far From Home: The Adventures of Yellow Dog (1...
1519                                     Fear, The (1995)
1564                                         Daens (1992)
1580                        Woman in Question, The (1950)
1579                                      Liebelei (1933)
1577                           Collectionneuse, La (1967)
1576    Death in the Garden (Mort en ce jardin, La) (1...
1559                 Clean Slate (Coup de Torchon) (1981)
1574 

In [45]:
IBCF_items=recommender(user=2, n_items=50)
IBCF_items

movie_id
1593                                       Everest (1998)
1537                                   All Over Me (1997)
1122                    Last Time I Saw Paris, The (1954)
1555                                 Condition Red (1995)
1358                                Boys in Venice (1996)
711                                        Tin Men (1987)
1201          Maybe, Maybe Not (Bewegte Mann, Der) (1994)
1420                  My Crazy Life (Mi vida loca) (1993)
1548                                     Dream Man (1995)
1530    Far From Home: The Adventures of Yellow Dog (1...
1519                                     Fear, The (1995)
1564                                         Daens (1992)
1580                        Woman in Question, The (1950)
1579                                      Liebelei (1933)
1577                           Collectionneuse, La (1967)
1576    Death in the Garden (Mort en ce jardin, La) (1...
1559                 Clean Slate (Coup de Torchon) (1981)
1574 

In [46]:
IBCF_items = pd.DataFrame(IBCF_items)
IBCF_items

Unnamed: 0_level_0,title
movie_id,Unnamed: 1_level_1
1593,Everest (1998)
1537,All Over Me (1997)
1122,"Last Time I Saw Paris, The (1954)"
1555,Condition Red (1995)
1358,Boys in Venice (1996)
711,Tin Men (1987)
1201,"Maybe, Maybe Not (Bewegte Mann, Der) (1994)"
1420,My Crazy Life (Mi vida loca) (1993)
1548,Dream Man (1995)
1530,Far From Home: The Adventures of Yellow Dog (1...


In [47]:
IBCF_items.to_csv('IBCF 평점순 정렬 5점~1점.csv', encoding='latin-1')

In [None]:

IBCF_items = pd.DataFrame(IBCF_items)
IBCF_items

In [48]:
IBCF = pd.read_csv('IBCF 평점순 정렬 5점~1점.csv', encoding='latin-1')
IBCF=IBCF["movie_id"]
IBCF_list = IBCF.values.tolist()
len(IBCF_list)


50

In [49]:
UBCF = pd.read_csv('UBCF 평점순 정렬 5점~1점.csv', encoding='latin-1')
UBCF = UBCF["movie_id"]
UBCF_list = UBCF.values.tolist()
len(UBCF_list)

50

In [50]:
# IBCF_list와 UBCF_list의 교집합 구하기
intersection = list(set(IBCF_list) & set(UBCF_list))
len(intersection)




5