## Python을 이용한 개인화 추천시스템
저자: 임일

출판사: 청람

### 데이터 읽기

In [1]:
# Read the u.user file into a DataFrame; the file is parsed as '|'-separated
# and the column names are supplied explicitly through `names`.
import pandas as pd

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'C:/Users/ms964/OneDrive/문서/GitHub/STUDY/Recommendation_System/My_Study/Python을 이용한 개인화 추천시스템/Data/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [2]:
# Read the u.item file into a DataFrame; the file is parsed as '|'-separated
# and the column names are supplied explicitly through `names`.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    'C:/Users/ms964/OneDrive/문서/GitHub/STUDY/Recommendation_System/My_Study/Python을 이용한 개인화 추천시스템/Data/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)
movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Read the u.data file into a DataFrame; the file is parsed as tab-separated
# and the column names are supplied explicitly through `names`.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    'C:/Users/ms964/OneDrive/문서/GitHub/STUDY/Recommendation_System/My_Study/Python을 이용한 개인화 추천시스템/Data/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
# The timestamp column is dropped; only user_id, movie_id and rating remain.
ratings = ratings.drop(columns='timestamp')
# Keep only the id and title columns of the movie table.
movies = movies[['movie_id', 'title']]

ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


### 사용자 집단별 추천
- 비슷한 특성을 가진 사용자들을 묶어 소집단을 만든 다음, 각 집단의 평점 평균을 바탕으로 추천하는 방법

In [4]:
# Split the ratings into a train set (75%) and a test set (25%).
from sklearn.model_selection import train_test_split

x = ratings.copy()
y = ratings['user_id']
# user_id is passed as the stratification label, so the split is stratified
# per user.
# NOTE(review): no random_state is given, so the split (and every RMSE
# computed from it) differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
)

In [5]:
# RMSE
import numpy as np
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equally long sequences.

    Both arguments are converted with ``np.array`` before the element-wise
    difference is taken.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

def score(model):
    """Return the RMSE of ``model`` over the rows of ``x_test``.

    ``model`` is called as ``model(user_id, movie_id)`` for every row of the
    module-level ``x_test`` frame, and the predictions are compared with the
    ``rating`` column of that frame.
    """
    predictions = np.array([
        model(user, movie)
        for user, movie in zip(x_test['user_id'], x_test['movie_id'])
    ])
    actual = np.array(x_test['rating'])
    return RMSE(actual, predictions)

# Build the full user x movie rating matrix from the training data
# (rows: user_id, columns: movie_id, cells: rating).
rating_matrix = x_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)

### 전체 평균으로 예측치를 계산하는 기본 모델

In [6]:
def best_seller(user_id, movie_id):
    """Predict a rating as the movie's mean rating stored in ``train_mean``.

    Args:
        user_id: Unused; accepted so the function can be called as
            ``model(user, movie)`` by ``score``.
        movie_id: Label looked up in the module-level ``train_mean``.

    Returns:
        ``train_mean[movie_id]``, or 3.0 when ``movie_id`` has no entry.
    """
    try:
        rating = train_mean[movie_id]
    except KeyError:
        # Only a missing movie triggers the fallback. Any other failure
        # (for example ``train_mean`` not being defined yet) now surfaces
        # instead of silently turning every prediction into 3.0.
        rating = 3.0
    return rating

# Mean training rating per movie; best_seller looks this up at call time.
train_mean = x_train.groupby('movie_id')['rating'].mean()
# RMSE of the per-movie-mean baseline on the test set.
score(best_seller)

1.0223598035470736

In [7]:
# Merge the training ratings with the user table (joined on their shared columns).
merged_ratings = x_train.merge(users)
# Index the user table by user_id so a user's row can be read with users.loc[user_id].
# NOTE(review): `users` is rebound here, so re-running this cell would merge
# against the already-indexed frame - confirm the cells are run once, in order.
users = users.set_index('user_id')
# Mean rating for every (movie_id, sex) pair.
g_mean = merged_ratings.groupby(['movie_id', 'sex'])['rating'].mean()

# gender기준 추천

def cf_gender(user_id, movie_id):
    """Predict a rating as the movie's mean rating among users of the same sex.

    Reads the module-level ``rating_matrix``, ``users`` and ``g_mean``.
    Returns 3.0 when the movie is not a column of ``rating_matrix`` or when
    ``g_mean`` holds no entry for the user's sex on that movie.
    """
    # Movie absent from the training matrix: nothing to average.
    if movie_id not in rating_matrix:
        return 3.0

    gender = users.loc[user_id, 'sex']
    per_gender_means = g_mean[movie_id]
    # No training rating from this gender for the movie.
    if gender not in per_gender_means:
        return 3.0
    return per_gender_means[gender]

# RMSE of the gender-based model on the test set.
score(cf_gender)

1.0310459043825386

### train set의 모든 가능한 사용자 pair의 Cosine similarities 계산

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
# Replace missing ratings with 0 before computing the similarities.
matrix_dummy = rating_matrix.fillna(0)
# Pairwise cosine similarity between user rows, labelled by user_id on both axes.
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)

# 주어진 영화의 가중평균 rating 계산: 가중치로 사용자와 다른 사용자와의 유사도(user_similarity)를 사용한다.
def CF_simple(user_id, movie_id):
    """Predict a rating as a similarity-weighted mean of other users' ratings.

    Reads the module-level ``rating_matrix`` and ``user_similarity``. The
    weights are the similarities between ``user_id`` and every user who has
    a rating for ``movie_id`` in ``rating_matrix``.

    Args:
        user_id: Label of the target user in ``user_similarity``.
        movie_id: Column label of the movie in ``rating_matrix``.

    Returns:
        The weighted mean rating, or 3.0 when the movie or the user is
        unknown, or when the weights sum to zero.
    """
    default_rating = 3.0
    # Unknown movie or unknown user: no data to base a prediction on.
    if movie_id not in rating_matrix or user_id not in user_similarity:
        return default_rating

    # Ratings of the users who actually rated this movie.
    movie_ratings = rating_matrix[movie_id].dropna()
    # Similarities between the target user and exactly those raters,
    # in the same order as movie_ratings.
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    total_similarity = sim_scores.sum()
    # If the target user has zero similarity with every rater, the weighted
    # mean would be 0/0 = NaN, and one NaN makes the overall RMSE NaN.
    if total_similarity == 0:
        return default_rating
    return np.dot(sim_scores, movie_ratings) / total_similarity

# score(CF_simple)

# Pearson correlation coefficient로 예측하기

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.126945,0.044690,0.033718,0.259750,0.365391,0.329686,0.209300,0.057143,0.273881,...,0.308651,0.131980,0.227988,0.114826,0.192550,0.065393,0.278568,0.158990,0.145366,0.326977
2,0.126945,1.000000,0.106481,0.127885,0.000000,0.196967,0.067861,0.132953,0.200063,0.122286,...,0.098520,0.105016,0.222971,0.320495,0.188805,0.195971,0.164562,0.086736,0.108669,0.086225
3,0.044690,0.106481,1.000000,0.291508,0.000000,0.065194,0.055199,0.076330,0.000000,0.034258,...,0.032586,0.056071,0.130055,0.086082,0.133233,0.000000,0.155331,0.083122,0.134585,0.034283
4,0.033718,0.127885,0.291508,1.000000,0.000000,0.049764,0.074689,0.189553,0.129524,0.042745,...,0.054188,0.048348,0.090576,0.202434,0.102979,0.000000,0.150984,0.000000,0.068082,0.060812
5,0.259750,0.000000,0.000000,0.000000,1.000000,0.204742,0.273048,0.190505,0.037528,0.163662,...,0.286372,0.027736,0.020245,0.052083,0.076228,0.033460,0.182884,0.116944,0.135570,0.199540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.065393,0.195971,0.000000,0.000000,0.033460,0.115047,0.066453,0.072842,0.053092,0.081538,...,0.039093,0.290729,0.155616,0.029872,0.284313,1.000000,0.065881,0.079323,0.022833,0.150496
940,0.278568,0.164562,0.155331,0.150984,0.182884,0.272205,0.250235,0.201721,0.142729,0.257870,...,0.215828,0.086655,0.115959,0.154806,0.137933,0.065881,1.000000,0.067901,0.162699,0.173332
941,0.158990,0.086736,0.083122,0.000000,0.116944,0.172509,0.046353,0.140614,0.081341,0.018856,...,0.049003,0.131165,0.194534,0.204420,0.259098,0.079323,0.067901,1.000000,0.062966,0.107408
942,0.145366,0.108669,0.134585,0.068082,0.135570,0.255656,0.218482,0.146160,0.059002,0.156131,...,0.154962,0.020647,0.017683,0.079919,0.050126,0.022833,0.162699,0.062966,1.000000,0.142115
