### 회원이 평가한 점수를 바탕으로  [영화별/성별 평균 점수]와의 오차를 산출하여 [영화별/성별 평균 점수]가 얼마나 대표성이 있는지 확인
- 1. 훈련 데이터를 이용하여 영화별/성별 평균 평점을 산출함
- 2. 검증 데이터에서 회원의 실제 평점을 추출함
- 3. 검증 데이터에서 회원의 실제 평점과 훈련 데이터에서 산출한 영화별/성별 평균 평점과의 오차(RMSE)를 계산하며, 0에 가까울수록 좋은 값

In [115]:
# 분석 절차(PDCNLDNSAER)
# 1. Package import
# 2. Data loading
# 3. Column select: 구조 파악 및 필요한 컬럼 선별
# 4. NaN: 결측치 처리
# 5. Label encoding: 범주형 변수의 변환
# 6. Derivative variable: 파생 변수 만들기
# 7. Normal: 정규화
# 8. Split: 데이터 분할
# 9. Analysis: 분석, 모델 제작
# 10. Evaluation: 평가
# 11. Result save: 결과 저장, 모델 저장

In [116]:
# NPTLSMRRXX
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
# from xgboost import XGBClassifier
# from xgboost import XGBRegressor

In [117]:
# Load the users, movies and ratings tables from ./data.
# Column names are supplied explicitly through `names=`.
# NOTE(review): u.user is decoded as latin-1 while u.item and u.data are
# decoded as utf-8 — confirm this matches the actual file encodings.
u_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('./data/u.user', names=u_cols, sep='|', encoding='latin-1')

# Movie columns: identifying fields first, then what appear to be
# per-genre columns.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv('./data/u.item', names=i_cols, sep='|', encoding='utf-8')

r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./data/u.data', names=r_cols, sep='\t', encoding='utf-8')

In [118]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   gender      943 non-null    object
 3   occupation  943 non-null    object
 4   zip_code    943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB


In [119]:
ratings.head(3)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


In [120]:
ratings.describe() # timestamp ?

Unnamed: 0,user_id,movie_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [121]:
# Drop the timestamp column; the last expression previews the result.
ratings = ratings.drop(columns='timestamp')
ratings.head(3)

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1


In [122]:
movies.head(3)

Unnamed: 0,movie_id,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [123]:
# Keep only the movie_id and title columns.
keep_cols = ['movie_id', 'title']
movies = movies.loc[:, keep_cols]
movies.head(3)

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)


In [124]:
# 8. Split: prepare the inputs for the train/validation split.
# x: a full copy of the ratings table.
x = ratings.copy()
# y: the user_id of every rating row (passed as `y` and `stratify` to the split).
y = ratings['user_id']

# Preview both objects, x first.
for preview in (x, y):
    print(preview.head())

   user_id  movie_id  rating
0      196       242       3
1      186       302       3
2       22       377       1
3      244        51       2
4      166       346       1
0    196
1    186
2     22
3    244
4    166
Name: user_id, dtype: int64


In [125]:
# Hold out 25% of the rows for validation, stratified on y, with a fixed seed.
splits = train_test_split(x, y, test_size=0.25, stratify=y, random_state=0)
x_train, x_val, y_train, y_val = splits

# Report the shapes in the order x_train, x_val, y_train, y_val.
for part in splits:
    print(part.shape)

(75000, 3)
(25000, 3)
(75000,)
(25000,)


In [126]:
x_train.head()

Unnamed: 0,user_id,movie_id,rating
93435,472,419,4
95019,866,303,4
80776,829,294,2
75349,924,632,4
78986,854,188,4


In [127]:
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [128]:
users[users['user_id'] == 472]

Unnamed: 0,user_id,age,gender,occupation,zip_code
471,472,24,M,student,87544


In [129]:
# Attach each user's profile columns to the training ratings (join key: user_id).
merged_ratings = x_train.merge(users, on='user_id')
merged_ratings.head()

Unnamed: 0,user_id,movie_id,rating,age,gender,occupation,zip_code
0,472,419,4,24,M,student,87544
1,472,27,4,24,M,student,87544
2,472,175,5,24,M,student,87544
3,472,121,5,24,M,student,87544
4,472,568,5,24,M,student,87544


In [130]:
# Training step: a simple model that only stores mean ratings.
# Mean training rating for every (movie_id, gender) pair.
# Fix: the merged frame is named `merged_ratings`; `merged_rating` is never
# defined in this notebook and raised NameError on a fresh run.
train_mean = merged_ratings.groupby(['movie_id', 'gender'])['rating'].mean()
train_mean.head()  # Series indexed by a ('movie_id', 'gender') MultiIndex

movie_id  gender
1         F         3.752941
          M         3.872428
2         F         3.352941
          M         3.170732
3         F         2.636364
Name: rating, dtype: float64

In [131]:
# Reshape the training ratings into a user-by-movie table:
# one row per user_id, one column per movie_id, cell values taken from rating.
# Cells with no rating show up as NaN in the preview.
rating_matrix = x_train.pivot(index='user_id', columns=['movie_id'], values='rating')
rating_matrix.head(3)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1667,1669,1670,1672,1673,1675,1678,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


In [132]:
# merged_ratings columns: user_id, movie_id, rating, age, gender, occupation, zip_code
# `in` on a DataFrame tests its column labels: True when the column exists.
for column_name in ('user_id', 'none'):
    print(column_name in merged_ratings)

True
False


In [133]:
# Inspect the per-gender means stored for movie_id 1.
movie_1_means = train_mean[1]
print(movie_1_means)
print('-' * 30)
# `in` on a Series tests its index labels: True when the gender is present.
for label in ('F', 'M', 'A'):
    print(label in movie_1_means)
print('-' * 30)
for label in ('F', 'M'):
    print(movie_1_means[label])

gender
F    3.752941
M    3.872428
Name: rating, dtype: float64
------------------------------
True
True
False
------------------------------
3.7529411764705882
3.8724279835390947


In [134]:
users.head() # user_id가 index로 안되어 있어 index로 지정

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [135]:
# Make user_id the row index so a user can be looked up with users.loc[user_id].
users = users.set_index(keys='user_id')
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [136]:
# Predictor: mean training rating for the (movie_id, gender) pair.
def movie_id_gender_mean(user_id, movie_id, default=3.0):
    """Predict a rating as the training mean for this movie and the user's gender.

    Relies on module-level objects built earlier in the notebook:
    ``rating_matrix`` (membership test on ``movie_id``), ``users`` (indexed by
    user_id, provides ``gender``) and ``train_mean`` (mean rating indexed by
    movie_id, then gender).

    Args:
        user_id: Index label of the user in ``users``.
        movie_id: Movie identifier to predict for.
        default: Value returned when no training mean is available. Defaults
            to 3.0, the fallback that used to be hard-coded in both branches.

    Returns:
        The stored mean for (movie_id, gender), or ``default`` when the movie
        is not a column of ``rating_matrix`` or has no entry for the user's
        gender.

    Raises:
        KeyError: If ``user_id`` is not a label in ``users``.
    """
    # `in` on a DataFrame tests column labels, i.e. movies present in rating_matrix.
    if movie_id not in rating_matrix:
        return default

    # Single label-based lookup instead of chained users.loc[user_id]['gender'],
    # which built a full row Series on every call.
    gender = users.loc[user_id, 'gender']

    # Fetch the per-gender means of this movie once and reuse them.
    movie_means = train_mean[movie_id]
    if gender in movie_means:
        return movie_means[gender]

    # The movie is known but has no entry for this gender.
    return default

print(movie_id_gender_mean(1, 1)) # user_id, movie_id
print(movie_id_gender_mean(2, 1))

3.8724279835390947
3.7529411764705882


In [137]:
# Accuracy metric (RMSE).
def rmse(y_real, y_pred):
    """Return the root-mean-square error between actual and predicted values.

    Both inputs are converted to float arrays before subtracting, so small or
    unsigned integer dtypes (e.g. ``uint8``) cannot wrap around in the
    difference or in the square.

    Args:
        y_real: Array-like of actual values.
        y_pred: Array-like of predicted values, broadcastable against ``y_real``.

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    actual = np.asarray(y_real, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    return np.sqrt(np.mean((actual - predicted) ** 2))

In [138]:
# Compute the RMSE of a model on the validation split; x_val must already exist.
def score(model):
    """Return the RMSE of ``model`` over the rows of the module-level ``x_val``.

    Args:
        model: Callable invoked as ``model(user_id, movie_id)`` that returns a
            predicted rating.

    Returns:
        The value of ``rmse(actual, predicted)``, where ``actual`` is the
        ``rating`` column of ``x_val`` and ``predicted`` holds one model
        output per ``x_val`` row.
    """
    # Ratings the users actually gave in the validation split.
    actual = np.array(x_val['rating'])

    # One prediction per validation row, in row order.
    predicted = np.array([
        model(uid, mid)
        for uid, mid in zip(x_val['user_id'], x_val['movie_id'])
    ])
    print('y_pred.shape:', predicted.shape)

    # Error between the actual ratings and the model's predictions.
    return rmse(actual, predicted)

score(movie_id_gender_mean)
# 모델 1: 영화별 전체 평점 평균, 0.996007224010567
# 모델 2: 데이터를 분리한 경우, 1.0266715908206363
# 모델 3: movie_id, gender별 평균: 1.0331474849241846
# 영화 id별/gender별로 분리하여 평균을 산출하면 오차가 개선될 것이라는 가정은 실패함.

y_pred.shape: (25000,)


1.0331474849241846

In [None]:
# occupation 별 오차 산출
# gender, occupation 별 오차 산출