In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.layers import Embedding, Input, Flatten, Multiply, Concatenate, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

In [2]:
import os

# Directory that holds the MovieLens "ml-latest-small" CSV files (relative to this notebook).
data_path = '../../../data/ml-latest-small/'

# Load the MovieLens ratings table. os.path.join keeps the path valid even if
# data_path is later edited to drop its trailing slash.
ratings_df = pd.read_csv(os.path.join(data_path, 'ratings.csv'))

In [3]:
# Preview the first rows of the raw ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Fit an encoder on the raw movie ids, then overwrite the column with the
# encoder's integer codes so it can be used to index an embedding table.
movie_enc = LabelEncoder().fit(ratings_df['movieId'])
ratings_df['movieId'] = movie_enc.transform(ratings_df['movieId'])

In [5]:
# After LabelEncoder, movieId holds integer category codes rather than the raw MovieLens ids.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,0,4.0,964982703
1,1,2,4.0,964981247
2,1,5,4.0,964982224
3,1,43,5.0,964983815
4,1,46,5.0,964982931


In [6]:
# Count the distinct user ids that appear in the ratings table.
n_users = ratings_df['userId'].nunique()

In [7]:
# Show the number of distinct users.
n_users

610

In [8]:
# Count the distinct (label-encoded) movie ids that appear in the ratings table.
n_movies = ratings_df['movieId'].nunique()

In [9]:
# Show the number of distinct movies.
n_movies

9724

In [10]:
# List the distinct rating values present in the data.
ratings_df['rating'].unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [11]:
# Display the ratings table before the rating column is binarized.
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,0,4.0,964982703
1,1,2,4.0,964981247
2,1,5,4.0,964982224
3,1,43,5.0,964983815
4,1,46,5.0,964982931
...,...,...,...,...
100831,610,9416,4.0,1493848402
100832,610,9443,5.0,1493850091
100833,610,9444,5.0,1494273047
100834,610,9445,5.0,1493846352


In [12]:
# Binarize the explicit ratings: 1 when the rating is at least 4, otherwise 0.
# A vectorized comparison replaces the per-row Python loop and yields the same
# int64 labels.
# NOTE: this overwrites the 'rating' column in place, so re-running the cell on
# already-binarized data would turn every label into 0; reload ratings_df first.
ratings_df['rating'] = (ratings_df['rating'] >= 4).astype('int64')

In [13]:
# Display the ratings table after binarization; 'rating' is now 0 or 1.
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,0,1,964982703
1,1,2,1,964981247
2,1,5,1,964982224
3,1,43,1,964983815
4,1,46,1,964982931
...,...,...,...,...
100831,610,9416,1,1493848402
100832,610,9443,1,1493850091
100833,610,9444,1,1494273047
100834,610,9445,1,1493846352


In [14]:
# Hyperparameters of the NeuMF model
latent_dim_gmf = 64  # embedding width of the GMF (Generalized Matrix Factorization) branch
latent_dim_mlp = 64  # embedding width of the MLP (Multi-Layer Perceptron) branch
num_hidden_layers = 5  # number of Dense hidden layers stacked in the MLP branch

In [15]:
# Seed NumPy and TensorFlow before the first layer is created so that weight
# initialization and batch shuffling are repeatable when the notebook is run
# top-to-bottom on a fresh kernel (hardware-level nondeterminism may remain).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Input layers
# Each sample carries exactly one user index and one movie index, hence shape=(1,).
user_input = Input(shape=(1,), dtype='int32', name='user_input')
movie_input = Input(shape=(1,), dtype='int32', name='movie_input')

In [16]:
# Embedding layers for the GMF branch: one trainable vector of size
# latent_dim_gmf per user index and per movie index.
# `input_length` was dropped: it is deprecated in current Keras releases, and the
# sequence length is already fixed by the (1,)-shaped Input tensors, so the
# output shape is still (None, 1, latent_dim_gmf).
embedding_user_gmf = Embedding(input_dim=n_users, output_dim=latent_dim_gmf,
                               name='user_embedding_gmf')(user_input)

embedding_movie_gmf = Embedding(input_dim=n_movies, output_dim=latent_dim_gmf,
                                name='movie_embedding_gmf')(movie_input)

In [17]:
# Inspect the symbolic output of the GMF user embedding (shape and dtype).
embedding_user_gmf

<KerasTensor: shape=(None, 1, 64) dtype=float32 (created by layer 'user_embedding_gmf')>

In [18]:
# Embedding layers for the MLP branch, kept separate from the GMF embeddings so
# each branch learns its own latent vectors.
# `input_length` was dropped: it is deprecated in current Keras releases, and the
# sequence length is already fixed by the (1,)-shaped Input tensors.
embedding_user_mlp = Embedding(input_dim=n_users, output_dim=latent_dim_mlp,
                               name='user_embedding_mlp')(user_input)
embedding_movie_mlp = Embedding(input_dim=n_movies, output_dim=latent_dim_mlp,
                                name='movie_embedding_mlp')(movie_input)

In [19]:
# Drop the length-1 sequence axis of every embedding: (None, 1, dim) -> (None, dim).
# The iteration order is kept so Keras assigns the same automatic layer names
# (flatten, flatten_1, ...) as before.
user_latent_gmf, movie_latent_gmf, user_latent_mlp, movie_latent_mlp = [
    Flatten()(embedded)
    for embedded in (embedding_user_gmf, embedding_movie_gmf,
                     embedding_user_mlp, embedding_movie_mlp)
]

In [20]:
# Inspect the flattened GMF user vector.
user_latent_gmf

<KerasTensor: shape=(None, 64) dtype=float32 (created by layer 'flatten')>

In [23]:
# Inspect the flattened GMF movie vector.
movie_latent_gmf

<KerasTensor: shape=(None, 64) dtype=float32 (created by layer 'flatten_1')>

In [21]:
# GMF branch: element-wise product of the user and movie latent vectors.
gmf_product = Multiply()
gmf_vector = gmf_product([user_latent_gmf, movie_latent_gmf])

In [22]:
# Inspect the GMF interaction vector.
gmf_vector

<KerasTensor: shape=(None, 64) dtype=float32 (created by layer 'multiply')>

In [26]:
# MLP branch: join the user and movie latent vectors, then apply the first
# hidden layer.
concat_vector_mlp = Concatenate()([user_latent_mlp, movie_latent_mlp])  # (None, 128)
first_hidden = Dense(units=64, activation='relu')
mlp_vector = first_hidden(concat_vector_mlp)  # (None, 64)

In [28]:
# Inspect the concatenated MLP input vector.
concat_vector_mlp

<KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'concatenate')>

In [27]:
# Inspect the output of the first MLP hidden layer.
mlp_vector

<KerasTensor: shape=(None, 64) dtype=float32 (created by layer 'dense')>

In [29]:
# Stack the remaining hidden layers on top of the first Dense layer so the MLP
# branch ends up with num_hidden_layers layers in total.
# NOTE: each run appends layers to the current mlp_vector, so re-running this
# cell on its own deepens the stack further.
for _layer_idx in range(1, num_hidden_layers):
    mlp_vector = Dense(units=64, activation='relu')(mlp_vector)

In [30]:
# Inspect the output of the last MLP hidden layer.
mlp_vector

<KerasTensor: shape=(None, 64) dtype=float32 (created by layer 'dense_4')>

In [31]:
# NeuMF head: join the GMF and MLP branch outputs and map them through a
# single sigmoid unit.
neumf_vector = Concatenate()([gmf_vector, mlp_vector])
prediction_layer = Dense(units=1, activation='sigmoid')
output = prediction_layer(neumf_vector)

In [32]:
# Assemble the two-input (user, movie) -> one-output functional model.
model = Model(inputs=[user_input, movie_input], outputs=output)

In [33]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 movie_input (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 user_embedding_mlp (Embedding)  (None, 1, 64)       39040       ['user_input[0][0]']             
                                                                                                  
 movie_embedding_mlp (Embedding  (None, 1, 64)       622336      ['movie_input[0][0]']            
 )                                                                                            

In [34]:
# Binary cross-entropy matches the 0/1 labels; AUC is tracked during training.
auc_metric = tf.keras.metrics.AUC(name='auc')
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=[auc_metric],
)

In [36]:
# Render the model graph; this needs the optional pydot and graphviz packages
# (without them Keras only reports the installation hint shown in the output).
tf.keras.utils.plot_model(model)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [37]:
# Build zero-based index arrays for the embedding lookups.
# User ids are re-indexed through a sorted factorization instead of `userId - 1`,
# so the codes always fall in [0, n_users) even if the ids are not the contiguous
# range 1..n_users; for contiguous ids the result is identical.
# movieId was already label-encoded earlier in the notebook.
# NOTE: these arrays reuse the names of the Keras Input tensors that `model`
# already captured; rebuilding the Model after this cell requires re-running
# the Input-layer cell first.
user_input = pd.factorize(ratings_df['userId'], sort=True)[0]
movie_input = ratings_df['movieId'].values
labels = ratings_df['rating'].values

In [38]:
# Hold out 20% of the interactions for evaluation (fixed random_state for a repeatable split).
(user_input_train, user_input_test,
 movie_input_train, movie_input_test,
 y_train, y_test) = train_test_split(
    user_input, movie_input, labels,
    test_size=0.2, random_state=42,
)

In [39]:
# Train for 5 epochs, evaluating on the held-out split after each epoch.
history = model.fit(
    x=[user_input_train, movie_input_train],
    y=y_train,
    validation_data=([user_input_test, movie_input_test], y_test),
    epochs=5,
    verbose=1,
)

Epoch 1/5


2023-10-30 01:30:00.083738: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [40]:
# Show the History object returned by model.fit.
history

<keras.callbacks.History at 0x16b952cb0>

In [42]:
# Predict scores for the held-out (user, movie) pairs.
y_pred = model.predict(x=[user_input_test, movie_input_test])



In [43]:
# Score the held-out predictions with scikit-learn's ROC AUC.
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred)
print('AUC score : ', auc_score)

AUC score :  0.7419294178076211
