<a href="https://colab.research.google.com/github/stebechoi/CP2/blob/YJ/DeepFM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from itertools import repeat
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Concatenate, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# Colab-only step: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Directory holding the MovieLens 100K files.
data_path = '/content/drive/MyDrive/CP2/ml-100k/'

# u.data: tab-separated rating rows; the timestamp column is dropped right after loading.
ratings_df = pd.read_csv(
    data_path + 'u.data',
    sep='\t',
    names=['userId', 'movieId', 'rating', 'timestamp'],
).drop(columns='timestamp')

# u.genre: pipe-separated genre name / genre id pairs.
genre_data = pd.read_csv(data_path + 'u.genre', sep='|', names=['genre', 'genre_id'])

# u.item: pipe-separated, latin-1 encoded. Only movieId and the genre indicator
# columns are kept; descriptive columns and the 'unknown' genre flag are dropped.
item_df = pd.read_csv(
    data_path + 'u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    names=['movieId', 'movie_title', 'release_date', 'video_release_date',
           'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
           'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama',
           'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
           'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
).drop(columns=['unknown', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL'])

# u.user: pipe-separated user attributes; zip_code is dropped.
users_df = pd.read_csv(
    data_path + 'u.user',
    sep='|',
    names=['userId', 'age', 'gender', 'occupation', 'zip_code'],
).drop(columns='zip_code')

In [5]:
#gender
# One-hot encode gender into "gender_<value>" indicator columns and swap them in
# for the raw column (appended at the end of users_df).
genders_df = pd.get_dummies(users_df["gender"], prefix="gender")
users_df = pd.concat([users_df.drop(columns="gender"), genders_df], axis=1)

def convert_age(x):
  """Return the age-bracket label for a numeric age.

  Brackets are half-open on the right: ages below 18 map to 'under 18',
  [18, 25) to '18-24', [25, 35) to '25-34', [35, 45) to '35-44' and
  [45, 55) to '45-54'. Anything else (55 and above, or a value for which
  every comparison is false) maps to 'over 55'.
  """
  # (exclusive upper bound, label) pairs, checked in ascending order.
  brackets = (
    (18, 'under 18'),
    (25, '18-24'),
    (35, '25-34'),
    (45, '35-44'),
    (55, '45-54'),
  )
  for upper_bound, label in brackets:
    if x < upper_bound:
      return label
  return 'over 55'

In [6]:
#genres
# Genre indicator columns only: item_df without its leading movieId column.
genres_df = item_df.drop(columns="movieId")

In [7]:
#age
# Bucket each numeric age with convert_age, one-hot encode the bucket labels and
# swap the indicator columns in for the raw age column.
age_brackets = users_df["age"].apply(convert_age)
ages_df = pd.get_dummies(age_brackets)
users_df = pd.concat([users_df.drop(columns="age"), ages_df], axis=1)

In [8]:
#occupation
# One-hot encode occupation (one indicator column per distinct value) and swap
# the indicators in for the raw column.
occupation_df = pd.get_dummies(users_df["occupation"])
users_df = pd.concat([users_df.drop(columns="occupation"), occupation_df], axis=1)

In [9]:
# Attach user features, then item (genre) features, to every rating row.
# The join keys are spelled out: without `on=`, merge joins on every column name the
# two frames share, so an accidental name collision between feature columns would
# silently change the join. validate="many_to_one" makes pandas raise if a userId or
# movieId appears more than once on the right-hand side (which would duplicate ratings).
ratings_df = ratings_df.merge(users_df, how="left", on="userId", validate="many_to_one")
ratings_df = ratings_df.merge(item_df, how="left", on="movieId", validate="many_to_one")

# A left join leaves NaN for ratings whose user or movie has no feature row;
# fail loudly instead of feeding NaN into training.
assert not ratings_df.isna().any().any(), "unmatched userId/movieId produced NaN features"

# The model consumes a single dense float matrix.
ratings_df = ratings_df.astype(float)

In [10]:
#binary target
# Remove the rating column from the feature frame (in place) and keep it as the label source.
target = ratings_df.pop('rating')
# Ratings of 4 or more become the positive class (1.0); everything else is 0.0.
binary_target = target.ge(4.0).astype(float)

In [12]:
#fields
# Names of all input columns, in the order they appear in ratings_df.
fields = ratings_df.columns.tolist()
num_fields = len(fields)

# Field -> list of the input columns that belong to it.
field_name = {
    "userId": ["userId"],
    "movieId": ["movieId"],
    "gender": genders_df.columns.tolist(),
    "age": ages_df.columns.tolist(),
    "occupation": occupation_df.columns.tolist(),
    "genres": genres_df.columns.tolist(),
}

In [13]:
#embedding lookup index
# Position -> field name, following the insertion order of field_name.
field_dict = dict(enumerate(field_name))

# One entry per input column: the index of the field that column belongs to,
# repeated once for every column of that field.
embedding_lookup_index = [
    field_index
    for field_index, field_key in field_dict.items()
    for _ in field_name[field_key]
]

In [None]:
# #FM part
# class wide_part(keras.layers.Layer):
#     def __init__(self, V, num_fields, embedding_lookup_index, **kwargs):
#         super().__init__(self, **kwargs)
#         self.V = V
#         self.num_fields = num_fields
#         self.embedding_lookup_index = embedding_lookup_index

#     def build(self, input_shape):
#         w_init = tf.random_normal_initializer()

#         self.W = tf.Variable(initial_value=w_init(shape=[input_shape[-1]]),
#                              dtype='float32',name = "W")
#         self.V = tf.Variable(initial_value=w_init(shape=[self.num_fields, self.V]),
#                              dtype="float32",name= "V")

#     def call(self, inputs):
#         #embeds와 (batch_size, num_feature, embedding_size) - feature tensor
#         x_batch = keras.layers.Reshape((inputs.shape[-1], 1))(inputs)
#         # 인덱스에 해당하는 임베딩 벡터를 찾는 과정에서 field_index라능 2차원 벡터를 사용하여 3차원 텐서가 생성됨
#         embeddings_lookup_table = tf.nn.embedding_lookup(params=self.V, ids=self.embedding_lookup_index)
#         # (50,V) --> embedding_lookup_table
#         # x_batch, embeds broadcasting (vx)
#         embedded_fields = tf.math.multiply(x_batch, embeddings_lookup_table)
#         # element-wise after broadcasting to (None,50,1) --> (None,50,V)

#         order_1_output = tf.reduce_sum(tf.math.multiply(inputs, self.W), axis=1)
#         #         elementwise after broadcasting (None,50) x (50) = None,50
#         #         reduce_sum == (None,)

#         embed_sum = tf.reduce_sum(embedded_fields, [1, 2])
#         # (None,50,V) == > (None,)
#         embed_square = tf.square(embedded_fields)
#         # (None,50,V) ==> (None,50,V)
#         square_of_sum = tf.square(embed_sum)
#         # (None,) == > (None,)
#         sum_of_square = tf.reduce_sum(embed_square, [1, 2])
#         # (None,50,V) == > (None, )
#         order_2_output = 0.5 * tf.subtract(square_of_sum, sum_of_square)
#         # (None,) ==> (None,)
#         order_1_output = keras.layers.Reshape([1])(order_1_output)
#         # (None,) ==> (None,1)
#         order_2_output = keras.layers.Reshape([1])(order_2_output)
#         # (None,) ==> (None,1)
#         wide_output = keras.layers.Concatenate(axis=1)([order_1_output, order_2_output])
#         # (None,2)

#         return wide_output, embedded_fields


In [15]:
#FM part
class wide_part(keras.layers.Layer):
    """Factorization-machine (FM) component: first-order and pairwise second-order terms.

    Every input column is mapped to the latent vector of the field it belongs to
    (columns of the same field share one row of ``V``) and scaled by its value.

    Args:
        num_feature: Number of input columns (size of the last input axis).
        num_fields: Number of fields, i.e. number of rows of the latent matrix ``V``.
        embedding_size: Dimension ``k`` of each latent vector.
        embedding_lookup_index: Sequence with one entry per input column giving the
            row of ``V`` (the field index) that column uses.
        **kwargs: Forwarded to ``keras.layers.Layer`` (for example ``name``).

    Raises:
        ValueError: If ``embedding_lookup_index`` does not have ``num_feature`` entries.
    """

    def __init__(self, num_feature, num_fields, embedding_size, embedding_lookup_index, **kwargs):
        super(wide_part, self).__init__(**kwargs)
        if len(embedding_lookup_index) != num_feature:
            raise ValueError(
                "embedding_lookup_index must have one entry per feature: "
                f"got {len(embedding_lookup_index)} entries for num_feature={num_feature}"
            )
        self.num_feature = num_feature
        self.num_fields = num_fields
        self.embedding_size = embedding_size
        self.embedding_lookup_index = embedding_lookup_index
        # First-order weights, one per input column.
        self.w = tf.Variable(tf.random.normal(shape=[num_feature],
                                              mean=0.0, stddev=1.0), name='w')
        # Latent vectors, one row per field.
        self.V = tf.Variable(tf.random.normal(shape=(num_fields, embedding_size),
                                              mean=0.0, stddev=0.01), name='V')

    def call(self, inputs):
        """Compute the FM terms for a batch.

        Args:
            inputs: Tensor of shape (batch, num_feature) holding the feature values.

        Returns:
            Tuple ``(wide_output, embedded_fields)``. ``wide_output`` has shape
            (batch, 2): the first-order term and the second-order term side by side.
            ``embedded_fields`` has shape (batch, num_feature, embedding_size): each
            column's latent vector multiplied by the column's value.
        """
        # Make the input dtype match the weights so the element-wise products are valid.
        inputs = tf.cast(inputs, self.w.dtype)

        # (batch, num_feature) -> (batch, num_feature, 1) so it broadcasts against the latent table.
        x_batch = tf.expand_dims(inputs, axis=-1)
        # Gather each column's field vector: (num_feature, embedding_size).
        embeddings_lookup_table = tf.nn.embedding_lookup(params=self.V, ids=self.embedding_lookup_index)
        # v_i * x_i for every column: (batch, num_feature, embedding_size).
        embedded_fields = tf.math.multiply(x_batch, embeddings_lookup_table)

        # First-order term: sum_i w_i * x_i -> (batch, 1).
        order_1_output = tf.reduce_sum(tf.math.multiply(inputs, self.w), axis=1, keepdims=True)

        # Second-order term via the FM identity
        #   sum_{i<j} <v_i, v_j> x_i x_j
        #     = 0.5 * sum_f [ (sum_i v_{i,f} x_i)^2 - sum_i (v_{i,f} x_i)^2 ].
        # The sum over columns must be taken separately for every latent dimension f
        # and only then summed over f; collapsing both axes before squaring would mix
        # different latent dimensions and no longer equal the pairwise interactions.
        square_of_sum = tf.square(tf.reduce_sum(embedded_fields, axis=1))   # (batch, k)
        sum_of_square = tf.reduce_sum(tf.square(embedded_fields), axis=1)   # (batch, k)
        order_2_output = 0.5 * tf.reduce_sum(
            tf.subtract(square_of_sum, sum_of_square), axis=1, keepdims=True
        )  # (batch, 1)

        # (batch, 2)
        wide_output = tf.concat([order_1_output, order_2_output], axis=1)

        return wide_output, embedded_fields

In [None]:
# #Deep part
# class deep_part(keras.layers.Layer):
#     def __init__(self, layer_list=[128, 64, 32], dropout_rate=0.5, activation="relu", **kwargs):
#         super().__init__(**kwargs)
#         self.activaiton_fn = keras.activations.get(activation)
#         self.dropout_rate = dropout_rate
#         self.dense_layer_list = [keras.layers.Dense(num_neuron, activation=self.activaiton_fn, name =f'Dense_{index}') for index,num_neuron in
#                                  enumerate(layer_list)]
#         self.output_layer = keras.layers.Dense(1, activation="relu",name = "deep_output")

#     def call(self, inputs):
#         embed_2d = inputs
#         # (None,50,V)
#         embed_2d = keras.layers.Flatten(name='flat_embed')(embed_2d)
#         # (None,50 * V)
#         result = embed_2d
#         for layer in self.dense_layer_list:
#             result = keras.layers.Dropout(self.dropout_rate)(result)
#             result = layer(result)

#         deep_result = self.output_layer(result)
#         #(None,1)
#         return deep_result

In [None]:
# #DeepFM
# class deep_FM(keras.Model):
#     def __init__(self, V, num_fields, embbeding_lookup_index, layer_list=[128, 64, 32], dropout_rate=0.5,
#                  activation="relu"):
#         super().__init__(**kwargs)
#         self.wide_part = wide_part(V, num_fields, embbeding_lookup_index)
#         self.deep_part = deep_part(layer_list, dropout_rate, activation)
#         self.output_layer = keras.layers.Dense(1, activation="sigmoid",name = "final_output")

#     def call(self, inputs):
#         # inputs = (None,50)
#         wide_output, embeddings = self.wide_part(inputs)
#         deep_output = self.deep_part(embeddings)

#         concat = keras.layers.Concatenate(axis=1)([wide_output, deep_output])
#         wide_deep_output = self.output_layer(concat)
#         return wide_deep_output

In [16]:
#DeepFM
class deep_FM(keras.Model):
    """DeepFM model: an FM (wide) layer plus a small dense network over its embeddings.

    The FM layer produces both the wide output and the per-feature embeddings.
    The embeddings are flattened and passed through Dense(64) -> Dropout ->
    Dense(16) -> Dropout -> Dense(2); the result is concatenated with the wide
    output and fed to a single sigmoid unit.

    Args:
        num_feature: Number of input columns.
        num_fields: Number of grouped fields.
        embedding_lookup_index: Field index of every input column.
        embedding_size: Dimension of each embedding vector.
    """

    def __init__(self, num_feature, num_fields, embedding_lookup_index, embedding_size):
        super().__init__()
        self.embedding_size = embedding_size      # k: dimension of each embedding vector
        self.num_feature = num_feature            # f: number of original features
        self.num_fields = num_fields              # m: number of grouped fields
        self.embedding_lookup_index = embedding_lookup_index

        # FM component; also the source of the embeddings used by the deep component.
        self.fm_layer = wide_part(num_feature, num_fields, embedding_size, embedding_lookup_index)

        # Deep component.
        self.layers1 = tf.keras.layers.Dense(units=64, activation='relu')
        self.dropout1 = tf.keras.layers.Dropout(rate=0.2)
        self.layers2 = tf.keras.layers.Dense(units=16, activation='relu')
        self.dropout2 = tf.keras.layers.Dropout(rate=0.2)
        self.layers3 = tf.keras.layers.Dense(units=2, activation='relu')

        # Combines the wide and deep outputs into one probability.
        self.final = tf.keras.layers.Dense(units=1, activation='sigmoid')

    def call(self, inputs):
        """Return one sigmoid output per input row as a rank-1 tensor."""
        fm_out, field_embeddings = self.fm_layer(inputs)

        # Flatten the embeddings to (batch, num_feature * embedding_size).
        flat_width = self.num_feature * self.embedding_size
        hidden = tf.reshape(field_embeddings, [-1, flat_width])

        # Deep component: apply the dense/dropout stack in order.
        deep_stack = (self.layers1, self.dropout1, self.layers2, self.dropout2, self.layers3)
        for layer in deep_stack:
            hidden = layer(hidden)

        # Join wide and deep outputs and map them to a probability.
        combined = tf.concat([fm_out, hidden], 1)
        probabilities = self.final(combined)

        # Flatten (batch, 1) to (batch,).
        return tf.reshape(probabilities, [-1])

In [17]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore the reported metrics, reproducible across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    ratings_df, binary_target, test_size=0.2, random_state=42
)

In [18]:
# Seed TensorFlow's global RNG before building the model so the randomly drawn
# initial weights (and other TF-side randomness) are repeatable between runs.
tf.random.set_seed(42)

model = deep_FM(
    num_feature=len(embedding_lookup_index),   # one entry per input column
    num_fields=len(field_dict),                # one entry per field
    embedding_size=5,
    embedding_lookup_index=embedding_lookup_index,
)

In [19]:
# Precision metric restricted to the top_k=5 highest-scoring predictions.
# NOTE(review): the model returns a flat (batch,) tensor, so top_k is presumably applied
# across the samples of a batch rather than per user — confirm this is the intended metric.
precision = tf.keras.metrics.Precision(top_k =5)
# Binary classification set-up: Adam optimizer with binary cross-entropy loss.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[precision])

In [20]:
# Keep the returned History object so per-epoch loss/metric values stay available
# for inspection or plotting instead of being discarded as the cell's repr.
history = model.fit(x_train, y_train, epochs=10, batch_size=64, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa07b354d60>