In [1]:
import numpy as np
import pandas as pd

# Embedding the tags

## Load tags

In [2]:
# tags.dat has no header row and separates its fields with the two-character
# delimiter "::". pandas interprets multi-character separators as regular
# expressions, which only the Python parsing engine supports; naming the engine
# explicitly avoids the ParserWarning emitted when the C engine falls back.
tags_df = pd.read_table(
    "ml-10M100K/tags.dat",
    sep="::",
    engine="python",
    names=["userId", "movieId", "tag", "timestamp"],
)

  tags_df = pd.read_table("ml-10M100K/tags.dat", sep="::", names=["userId", "movieId", "tag", "timestamp"])


In [3]:
# Preview the first rows to check that the four supplied column names line up
# with the "::"-separated fields.
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835


In [4]:
# Only movieId and tag are used from here on. Rebinding the name (instead of
# mutating with inplace=True) and ignoring already-missing columns keeps this
# cell re-runnable: a second execution no longer raises KeyError.
tags_df = tags_df.drop(columns=["userId", "timestamp"], errors="ignore")

In [5]:
# Confirm that only the movieId and tag columns remain.
tags_df.head()

Unnamed: 0,movieId,tag
0,4973,excellent!
1,1747,politics
2,1747,satire
3,2424,chick flick 212
4,2424,hanks


In [6]:
# Build one "document" per movie by joining all of its tags with spaces, in
# their original row order. Aggregating once per group yields exactly one row
# per movieId directly, instead of broadcasting the joined string to every
# input row and then discarding the copies with drop_duplicates.
# NOTE(review): astype(str) turns a missing tag into the literal string "nan",
# which would then become a vocabulary token - confirm whether tags.dat
# contains missing values.
tags_df = (
    tags_df["tag"].astype(str)
    .groupby(tags_df["movieId"])
    .agg(" ".join)
    .reset_index()
)

In [7]:
# Order the per-movie documents by movieId and renumber the rows from 0,
# discarding the old index, then preview the result.
tags_df = tags_df.sort_values(by="movieId", axis=0).reset_index(drop=True)
tags_df.head()

Unnamed: 0,movieId,tag
0,1,Pixar Pixar Pixar animation Pixar animated fun...
1,2,For children game animals Joe Johnston Robin W...
2,3,Funniest Movies comedinha de velhinhos engraÃ§...
3,4,girl movie
4,5,steve martin pregnancy remake steve martin fam...


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation. A fixed random_state makes the split
# reproducible, so the fitted vocabulary, the trained model and the pickled
# train/test frames all refer to the same partition on every run.
tags_train_df, tags_test_df = train_test_split(tags_df, test_size=0.1, random_state=42)

In [9]:
# (rows, columns) of the held-out split; test_size=0.1 reserves 10% of the rows.
tags_test_df.shape

(761, 2)

## Vectorizing the Tags

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Unigram-only TF-IDF. The vocabulary and IDF weights are learned from the
# training documents alone; the same call also returns their sparse encoding.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
train_tag_docs = tags_train_df["tag"]
X_train_tags = vectorizer.fit_transform(train_tag_docs)

In [12]:
# (documents, vocabulary size) of the TF-IDF matrix; the second value is used
# below as the autoencoder's input and output width.
X_train_tags.shape

(6840, 11352)

## Autoencoder

In [13]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, InputLayer, Dropout, BatchNormalization
# tf.test.is_gpu_available() is deprecated; list the physical GPU devices
# instead and reduce the list to a boolean so the cell still displays True/False.
bool(tf.config.list_physical_devices("GPU"))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [16]:
# Symmetric fully connected autoencoder over the TF-IDF tag vectors: the layer
# widths shrink to 200 units in the middle and then widen again.
n_features = X_train_tags.shape[1]  # vocabulary size = input and output width
hidden_units = [3000, 1000, 500, 200, 500, 1000, 3000]

tags_auto_encoder = keras.models.Sequential()
# The input shape is given as a tuple (the documented form) rather than a bare int.
tags_auto_encoder.add(InputLayer((n_features,)))
tags_auto_encoder.add(BatchNormalization())
tags_auto_encoder.add(Dropout(0.2))
for units in hidden_units:
    # Every hidden stage is Dense(ReLU) -> BatchNormalization -> Dropout(0.3).
    tags_auto_encoder.add(Dense(units, activation="relu"))
    tags_auto_encoder.add(BatchNormalization())
    tags_auto_encoder.add(Dropout(0.3))
# Linear output layer with one unit per vocabulary term (the reconstruction).
tags_auto_encoder.add(Dense(n_features))

In [17]:
# Print the layer-by-layer output shapes and parameter counts.
tags_auto_encoder.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_10 (Batc (None, 11352)             45408     
_________________________________________________________________
dropout_10 (Dropout)         (None, 11352)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 3000)              34059000  
_________________________________________________________________
batch_normalization_11 (Batc (None, 3000)              12000     
_________________________________________________________________
dropout_11 (Dropout)         (None, 3000)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 1000)              3001000   
_________________________________________________________________
batch_normalization_12 (Batc (None, 1000)             

In [18]:
# Train with Nadam (default settings) on a mean-squared-error reconstruction loss.
# NOTE(review): the only metric is the same MSE function as the loss, so the two
# values reported by fit/evaluate are identical.
tags_auto_encoder.compile(optimizer=keras.optimizers.Nadam(), loss=keras.losses.MSE, metrics=[keras.metrics.mse])

In [19]:
# Write the still-untrained weights to an HDF5 file before fitting.
# NOTE(review): nothing visible here reloads this file - presumably kept so
# training can be restarted from the same initialisation; confirm.
tags_auto_encoder.save_weights("init_weights_tags_auto_encoder.h5")


In [20]:
from livelossplot import PlotLossesKerasTF
import tensorboard

In [21]:
import time
import os
def get_board_path(name: str = "") -> str:
    """Return a timestamped TensorBoard log directory path for one run.

    The path has the form ``./tensorboard/<UTC timestamp>_<name>``, so calls
    made at different seconds yield distinct run directories.

    The time fields are separated by hyphens rather than colons because a colon
    is not a legal character in a Windows path component.

    Args:
        name: Label appended after the timestamp. With the default empty string
            the path simply ends in the separating underscore.

    Returns:
        The relative path as a string. Nothing is created on disk here.
    """
    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.gmtime())
    return os.path.join(".", "tensorboard", timestamp + "_" + name)

In [22]:
# loss_plot_cb = PlotLossesKerasTF()
# Each run logs to its own timestamped TensorBoard directory.
tags_ae_log_dir = get_board_path("tags_ae")
# Stop once the monitored quantity has not improved for 5 epochs and roll the
# model back to its best weights.
early_cb = keras.callbacks.EarlyStopping(restore_best_weights=True, patience=5)
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=tags_ae_log_dir)

In [23]:
# Densify the sparse TF-IDF matrix once and reuse the same array as both the
# input and the reconstruction target. Calling .todense() twice materialised
# two full dense copies, and it returns the discouraged np.matrix type;
# .toarray() gives a plain ndarray.
X_train_dense = X_train_tags.toarray()
tags_auto_encoder.fit(
    x=X_train_dense,
    y=X_train_dense,
    batch_size=16,
    epochs=30,
    callbacks=[early_cb, tensorboard_cb],
    validation_split=0.07,  # fraction of the training rows held out for validation
    shuffle=True,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f0180f59070>

In [28]:
# Encode the held-out documents with the vocabulary fitted on the training set,
# then measure the reconstruction error. The matrix is densified once (as a
# plain ndarray rather than np.matrix) and reused as both input and target.
X_test = vectorizer.transform(tags_test_df["tag"])
X_test_dense = X_test.toarray()
tags_auto_encoder.evaluate(X_test_dense, X_test_dense)



[8.19939814391546e-05, 8.19939814391546e-05]

In [29]:
# Export the trained model to a directory; the log line below shows an assets/
# sub-directory being written under this path.
tags_auto_encoder.save("movielens10_tags_embedding/auto_encoder")

INFO:tensorflow:Assets written to: movielens10_tags_embedding/auto_encoder/assets


In [30]:
# Persist the full per-movie tag table and both splits next to the saved model.
pickle_targets = {
    "movielens10_tags_embedding/tags_all.pkl": tags_df,
    "movielens10_tags_embedding/tags_test.pkl": tags_test_df,
    "movielens10_tags_embedding/tags_train.pkl": tags_train_df,
}
for target_path, frame in pickle_targets.items():
    frame.to_pickle(target_path)

In [31]:
from joblib import dump, load
# Serialise the fitted TF-IDF vectorizer next to the saved model; dump returns
# the list of file names it wrote, which is what the cell displays.
dump(vectorizer, "movielens10_tags_embedding/vectorizer.joblib")


['movielens10_tags_embedding/vectorizer.joblib']