In [1]:
import pandas as pd
import numpy as np
import gzip
import json

## Loading the Data

In [2]:
def parse(path):
  """Yield one decoded JSON object per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The object produced by ``json.loads`` for each non-blank line, in
    file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager releases the gzip handle as soon as the generator is
  # exhausted or closed; previously the file was never closed explicitly.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Skip blank lines (e.g. a trailing newline) instead of crashing on them.
      if l.strip():
        yield json.loads(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the stream; that position
  becomes the row label, and the record's keys become the columns.

  Args:
    path: File location forwarded unchanged to ``parse``.

  Returns:
    A ``pandas.DataFrame`` with one row per parsed record.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

In [3]:
# Read the gzip-compressed, one-JSON-object-per-line review file into a
# DataFrame (one row per line).
df = getDF("Digital_Music_5.json.gz")

In [4]:
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5.0,3.0,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,{'Format:': ' Audio CD'},Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1370217600,
1,5.0,,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,{'Format:': ' Audio CD'},Ad,bien,Five Stars,1412985600,
2,5.0,,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,{'Format:': ' Audio CD'},JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,1392076800,
3,4.0,3.0,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,{'Format:': ' Audio CD'},john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,1386374400,
4,5.0,,True,"06 12, 2016",A3OH43OZJLKI09,5557706259,{'Format:': ' Audio CD'},melinda a goodman,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",1465689600,


In [5]:
# Keep a random subset of at most 50,000 reviews, in random order, with a
# fresh 0..n-1 index. Drawing the rows directly replaces the earlier
# "shuffle everything, then drop the tail in place" two-step, and the fixed
# random_state makes the subset identical on every Restart & Run All.
# NOTE(review): the recorded `df.shape` output below shows 40000 rows, which
# does not match this 50,000 cap -- re-run the notebook to refresh it.
df = df.sample(n=min(50000, len(df)), random_state=42).reset_index(drop=True)

In [6]:
df.shape

(40000, 12)

## Cleaning the Data

In [7]:
# Remove seven columns in place. Going by the preview of the loaded frame,
# this leaves overall, vote, asin, reviewText and summary.
df.drop(
    columns=[
        "verified",
        "reviewTime",
        "reviewerID",
        "reviewerName",
        "unixReviewTime",
        "image",
        "style",
    ],
    inplace=True,
)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Replace missing values: empty string for the two text columns, 0 for votes.
# The fill is applied to the DataFrame as a whole because the previous
# `df[col].fillna(..., inplace=True)` form mutates a temporary Series: pandas
# warns about it and, with Copy-on-Write enabled, `df` is left unchanged.
df = df.fillna({"summary": "", "reviewText": "", "vote": 0})


In [10]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for testing. random_state fixes the partition so
# the train/test membership is the same on every run.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [11]:
train_df.shape

(36000, 5)

## Vectorizing the Summaries

In [12]:
# Learn a unigram TF-IDF vocabulary from the training summaries and encode
# them; X is the resulting sparse document-term matrix (one row per summary).
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(train_df["summary"])

In [13]:
X.shape


(36000, 7799)

## Auto-Encoder Model

In [14]:
import tensorflow as tf
import tensorflow.keras as keras

In [15]:
# `tf.test.is_gpu_available()` is deprecated (see the warning it printed);
# list the visible GPU devices instead and reduce the result to a boolean so
# the cell still displays True/False.
len(tf.config.list_physical_devices('GPU')) > 0

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [16]:
import time
import os
def get_board_path(name: str = "") -> str:
    """Build a timestamped TensorBoard log directory path.

    Args:
        name: Optional label appended after the timestamp.

    Returns:
        ``./tensorboard/<UTC timestamp>_<name>`` joined with the platform's
        path separator. The directory is not created here.
    """
    # Dashes separate hours/minutes/seconds: a colon is not a legal character
    # in a Windows path component, so the former "%H:%M:%S" stamp produced an
    # unusable directory name there.
    stamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.gmtime())
    return os.path.join(".", "tensorboard", stamp + "_" + name)

In [17]:
# Layer widths of the dense stack: they narrow to 300 units in the middle and
# widen again symmetrically.
hidden_units = [3000, 1500, 700, 300, 700, 1500, 3000]
n_features = X.shape[1]

summaries_auto_encoder = keras.models.Sequential()

# Input stage: normalise the TF-IDF features, then apply light dropout.
for layer in (
    keras.layers.InputLayer(n_features),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
):
    summaries_auto_encoder.add(layer)

# Hidden stages: Dense(relu) -> BatchNorm -> Dropout for every width.
for units in hidden_units:
    for layer in (
        keras.layers.Dense(units, activation="relu"),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
    ):
        summaries_auto_encoder.add(layer)

# Output stage: project back to the input dimensionality.
summaries_auto_encoder.add(keras.layers.Dense(n_features, activation="relu"))

In [18]:
summaries_auto_encoder.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization (BatchNo (None, 7799)              31196     
_________________________________________________________________
dropout (Dropout)            (None, 7799)              0         
_________________________________________________________________
dense (Dense)                (None, 2000)              15600000  
_________________________________________________________________
batch_normalization_1 (Batch (None, 2000)              8000      
_________________________________________________________________
dropout_1 (Dropout)          (None, 2000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              2001000   
_________________________________________________________________
batch_normalization_2 (Batch (None, 1000)              4

In [19]:
# Optimise mean-squared error with Nadam; the same quantity is also reported
# as a metric.
summaries_auto_encoder.compile(
    optimizer=keras.optimizers.Nadam(),
    loss=keras.losses.MSE,
    metrics=[keras.metrics.mse],
)

In [20]:
# Write the model's weights, as they are before any fit() call in this
# notebook, to an HDF5 file.
summaries_auto_encoder.save_weights("init_weights_summary_auto_encoder.h5")


In [21]:
# Stop after one epoch without improvement in EarlyStopping's default
# monitored quantity, and roll back to the best weights seen.
early_cb = keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True)
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=get_board_path())

# Densify the sparse TF-IDF matrix once and reuse it as both input and
# target. Calling X.todense() twice built two full dense copies, and
# todense() returns the legacy np.matrix type; toarray() gives an ndarray.
X_dense = X.toarray()

summaries_auto_encoder.fit(
    x=X_dense,
    y=X_dense,
    batch_size=16,
    epochs=5,
    callbacks=[early_cb, tensorboard_cb],
    validation_split=0.1,
    shuffle=True,
)

Epoch 1/5
Epoch 2/5


<tensorflow.python.keras.callbacks.History at 0x7f36ba34beb0>