In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv


# Rating prediction
Summary: predict how many stars a rating would get based on the review 

## get and preprocess data


In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv')
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [4]:
# change range of rating from 1-5 to 0-4, its easier for the algo
data['Rating'] = data['Rating'] - 1

In [5]:
import tensorflow as tf

In [6]:
dataset = tf.data.Dataset.from_tensor_slices((data.Review, data.Rating))

In [7]:
for X_batch, y_batch in dataset.batch(2).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Rating", label)
        print()

Review: nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room roo ...
Rating 3

Review: ok nothing special charge diamond member hilton decided chain shot 20th anniversary seattle, start booked suite paid extra website description not, suite bedroom bathroom standard hotel room, took pri ...
Rating 1



In [8]:
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

In [9]:
preprocess(X_batch, y_batch)


(<tf.Tensor: shape=(2, 46), dtype=string, numpy=
 array([[b'nice', b'hotel', b'expensive', b'parking', b'got', b'good',
         b'deal', b'stay', b'hotel', b'anniversary', b'arrived', b'late',
         b'evening', b'took', b'advice', b'previous', b'reviews', b'did',
         b'valet', b'parking', b'check', b'quick', b'easy', b'little',
         b'disappointed', b'non', b'existent', b'view', b'room', b'room',
         b'clean', b'nice', b'size', b'bed', b'comfortable', b'woke',
         b'stiff', b'neck', b'high', b'pillows', b'not', b'soundproof',
         b'like', b'heard', b'music', b'roo'],
        [b'ok', b'nothing', b'special', b'charge', b'diamond', b'member',
         b'hilton', b'decided', b'chain', b'shot', b'th', b'anniversary',
         b'seattle', b'start', b'booked', b'suite', b'paid', b'extra',
         b'website', b'description', b'not', b'suite', b'bedroom',
         b'bathroom', b'standard', b'hotel', b'room', b'took', b'printed',
         b'reservation', b'desk', b's

In [10]:
from collections import Counter

# get a vocabulary based on the data
vocabulary = Counter()
for X_batch, y_batch in dataset.batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

In [11]:
vocabulary.most_common()[:3]


[(b'<pad>', 164657), (b'hotel', 28283), (b'room', 17288)]

In [12]:
vocab_size = 10000
truncated_vocabulary = [
    word for word, count in vocabulary.most_common()[:vocab_size]]

In [13]:
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [14]:
def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch

train_set = dataset.repeat().batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [15]:
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[ 11   1 199 ...   0   0   0]
 [119 138 262 ...   0   0   0]
 [ 11  10   4 ...   0   0   0]
 ...
 [  9   1   4 ...   0   0   0]
 [  9 152 328 ...   0   0   0]
 [  3   7 199 ...   0   0   0]], shape=(32, 49), dtype=int64)
tf.Tensor([3 1 2 4 4 4 4 3 4 4 1 3 3 2 3 0 1 4 4 2 4 4 3 4 1 2 3 2 3 3 3 3], shape=(32,), dtype=int64)


## Train Model

### Model 1 (Custom GRU)

model one is a non pre trained GRU model that needs a solid amount of preprocessing to be used, but you get good results.

In [17]:
from tensorflow import keras

embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(5, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, steps_per_epoch=len(data) // 32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Model 2 (Pre-trained Model)

This model doesnt need as much preprocessing but the performance is not as good as the custom model

In [18]:
import tensorflow_hub as hub

model = keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
                   dtype=tf.string, input_shape=[], output_shape=[50]),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(5, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])

In [None]:
import tensorflow_datasets as tfds


X_train = data['Review']
y_train = data['Rating']
history = model.fit(X_train,y_train, epochs=5)

Epoch 1/5