In [135]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

[nltk_data] Downloading package stopwords to /home/kali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [136]:
#Implement a Transformer block as a layer
class TransformerBlock(layers.Layer):
    """One Transformer encoder block: self-attention, then a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual addition
    and layer normalization, in that order.

    Args:
        embed_dim: Size of the last axis of the input. It is used as the
            attention layer's `key_dim` and as the output width of the
            feed-forward network, so both residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) layer of the feed-forward network.
        rate: Dropout rate used after attention and after the feed-forward net.
        **kwargs: Standard `layers.Layer` arguments (e.g. `name`, `dtype`),
            forwarded unchanged to the base class.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        Args:
            inputs: Tensor whose last axis has size `embed_dim`; it is used as
                both query and value of the attention layer (self-attention).
            training: Forwarded to both dropout layers. Defaults to None so the
                block can be called without the flag, leaving the decision to
                Keras.

        Returns:
            A tensor with the same shape as `inputs`.
        """
        # Self-attention sub-layer with residual connection.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Position-wise feed-forward sub-layer with residual connection.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
#Implement embedding layer
class TokenAndPositionEmbedding(layers.Layer):
    """Embed token ids and add a learned embedding of each token's position.

    Args:
        maxlen: Number of distinct positions the position table can hold.
        vocab_size: Number of distinct token ids the token table can hold.
        embed_dim: Width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return token embeddings of `x` plus the embeddings of positions 0..len-1."""
        # The sequence length is read from the last axis at run time.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        # The position vectors broadcast over the leading axes of the token vectors.
        return token_vectors + position_vectors
# Clean and prepare the dataset
vocab_size = 20000  # size of the token-id space, including the padding and OOV ids
maxlen = 100  # every review is padded/truncated to this many token ids
OOV_ID = 1  # id for tokens outside the training vocabulary (0 is the padding id)

text_data = pd.read_csv("/home/kali/Desktop/tripadvisor_hotel_reviews.csv")
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)  # constant-time membership tests inside the token loop

# (pattern, replacement) pairs, applied in order. URLs, mentions and hashtags
# are removed first so that stripping digits cannot leave fragments of them
# behind; whitespace (including newlines) is collapsed last because every
# earlier step inserts spaces.
CLEANING_STEPS = [
    (r"https*\S+", " "),
    (r"@\S+", " "),
    (r"#\S+", " "),
    (r"\d", " "),
    (r",(?!\s+\d$)", ""),
    (r"\s+", " "),
]


def review_to_tokens(review):
    """Clean one review and return its tokens with stop words removed.

    The text is lower-cased before matching because NLTK's English stop-word
    list is lower-case.
    NOTE(review): `word_tokenize` needs NLTK's Punkt tokenizer data, while the
    import cell only downloads 'stopwords' -- confirm it is installed on a
    fresh machine.
    """
    cleaned = str(review).lower()
    for pattern, replacement in CLEANING_STEPS:
        cleaned = re.sub(pattern, replacement, cleaned)
    return [tok for tok in word_tokenize(cleaned) if tok not in stop_word_set]


# One token list per review; a missing review becomes an empty list instead of
# crashing the regex substitutions.
review_tokens = [review_to_tokens(r) for r in text_data["Review"].fillna("")]
# Flat list of every kept token, kept under its original name in case other
# cells read it.
filtered_text = [tok for tokens in review_tokens for tok in tokens]

# Split row indices (not the data) so tokens and ratings stay aligned, and fix
# the seed so the split is the same on every run.
ratings = np.array(text_data['Rating'])
train_idx, val_idx = train_test_split(
    np.arange(len(review_tokens)), test_size=0.20, random_state=42
)
y_train, y_val = ratings[train_idx], ratings[val_idx]

# Build the vocabulary from the training reviews only, so nothing about the
# validation set leaks into the model's inputs.
token_counts = {}
for idx in train_idx:
    for tok in review_tokens[idx]:
        token_counts[tok] = token_counts.get(tok, 0) + 1
# Ids 0 and 1 are reserved, so at most vocab_size - 2 tokens get their own id
# and every id stays below vocab_size (the embedding table size).
most_common = sorted(token_counts, key=token_counts.get, reverse=True)[: vocab_size - 2]
token_to_id = {tok: token_id for token_id, tok in enumerate(most_common, start=2)}


def encode_tokens(tokens):
    """Map tokens to integer ids, using OOV_ID for tokens not in the vocabulary."""
    return [token_to_id.get(tok, OOV_ID) for tok in tokens]


x_train = [encode_tokens(review_tokens[idx]) for idx in train_idx]
x_val = [encode_tokens(review_tokens[idx]) for idx in val_idx]
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)

# Sanity checks on what the model will receive.
assert x_train.shape == (len(y_train), maxlen)
assert x_val.shape == (len(y_val), maxlen)
assert max(x_train.max(), x_val.max()) < vocab_size

16392 Training sequences
4099 Validation sequences


In [137]:
# Create the classifier model using the transformer layer
embed_dim = 64  # width of each token embedding
num_heads = 3  # number of attention heads
ff_dim = 64  # hidden width of the feed-forward network inside the transformer block

# The model is trained with sparse_categorical_crossentropy on the integer
# values of the 'Rating' column, so every label must be a valid index into the
# softmax output. Size the output layer from the labels instead of hard-coding
# it; if the labels start at 1, index 0 is simply never used. The result is
# never smaller than the previous fixed width of 2.
num_classes = max(2, int(max(np.max(y_train), np.max(y_val))) + 1)

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Average over the sequence axis to get one vector per review.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)

In [138]:
# Train the model and evaluate it on the validation split after every epoch
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# `history` keeps the per-epoch training and validation metrics.
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=3,
    validation_data=(x_val, y_val),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
