Text Classification with Deep Learning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, LSTM, Bidirectional
from tensorflow.keras.initializers import Constant

In [None]:
# Read the restaurant reviews CSV into a pandas DataFrame.
reviews_csv_path = r"/content/sample_data/restaurant_reviews_az.csv"
reviews = pd.read_csv(reviews_csv_path)

In [None]:
# Remove 3-star reviews.
# The explicit .copy() turns the filtered result into an independent DataFrame, so the
# column assignment below writes to a real frame instead of a slice of the original
# (which is what raises pandas' SettingWithCopyWarning).
reviews = reviews[reviews['stars'] != 3].copy()

# Create a new column 'Sentiment': 1 for reviews rated above 3 stars, 0 otherwise.
# A vectorized comparison yields the same 0/1 values as a row-wise apply().
reviews['Sentiment'] = (reviews['stars'] > 3).astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['Sentiment'] = reviews['stars'].apply(lambda x: 1 if x > 3 else 0)


In [None]:
# Data Processing and Splitting
# Raw review text is the model input; the binary 'Sentiment' column is the target.
text = reviews['text'].values
labels = reviews['Sentiment'].values

# Hold out 20% of the reviews; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    labels,
    test_size=0.2,
    random_state=42,
)

# The vocabulary is fitted on the training text only; both splits are then
# converted to sequences of integer word ids with that same tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Bring every sequence to exactly 120 ids, padding and truncating at the end.
pad_kwargs = dict(maxlen=120, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_kwargs)
X_test_pad = pad_sequences(X_test_seq, **pad_kwargs)

# One more than the number of distinct words seen in training
# (presumably leaving id 0 free for padding -- confirm against the Tokenizer docs).
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Preparing GloVe embedding matrix
# Step 1: parse the GloVe text file into a {word: vector} lookup. On every line the
# first whitespace-separated token is the word and the remaining tokens are its
# coefficients.
embeddings_index = {}
with open('/content/sample_data/glove.6B.100d.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Step 2: build a (vocab_size, 100) matrix whose row i holds the GloVe vector of the
# word with tokenizer id i. Words missing from GloVe keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(word)
    if glove_vector is None:
        continue
    embedding_matrix[i] = glove_vector


In [None]:
# Sanity check: the matrix was allocated as (vocab_size, 100), i.e. one 100-d row per token id.
embedding_matrix.shape

(33420, 100)

In [None]:
# GRU Model with Pre-trained GloVe Embedding
#
# Architecture: frozen GloVe embedding -> GRU(64) -> Dense(24, relu) -> Dense(1, sigmoid).
#
# mask_zero=True: the input sequences are post-padded with id 0 up to length 120.
# Without a mask the GRU keeps updating its state over the trailing padding, so the
# final state passed to the Dense layers is computed after the padding steps rather
# than at the last real word. Masking makes the recurrent layer skip those timesteps.
# Id 0 is reserved for padding by the Keras Tokenizer (hence vocab_size being
# len(word_index) + 1), so no real token is masked. Parameter counts are unchanged.
model_gru_glove = Sequential([
    Embedding(vocab_size, 100, embeddings_initializer=Constant(embedding_matrix),
              input_length=120, trainable=False, mask_zero=True),
    GRU(units=64),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_gru_glove.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_gru_glove.summary()

# NOTE: the held-out test split doubles as validation data here, so the per-epoch
# val_* metrics are measured on the same data used for final evaluation.
history_gru_glove = model_gru_glove.fit(X_train_pad, y_train, epochs=20, validation_data=(X_test_pad, y_test))

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 120, 100)          3342000   
                                                                 
 gru_4 (GRU)                 (None, 64)                31872     
                                                                 
 dense_6 (Dense)             (None, 24)                1560      
                                                                 
 dense_7 (Dense)             (None, 1)                 25        
                                                                 
Total params: 3375457 (12.88 MB)
Trainable params: 33457 (130.69 KB)
Non-trainable params: 3342000 (12.75 MB)
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 1

In [None]:
# LSTM Model with Pre-trained GloVe Embedding
#
# Architecture: frozen GloVe embedding -> LSTM(64) -> Dense(24, relu) -> Dense(1, sigmoid).
#
# mask_zero=True: the input sequences are post-padded with id 0 up to length 120.
# Without a mask the LSTM keeps updating its state over the trailing padding, so the
# final state passed to the Dense layers is computed after the padding steps rather
# than at the last real word. Masking makes the recurrent layer skip those timesteps.
# Id 0 is reserved for padding by the Keras Tokenizer (hence vocab_size being
# len(word_index) + 1), so no real token is masked. Parameter counts are unchanged.
model_lstm_glove = Sequential([
    Embedding(vocab_size, 100, embeddings_initializer=Constant(embedding_matrix),
              input_length=120, trainable=False, mask_zero=True),
    LSTM(units=64),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_lstm_glove.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_lstm_glove.summary()

# NOTE: the held-out test split doubles as validation data here, so the per-epoch
# val_* metrics are measured on the same data used for final evaluation.
history_lstm_glove = model_lstm_glove.fit(X_train_pad, y_train, epochs=20, validation_data=(X_test_pad, y_test))

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 120, 100)          3342000   
                                                                 
 lstm_2 (LSTM)               (None, 64)                42240     
                                                                 
 dense_10 (Dense)            (None, 24)                1560      
                                                                 
 dense_11 (Dense)            (None, 1)                 25        
                                                                 
Total params: 3385825 (12.92 MB)
Trainable params: 43825 (171.19 KB)
Non-trainable params: 3342000 (12.75 MB)
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 1

In [None]:
# GRU Model with Trainable Embeddings
#
# Architecture: embedding learned from scratch -> GRU(64) -> Dense(24, relu) -> Dense(1, sigmoid).
#
# mask_zero=True: the input sequences are post-padded with id 0 up to length 120.
# Without a mask the GRU keeps updating its state over the trailing padding, so the
# final state passed to the Dense layers is computed after the padding steps rather
# than at the last real word. Masking makes the recurrent layer skip those timesteps.
# Id 0 is reserved for padding by the Keras Tokenizer (hence vocab_size being
# len(word_index) + 1), so no real token is masked. Parameter counts are unchanged.
model_gru = Sequential([
    Embedding(vocab_size, 100, input_length=120, mask_zero=True),
    GRU(units=64),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_gru.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_gru.summary()

# NOTE: the held-out test split doubles as validation data here, so the per-epoch
# val_* metrics are measured on the same data used for final evaluation.
history_gru = model_gru.fit(X_train_pad, y_train, epochs=5, validation_data=(X_test_pad, y_test))

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 120, 100)          3342000   
                                                                 
 gru_6 (GRU)                 (None, 64)                31872     
                                                                 
 dense_14 (Dense)            (None, 24)                1560      
                                                                 
 dense_15 (Dense)            (None, 1)                 25        
                                                                 
Total params: 3375457 (12.88 MB)
Trainable params: 3375457 (12.88 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
