# Sentiment Analysis with Amazon Reviews

In [1]:
import os
import pandas as pd
import numpy as np
print(os.listdir("data/Amazon_Reviews"))

['train.ft.txt.bz2', 'test.ft.txt.bz2']


## Reading the text

In [2]:
import bz2

In [3]:
def get_labels_and_texts(file):
    """Read a bz2-compressed, fastText-style review file into labels and texts.

    Every non-blank line is expected to look like ``__label__<n> <review text>``.
    The number after the ``__label__`` prefix is shifted down by one, so label 1
    becomes 0 and label 2 becomes 1.

    Parameters
    ----------
    file : str or path-like
        Path to the ``.bz2`` file to read.

    Returns
    -------
    labels : numpy.ndarray
        One integer label per review.
    texts : list of str
        The review texts with surrounding whitespace stripped, in file order.

    Raises
    ------
    ValueError
        If the characters after the label prefix are not an integer.
    """
    prefix_len = len("__label__")  # 9 characters: where the label number starts
    labels = []
    texts = []
    # The context manager closes the compressed file even if parsing fails.
    with bz2.BZ2File(file) as handle:
        for raw_line in handle:
            line = raw_line.decode("utf-8")
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            # Split the label token from the review at the first space only.
            tag, _, review = line.partition(" ")
            labels.append(int(tag[prefix_len:]) - 1)
            texts.append(review.strip())
    return np.array(labels), texts
# Parse both compressed splits into label arrays and lists of raw review texts.
train_labels, train_texts = get_labels_and_texts(
    'data/Amazon_Reviews/train.ft.txt.bz2'
)
test_labels, test_texts = get_labels_and_texts(
    'data/Amazon_Reviews/test.ft.txt.bz2'
)

In [4]:
len(train_texts)

3600000

## Text Preprocessing

WHAT ARE STOPWORDS?

Stopwords are common English words that do not add much meaning to a sentence, so they can safely be ignored without sacrificing the meaning of the sentence. Examples include *the*, *he*, and *have*. NLTK already collects such words in a corpus named `stopwords`. We first download it to our Python environment.

In [5]:
import nltk
from nltk.corpus import stopwords
from string import punctuation
from bs4 import BeautifulSoup
import re

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/PHN-624_2024/lucky_kushwaha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Build the filter set used by remove_stopwords: NLTK's English stopwords plus
# every character of string.punctuation. set.update iterates the string and adds
# each character, so the imported `punctuation` name no longer has to be rebound
# to a list.
stop = set(stopwords.words('english'))
stop.update(punctuation)
stop

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

#### DATA CLEANING

In [8]:
def strip_html(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

# Removing text enclosed in square brackets
def remove_between_square_brackets(text):
    """Return `text` with every ``[...]`` span (brackets included) deleted."""
    # Raw string: '\[' is not a valid escape in a normal string literal.
    return re.sub(r'\[[^]]*\]', '', text)

# Removing URLs
def remove_urls(text):
    """Return `text` with every run of non-space characters starting at 'http' deleted."""
    return re.sub(r'http\S+', '', text)

# Removing the stopwords from text
def remove_stopwords(text, stop_words=None):
    """Drop whitespace-separated tokens whose lowercase form is a stopword.

    Parameters
    ----------
    text : str
        Text to filter.
    stop_words : collection of str, optional
        Tokens to drop. Defaults to the notebook-level `stop` set.

    Returns
    -------
    str
        The remaining tokens, in their original casing, joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop
    # str.split() already yields tokens without surrounding whitespace.
    return " ".join(
        token for token in text.split() if token.lower() not in stop_words
    )

# Removing the noisy text
def denoise_text(text):
    """Clean one review: strip HTML, bracketed spans, URLs, then stopwords."""
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text

In [10]:
# Run denoise_text over every training review; the cleaned list replaces train_texts.
train_texts = [denoise_text(review) for review in train_texts]

  soup = BeautifulSoup(text, "html.parser")


In [11]:
# Run denoise_text over every test review; the cleaned list replaces test_texts.
test_texts = [denoise_text(review) for review in test_texts]

  soup = BeautifulSoup(text, "html.parser")


# Train/Validation Split

In [25]:
from sklearn.model_selection import train_test_split
from keras.preprocessing import text, sequence
from tensorflow.keras import layers, models

In [13]:
# Hold out 20% of the training reviews for validation; the fixed seed makes the
# split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=57643892,
)

In [17]:
MAX_FEATURES = 12000
# The tokenizer is fitted on the training split only and then reused unchanged
# for the validation and test splits.
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_texts)
# Convert each split from strings to integer sequences, in the same order as before.
train_texts, val_texts, test_texts = (
    tokenizer.texts_to_sequences(split)
    for split in (train_texts, val_texts, test_texts)
)

In [19]:
# Pad every split to the length of the longest *training* sequence.
MAX_LENGTH = max(map(len, train_texts))
train_texts, val_texts, test_texts = (
    sequence.pad_sequences(split, maxlen=MAX_LENGTH)
    for split in (train_texts, val_texts, test_texts)
)

# Convolutional Neural Net Model
This CNN has an embedding with a dimension of 64, 3 convolutional layers with the first two having batch normalization and max pooling and the last with global max pooling. The results are then passed to a dense layer and then the output.

In [26]:
def build_cnn_model():
    """Build and compile the 1-D convolutional sentiment classifier.

    The network takes integer sequences of length MAX_LENGTH, embeds them into
    64 dimensions (vocabulary size MAX_FEATURES) and applies three Conv1D
    layers. The first two are each followed by batch normalization and max
    pooling; the third is followed by global max pooling. A 100-unit ReLU dense
    layer and a single sigmoid unit produce the prediction.

    Returns
    -------
    A Keras model compiled with the Adam optimizer, binary cross-entropy loss
    and accuracy as the reported metric.
    """
    inputs = layers.Input(shape=(MAX_LENGTH,))
    # Layers are created in the order they are applied.
    stack = [
        layers.Embedding(MAX_FEATURES, 64),
        layers.Conv1D(64, 3, activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPool1D(3),
        layers.Conv1D(64, 5, activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPool1D(5),
        layers.Conv1D(64, 5, activation='relu'),
        layers.GlobalMaxPool1D(),
        # NOTE(review): global pooling should already return one vector per
        # example, so this Flatten is likely a no-op. Kept to leave the
        # architecture untouched.
        layers.Flatten(),
        layers.Dense(100, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]
    outputs = inputs
    for layer in stack:
        outputs = layer(outputs)
    cnn = models.Model(inputs=inputs, outputs=outputs)
    cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return cnn

In [27]:
model = build_cnn_model()

In [28]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 158)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 158, 64)           768000    
                                                                 
 conv1d_3 (Conv1D)           (None, 156, 64)           12352     
                                                                 
 batch_normalization_2 (Bat  (None, 156, 64)           256       
 chNormalization)                                                
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 52, 64)            0         
 g1D)                                                            
                                                                 
 conv1d_4 (Conv1D)           (None, 48, 64)            20544 

In [30]:
# Train the CNN for two epochs in batches of 128, scoring the held-out
# validation split after each epoch. The call is left as the cell's last
# expression, so the returned History object is displayed.
model.fit(
    x=train_texts,
    y=train_labels,
    batch_size=128,
    epochs=2,
    validation_data=(val_texts, val_labels),
)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7fb5e36da6d0>

In [32]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [34]:
preds = model.predict(test_texts)
# Threshold the sigmoid outputs at 0.5 once; multiplying by 1 turns the boolean
# mask into 0/1 labels. ROC AUC is computed on the raw scores instead.
cnn_pred_labels = 1 * (preds > 0.5)
print('Accuracy score: {:0.4}'.format(accuracy_score(test_labels, cnn_pred_labels)))
print('F1 score: {:0.4}'.format(f1_score(test_labels, cnn_pred_labels)))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, preds)))

Accuracy score: 0.8994
F1 score: 0.8996
ROC AUC score: 0.9652


# Recurrent Neural Net Model

In [53]:
def build_rnn_model():
    """Build and compile the LSTM sentiment classifier.

    The network takes integer sequences of length MAX_LENGTH, embeds them into
    64 dimensions (vocabulary size MAX_FEATURES) and feeds them to a single
    128-unit LSTM that returns only its final output. A 32-unit ReLU dense
    layer and a single sigmoid unit produce the prediction.

    Returns
    -------
    A Keras model compiled with the Adam optimizer, binary cross-entropy loss
    and accuracy as the reported metric.
    """
    inputs = layers.Input(shape=(MAX_LENGTH,))
    # Layers are created in the order they are applied.
    stack = [
        layers.Embedding(MAX_FEATURES, 64),
        layers.LSTM(128, return_sequences=False),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]
    outputs = inputs
    for layer in stack:
        outputs = layer(outputs)
    rnn = models.Model(inputs=inputs, outputs=outputs)
    rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return rnn
    
rnn_model = build_rnn_model()

In [54]:
rnn_model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 158)]             0         
                                                                 
 embedding_8 (Embedding)     (None, 158, 64)           768000    
                                                                 
 lstm_6 (LSTM)               (None, 128)               98816     
                                                                 
 dense_18 (Dense)            (None, 32)                4128      
                                                                 
 dense_19 (Dense)            (None, 1)                 33        
                                                                 
Total params: 870977 (3.32 MB)
Trainable params: 870977 (3.32 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [55]:
# Train the LSTM model for a single epoch in batches of 128, scoring the
# held-out validation split at the end of the epoch. The History object
# returned by fit is kept in `history`.
history = rnn_model.fit(
    x=train_texts,
    y=train_labels,
    batch_size=128,
    epochs=1,
    validation_data=(val_texts, val_labels),
)



In [62]:
# history.history

In [63]:
preds = rnn_model.predict(test_texts)
# Threshold the sigmoid outputs at 0.5 once; multiplying by 1 turns the boolean
# mask into 0/1 labels. ROC AUC is computed on the raw scores instead.
rnn_pred_labels = 1 * (preds > 0.5)
print('Accuracy score: {:0.4}'.format(accuracy_score(test_labels, rnn_pred_labels)))
print('F1 score: {:0.4}'.format(f1_score(test_labels, rnn_pred_labels)))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, preds)))

Accuracy score: 0.9245
F1 score: 0.925
ROC AUC score: 0.9778
