<a href="https://colab.research.google.com/github/nazaninzareirad/Amazon-Review-Analysis/blob/NLP/Q2_P3_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Download the dataset
import gdown

google_drive_url = 'https://drive.google.com/file/d/1-F7f9oSjFx0Si44DvdkzFhjjwfro_Y1p/view?usp=sharing'
file_id = google_drive_url.split("/")[-2]
output = 'train_data.csv'
gdown.download(f'https://drive.google.com/uc?id={file_id}', output, quiet=False)
df = pd.read_csv(output, low_memory=False)
df.columns

Downloading...
From: https://drive.google.com/uc?id=1-F7f9oSjFx0Si44DvdkzFhjjwfro_Y1p
To: /content/train_data.csv
100%|██████████| 635M/635M [00:08<00:00, 74.0MB/s]


Index(['overall', 'vote', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime'],
      dtype='object')

In [3]:
df['overall'] = df['overall'].map({1: 0, 2: 1, 3: 2, 4: 3, 5: 4})

# Fill missing values
df['summary'] = df['summary'].fillna('')
df['reviewText'] = df['reviewText'].fillna('')

df['full_text'] = df['summary'] + " " + df['reviewText']

# Split training and validation sets
y= df['overall']
X = df['full_text']

In [4]:
from nltk.stem import PorterStemmer
import re
stem = PorterStemmer()
def clean_text(text):
  text = re.sub(r"[-()\"#/@;:<>{}=~|.?,]", "", text)
  text = stem.stem(text)
  return text

df['cleaned'] = df['full_text'].apply(clean_text)

In [5]:

X_train, X_val, y_train, y_val = train_test_split(df['cleaned'], y, test_size=0.2, random_state=42)

In [6]:
naive_bayes_model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

naive_bayes_model.fit(X_train, y_train)

# Evaluate on the validation set
y_pred_nb = naive_bayes_model.predict(X_val)

accuracy_nb = accuracy_score(y_val, y_pred_nb)
classification_report_nb = classification_report(y_val, y_pred_nb)

print(f"Naive Bayes Accuracy: {accuracy_nb}")
print("Classification Report:")
print(classification_report_nb)

Naive Bayes Accuracy: 0.6462342585032392
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.70      0.58     16572
           1       0.37      0.07      0.12     11347
           2       0.36      0.20      0.26     16109
           3       0.41      0.46      0.44     31292
           4       0.79      0.85      0.82     92469

    accuracy                           0.65    167789
   macro avg       0.49      0.46      0.44    167789
weighted avg       0.62      0.65      0.62    167789



In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


Y1h = pd.get_dummies(y)
X_train, X_val, y_train, y_val = train_test_split(df['cleaned'], Y1h, test_size=0.2, random_state=42)
print(y_val)

        0  1  2  3  4
479970  0  0  0  1  0
529489  0  0  0  0  1
380130  0  0  0  0  1
568335  0  0  0  0  1
355122  0  0  0  0  1
...    .. .. .. .. ..
784600  0  0  1  0  0
263140  0  0  1  0  0
458471  0  0  0  1  0
485214  0  0  0  0  1
618922  0  1  0  0  0

[167789 rows x 5 columns]


In [8]:
# Tokenize
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
vocabulary_size = len(word_index) + 1

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

# Pad sequences for consistent length
max_sequence_length = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_val_padded = pad_sequences(X_val_seq, maxlen=max_sequence_length)

In [9]:
!wget https://nlp.stanford.edu/data/glove.42B.300d.zip

--2023-11-12 15:14:25--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip [following]
--2023-11-12 15:14:26--  https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1877800501 (1.7G) [application/zip]
Saving to: ‘glove.42B.300d.zip’


2023-11-12 15:20:21 (5.05 MB/s) - ‘glove.42B.300d.zip’ saved [1877800501/1877800501]



In [10]:
!unzip /content/glove.42B.300d.zip

Archive:  /content/glove.42B.300d.zip
  inflating: glove.42B.300d.txt      


In [11]:
from tqdm.notebook import tqdm
embeddings_index = {}
f = open('/content/glove.42B.300d.txt', encoding = 'utf8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

0it [00:00, ?it/s]

In [12]:
word_index = tokenizer.word_index
vocabulary_size = len(word_index) + 1
embedding_matrix = np.random.random((vocabulary_size + 1, 300))

for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

  0%|          | 0/422019 [00:00<?, ?it/s]

In [13]:
from keras.layers import Dense, LSTM, Embedding, Dropout, Activation, GRU, Bidirectional, Input
from keras.models import Model

embedding_layer = Embedding(vocabulary_size + 1, 300, weights = [embedding_matrix], input_length = 100, trainable = True)

# embedding layer is the first layer, it is followed by a gru layer and finally a regular dense output layer
sequence_input = Input(shape = (100,), dtype = 'float64')
embedded_sequences = embedding_layer(sequence_input)
l_gru = Bidirectional(GRU(256))(embedded_sequences)
preds = Dense(5, activation='softmax')(l_gru)
model = Model(sequence_input, preds)
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
model.fit(X_train_padded, y_train, validation_data = (X_val_padded, y_val), epochs = 8, batch_size = 64)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.src.callbacks.History at 0x7e6ff01a57b0>