In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.metrics import classification_report

# Load the data
data = pd.read_csv('Final Yelp Restaurant Reviews.csv')

# Preprocessing the data using tfidf
tfidf = TfidfVectorizer(max_features=1000)
X = tfidf.fit_transform(data['Review Text']).toarray()

# Converting ratings to categorical sentiment
conditions = [
    data['Rating'] <= 2,  # Negative
    data['Rating'] == 3,  # Neutral
    data['Rating'] >= 4   # Positive
]
choices = [0, 1, 2]
y = np.select(conditions, choices)
y = to_categorical(y, num_classes=3)

# Spliting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2

# learning rate decay function to adjust the learning rate over the course of training
def lr_time_based_decay(epoch, lr):
    decay = 0.1
    return lr * 1 / (1 + decay * epoch)

# Model architecture : neural network model
model = Sequential([
    Dense(128, activation='relu', input_dim=1000),
    Dropout(0.5),
    BatchNormalization(),
    Dense(64, activation='relu'), 
    Dropout(0.5),
    BatchNormalization(),
    Dense(3, activation='softmax')
])

# Compiling the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Learning rate scheduler
lr_scheduler = LearningRateScheduler(lr_time_based_decay)

# Training the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, 
                    validation_data=(X_val, y_val), 
                    callbacks=[early_stopping, lr_scheduler])

# Evaluating the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")


# Printing classification report for the test set
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
print(classification_report(y_test_labels, y_pred_labels, target_names=['Negative', 'Neutral', 'Positive']))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 86.26%
              precision    recall  f1-score   support

    Negative       0.71      0.72      0.72       516
     Neutral       0.50      0.21      0.30       401
    Positive       0.91      0.97      0.94      3063

    accuracy                           0.86      3980
   macro avg       0.71      0.64      0.65      3980
weighted avg       0.84      0.86      0.84      3980

