In [2]:
# Task 09
# NLP 
# Sentiment Analysis with RNN

In [3]:
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, Input, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed


In [4]:
# Load the dataset
# Read 'swiggy.csv' from the working directory and list its column names so
# the cells below can refer to them.
data = pd.read_csv('swiggy.csv')
column_names = list(data.columns)
print("Columns in the dataset:", column_names, sep="\n")


Columns in the dataset:
['ID', 'Area', 'City', 'Restaurant Price', 'Avg Rating', 'Total Rating', 'Food Item', 'Food Type', 'Delivery Time', 'Review']


In [5]:
# Text cleaning and sentiment labelling
# Reviews are lower-cased and stripped of every character other than a-z, 0-9
# and whitespace. A row is labelled positive (1) when its 'Avg Rating' is
# above 3.5 and negative (0) otherwise; rows with a missing value in any
# column are then discarded.
# NOTE(review): the label is derived from 'Avg Rating', not from the review
# text itself — confirm that this column reflects the sentiment of each review.
normalized_reviews = (
    data["Review"]
    .str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
)
is_positive = data['Avg Rating'] > 3.5

data = data.assign(
    Review=normalized_reviews,
    sentiment=is_positive.astype('int64'),
).dropna()


In [6]:
# Tokenization and padding
# The tokenizer is created with num_words=max_features and fitted on the
# cleaned reviews; each review is then encoded as a sequence of word indices
# and passed through pad_sequences with maxlen=max_length.
# NOTE(review): the tokenizer is fitted on all reviews, i.e. before the
# train/test split performed in the next cell.
max_features = 5000
max_length = 200

review_texts = data["Review"]
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(review_texts)

encoded_reviews = tokenizer.texts_to_sequences(review_texts)
X = pad_sequences(encoded_reviews, maxlen=max_length)
y = data['sentiment'].to_numpy()


In [7]:
# Data splitting
# Hold out 20% of the samples as the test set, then set aside 10% of the
# remaining samples for validation. Both splits are stratified on the labels
# and use a fixed random_state.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.1,
    random_state=42,
    stratify=y_train_full,
)


In [8]:
# Build RNN model
# Seed Python, NumPy and TensorFlow before any weights are created so that
# initialisation and batch shuffling (and therefore the reported metrics)
# are repeatable from one run to the next.
set_random_seed(42)

model = Sequential([
    # The input shape is declared with an explicit Input layer; the Embedding
    # `input_length` argument is deprecated in current Keras releases.
    Input(shape=(max_length,)),
    # Maps each of the max_features word indices to a 16-dimensional vector.
    Embedding(input_dim=max_features, output_dim=16),
    # Only the final hidden state is passed on (return_sequences=False).
    SimpleRNN(64, activation='tanh', return_sequences=False),
    # Single sigmoid unit: output is read as the probability of class 1.
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)




In [9]:
# Train and evaluate the model
# Fit for 5 epochs with a batch size of 32, passing the validation split so
# validation metrics are reported alongside the training metrics.
training_options = {
    "epochs": 5,
    "batch_size": 32,
    "validation_data": (X_val, y_val),
    "verbose": 1,
}
history = model.fit(X_train, y_train, **training_options)

# Evaluate silently on the held-out test split; index 1 of the result is the
# 'accuracy' metric requested when the model was compiled.
score = model.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {score[1]:.2f}")


Epoch 1/5
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 53ms/step - accuracy: 0.7076 - loss: 0.6032 - val_accuracy: 0.7156 - val_loss: 0.5977
Epoch 2/5
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 40ms/step - accuracy: 0.7132 - loss: 0.6013 - val_accuracy: 0.7156 - val_loss: 0.5986
Epoch 3/5
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 39ms/step - accuracy: 0.7251 - loss: 0.5907 - val_accuracy: 0.7156 - val_loss: 0.5968
Epoch 4/5
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 39ms/step - accuracy: 0.7076 - loss: 0.6030 - val_accuracy: 0.7156 - val_loss: 0.5964
Epoch 5/5
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 38ms/step - accuracy: 0.7228 - loss: 0.5908 - val_accuracy: 0.7156 - val_loss: 0.6002
Test accuracy: 0.72


In [15]:
# Predicting Sentiment
def predict_sentiment(review_text):
    """Classify a single review as positive or negative.

    The text receives the same normalisation as the training reviews
    (lower-casing, then dropping every character outside a-z, 0-9 and
    whitespace) before it is encoded with the fitted ``tokenizer`` and padded
    to ``max_length``.

    Args:
        review_text: Raw review text as a string.

    Returns:
        A string such as ``"Positive (Probability: 0.77)"`` or
        ``"Negative (Probability: 0.20)"``. The label is "Positive" when the
        model output is at least 0.5.
    """
    text = re.sub(r'[^a-z0-9\s]', '', review_text.lower())

    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_length)

    # verbose=0 stops Keras from printing a progress bar on every call, so the
    # helper's only output is its return value. The result is converted to a
    # plain Python float before comparing and formatting.
    prediction = float(model.predict(padded, verbose=0)[0][0])
    label = 'Positive' if prediction >= 0.5 else 'Negative'
    return f"{label} (Probability: {prediction:.2f})"
    
# Demo: run one hand-written, clearly negative review through the classifier
# and print both the input text and the predicted label.
# NOTE(review): in the recorded run this review came back as
# "Positive (Probability: 0.77)", so the trained model's predictions should be
# checked before they are relied on.
sample_review = "Worst food I’ve ever had. Totally disappointed."
print(f"Review: {sample_review}")
print(f"Sentiment: {predict_sentiment(sample_review)}")


Review: Worst food I’ve ever had. Totally disappointed.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
Sentiment: Positive (Probability: 0.77)


In [11]:
# Persist the trained model to 'sentiment_model.h5' in the working directory.
model_path = 'sentiment_model.h5'
model.save(model_path)




In [12]:
import pickle
# Pickle the fitted tokenizer next to the saved model so the same word index
# can be reloaded when the model is used later.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
