<a href="https://colab.research.google.com/github/madhura2024/deep_learning/blob/main/review_gru.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================
# IMPORTS
# ===============================
import nltk
# Fetch the NLTK resources this script relies on: tokenizer models for
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
# Newer NLTK releases load the 'punkt_tab' tables instead of the pickled
# 'punkt' models, so both are requested to keep word_tokenize working
# regardless of the installed NLTK version.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from textblob import TextBlob

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# ===============================
# LOAD DATASET
# ===============================
# Dataset columns: content, score (1–5)
data = pd.read_csv("app_reviews.csv")

# Rows with a missing review text or rating cannot be used: pandas represents
# a missing `content` as a float NaN, which would make the later `.lower()`
# and TextBlob calls raise, and a missing `score` cannot be one-hot encoded.
# Dropping them here keeps the text and the labels aligned row-for-row.
data = data.dropna(subset=['content', 'score']).reset_index(drop=True)

# Guard against a column that pandas parsed with mixed types (e.g. a review
# consisting only of digits) so every entry is a real string downstream.
data['content'] = data['content'].astype(str)

x = data['content']
y = data['score']

# ===============================
# TEXT PREPROCESSING
# ===============================
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _normalise_review(raw_text):
    """Return *raw_text* cleaned for the tokenizer.

    The text is lower-cased, every non-letter character is replaced by a
    space, runs of whitespace are collapsed, stop words are removed and the
    remaining tokens are lemmatized and re-joined with single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text.lower())
    collapsed = re.sub('\\s+', ' ', letters_only).strip()
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(collapsed)
        if token not in stop_words
    ]
    return ' '.join(kept_tokens)


# One cleaned string per review, in the same order as `x`.
cleaned_corpus = [_normalise_review(statement) for statement in x]

# ===============================
# TEXTBLOB SENTIMENT ANALYSIS
# ===============================
# Score each raw (uncleaned) review with TextBlob and keep the polarity as an
# extra column on the frame.
polarity_scores = data['content'].map(
    lambda review_text: TextBlob(review_text).sentiment.polarity
)
data['sentiment_score'] = polarity_scores

# ===============================
# TOKENIZATION & PADDING
# ===============================
# max_features caps the tokenizer vocabulary (num_words) and is reused as the
# embedding's input_dim; max_len is the fixed length every sequence is padded to.
max_features, max_len = 10000, 100

# Build the word index from the cleaned reviews; words outside the index are
# mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_features)
tokenizer.fit_on_texts(cleaned_corpus)

# Turn each cleaned review into a list of word ids, then pad at the front so
# all rows share the same length.
sequences = tokenizer.texts_to_sequences(cleaned_corpus)
x_pad = pad_sequences(sequences, padding='pre', maxlen=max_len)

# ===============================
# ONE HOT ENCODING OF SCORES
# ===============================
# Ratings are 1–5; shift them to the 0–4 class ids that to_categorical expects
# and expand each into a 5-wide one-hot row.
y = to_categorical(y - 1, num_classes=5)

# ===============================
# TRAIN TEST SPLIT
# ===============================
# Hold out 20% of the padded sequences (and their labels) for the final
# evaluation; the fixed random_state makes the split reproducible.
split_parts = train_test_split(x_pad, y, random_state=42, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

# ===============================
# GRU MODEL
# ===============================
# Embedding -> GRU -> softmax over the five rating classes.
model = Sequential([
    # `input_length` is deliberately not passed: it is deprecated in Keras 3
    # (the layer ignores it and emits a warning) and is not required by
    # earlier versions, since the sequence length is taken from the data.
    Embedding(input_dim=max_features, output_dim=128),
    GRU(64),
    Dense(5, activation='softmax'),
])

# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# ===============================
# MODEL TRAINING
# ===============================
# Training settings; validation_split=0.2 asks Keras to set aside that
# fraction of the training data for the per-epoch validation metrics that are
# plotted further down.
fit_options = dict(epochs=20, batch_size=32, validation_split=0.2)
history = model.fit(x_train, y_train, **fit_options)

# ===============================
# MODEL EVALUATION
# ===============================
# Evaluate on the held-out test split; the model was compiled with a single
# metric, so evaluate() yields the loss followed by the accuracy.
test_metrics = model.evaluate(x_test, y_test)
loss, accuracy = test_metrics
for caption, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
    print(caption, value)

# ===============================
# ACCURACY GRAPH
# ===============================
# Draw the per-epoch training and validation accuracy curves on one figure.
accuracy_curves = (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
)
for history_key, curve_label in accuracy_curves:
    plt.plot(history.history[history_key], label=curve_label)

plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# ===============================
# USER INPUT PREDICTION
# ===============================
def predict_review(review=None):
    """Predict a 1–5 rating for a single app review and print the result.

    Args:
        review: The review text to score. When omitted (the default), the
            text is read interactively with ``input``.

    The function prints the original review, the rating predicted by the
    trained model and the TextBlob polarity of the raw text. It returns
    nothing.
    """
    if review is None:
        review = input("\nEnter app review: ")

    # Apply the same cleaning steps that were used to build the training
    # corpus: lower-case, keep letters only, collapse whitespace.
    review_clean = review.lower()
    review_clean = re.sub('[^a-zA-Z]', ' ', review_clean)
    review_clean = re.sub('\\s+', ' ', review_clean).strip()

    # Tokenize with word_tokenize, exactly as the training corpus was.
    # str.split can keep a token whole that word_tokenize breaks apart
    # (e.g. "gonna"), which would feed the model words it never saw.
    words = [
        lemmatizer.lemmatize(w)
        for w in word_tokenize(review_clean)
        if w not in stop_words
    ]

    cleaned_text = " ".join(words)

    seq = tokenizer.texts_to_sequences([cleaned_text])
    pad = pad_sequences(seq, maxlen=max_len, padding='pre')

    # A single review is predicted, so the first row holds the five class
    # probabilities; class ids are 0–4, hence the +1 to get back to 1–5.
    prediction = model.predict(pad)
    rating = int(np.argmax(prediction[0])) + 1

    # Sentiment is computed on the raw text, not the cleaned version.
    sentiment = TextBlob(review).sentiment.polarity

    print("\nOriginal Review:", review)
    print("Predicted Rating (1–5):", rating)
    print("TextBlob Sentiment Score:", sentiment)

# ===============================
# TEST WITH USER INPUT
# ===============================
# Prompt for and score two reviews in a row.
for _attempt in range(2):
    predict_review()
