1. Load & Prepare Data

In [1]:
import pandas as pd

# Load the balanced, lemmatized review dataset.
df = pd.read_csv("data/cleaned_dataset/balanced_data_lemmatized.csv")

# Empty text cells are read back from CSV as NaN, and `astype(str)` would turn
# them into the literal token "nan". Replace them with an empty string first
# so a spurious "nan" word never reaches the tokenizer vocabulary.
X = df["lemmatized_text"].fillna("").astype(str)
y = df["Score"]

# Map every distinct label to a contiguous integer id (0 .. num_classes - 1),
# ordered by sorted label value, and encode the targets with it.
label_to_int = {v: i for i, v in enumerate(sorted(y.unique()))}
y_int = y.map(label_to_int).values
num_classes = len(label_to_int)


2. Tokenize and Pad the Text


In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Limits shared by the tokenizer, the padding step and the model.
max_words = 10000  # value passed to Tokenizer(num_words=...)
max_len = 100      # fixed sequence length (in tokens) after padding

# Fit the vocabulary; words outside it are mapped to the "<OOV>" token.
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before the
# train/validation split below — confirm this is intended.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X)

# Turn each text into a list of word ids, then pad/truncate every list to
# `max_len`. "pre" is the library default for both options, spelled out here.
sequences = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(
    sequences,
    maxlen=max_len,
    padding="pre",
    truncating="pre",
)




3. Train-Test Split


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. Stratifying on the encoded
# labels keeps class proportions the same in both parts, and the fixed
# random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_pad,
    y_int,
    test_size=0.2,
    stratify=y_int,
    random_state=42,
)


4. Build and Train the Deep Learning Model

In [4]:
from tensorflow import keras

# Seed the Python, NumPy and backend random generators so weight
# initialisation and batch shuffling are repeatable (the data split is
# already seeded with the same value).
keras.utils.set_random_seed(42)

# Averaged-embedding classifier:
#   token ids -> 64-d embeddings -> mean over the sequence -> dense -> softmax
model = keras.Sequential([
    # An explicit Input layer replaces the deprecated `input_length` argument
    # of Embedding and builds the model right away, so `model.summary()` can
    # report output shapes and parameter counts before training starts.
    keras.Input(shape=(max_len,), dtype="int32"),
    keras.layers.Embedding(input_dim=max_words, output_dim=64),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(num_classes, activation="softmax"),
])

# The sparse loss takes integer class ids directly (no one-hot encoding).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# Train, reporting loss/accuracy on the held-out validation split each epoch.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32
)




Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2729 - loss: 1.5751 - val_accuracy: 0.3585 - val_loss: 1.4902
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4563 - loss: 1.3114 - val_accuracy: 0.5025 - val_loss: 1.1823
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5831 - loss: 1.0396 - val_accuracy: 0.5485 - val_loss: 1.0676
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6596 - loss: 0.8881 - val_accuracy: 0.5880 - val_loss: 1.0084
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7283 - loss: 0.7426 - val_accuracy: 0.6065 - val_loss: 1.0102
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7686 - loss: 0.6428 - val_accuracy: 0.6345 - val_loss: 0.9548
Epoch 7/10
[1m250/250[0m 

5. Save Model, Tokenizer, and Label Mapping

In [5]:
import joblib
import pickle
import json

# Persist the trained network; the ".h5" extension selects the HDF5 format.
model.save("deep_model_balanced.h5")

# Pickle the fitted tokenizer so inference can reuse the same vocabulary.
with open("tokenizer_balanced.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Cast the label keys to built-in int before serialising.
label_to_int_cleaned = dict(
    (int(label), class_id) for label, class_id in label_to_int.items()
)

# Write the label -> class-id mapping as JSON (object keys are stored as
# strings in the resulting file).
with open("label_to_int.json", "w") as mapping_file:
    mapping_file.write(json.dumps(label_to_int_cleaned))





6. Test Loading and Predict Once

In [6]:
# Imported here as well so this cell does not depend on the imports of the
# saving cell.
import json
import pickle

import numpy as np
from tensorflow import keras
# Same import path as the one used when the training data was padded.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Reload the three saved artifacts: model, tokenizer and label mapping.
dl_model = keras.models.load_model("deep_model_balanced.h5")
# SECURITY: pickle.load can execute arbitrary code; only load tokenizer
# pickles that come from a trusted source (here: written by this notebook).
with open("tokenizer_balanced.pkl", "rb") as f:
    dl_tokenizer = pickle.load(f)
with open("label_to_int.json", "r") as f:
    label_to_int = json.load(f)
# Invert the mapping to class id -> label. JSON object keys are strings, so
# the labels come back as str.
int_to_label = {i: v for v, i in label_to_int.items()}

# Predict example
# NOTE(review): training used the "lemmatized_text" column, while this sample
# is raw text — presumably the same preprocessing should be applied first;
# confirm.
sample_text = ["This product was excellent!"]
seq = dl_tokenizer.texts_to_sequences(sample_text)
pad = pad_sequences(seq, maxlen=max_len)
pred = dl_model.predict(pad)
# Take the argmax along the class axis to get one class id per sample.
# Without `axis` the (n_samples, n_classes) array is flattened, which is only
# correct when there is exactly one sample.
pred_classes = np.argmax(pred, axis=1)
pred_class = int(pred_classes[0])
print("Predicted Score:", int_to_label[pred_class])




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Predicted Score: 5
