In [1]:
# ================================
# 1. IMPORT LIBRARIES
# ================================
import pandas as pd
import numpy as np
import re
import pickle

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# ================================
# 2. LOAD DATASET
# ================================
df = pd.read_csv("Reviews.csv")

# Keep only required columns
df = df[['Text', 'Score']]

# Remove null values
df.dropna(inplace=True)

# ================================
# 3. REDUCE DATA SIZE (FAST TRAINING)
# ================================
df = df.sample(50000, random_state=42)

print("Dataset shape:", df.shape)

# ================================
# 4. TEXT PREPROCESSING
# ================================
def text_preprocessing(text):
    
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['Clean_Text'] = df['Text'].apply(text_preprocessing)

# ================================
# 5. TARGET VARIABLE (CRITICALITY)
# ================================
y = df['Score'] / 5.0   # Normalize (0–1)

# ================================
# 6. TOKENIZATION
# ================================
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['Clean_Text'])

sequences = tokenizer.texts_to_sequences(df['Clean_Text'])

# ================================
# 7. PADDING (SHORTER FOR SPEED)
# ================================
max_len = 50

X = pad_sequences(sequences, maxlen=max_len)

print("Input shape:", X.shape)

# ================================
# 8. TRAIN TEST SPLIT
# ================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# ================================
# 9. BUILD OPTIMIZED LSTM MODEL
# ================================
model = Sequential()

model.add(Embedding(
    input_dim=10000,
    output_dim=64,
    input_length=max_len
))

model.add(LSTM(32))   # Reduced units → faster

model.add(Dense(16, activation='relu'))

model.add(Dense(1, activation='sigmoid'))  # Criticality output

model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

model.summary()

# ================================
# 10. TRAIN MODEL (FAST CONFIG)
# ================================
history = model.fit(
    X_train,
    y_train,
    epochs=3,          # Reduced epochs
    batch_size=128,    # Faster training
    validation_data=(X_test, y_test)
)

# ================================
# 11. SAVE MODEL
# ================================
model.save("criticality_model.h5")

# Save tokenizer
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Model & Tokenizer Saved Successfully ✅")

# ================================
# 12. TEST PREDICTION
# ================================
sample = ["product damaged and support not responding"]

sample_clean = [text_preprocessing(t) for t in sample]

seq = tokenizer.texts_to_sequences(sample_clean)
pad = pad_sequences(seq, maxlen=max_len)

pred = model.predict(pad)[0][0] * 5

print("Predicted Criticality Score:", pred)


Dataset shape: (50000, 2)
Input shape: (50000, 50)




Epoch 1/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 41ms/step - loss: 0.0570 - mae: 0.1809 - val_loss: 0.0368 - val_mae: 0.1333
Epoch 2/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 37ms/step - loss: 0.0318 - mae: 0.1227 - val_loss: 0.0332 - val_mae: 0.1271
Epoch 3/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 36ms/step - loss: 0.0254 - mae: 0.1055 - val_loss: 0.0335 - val_mae: 0.1168




Model & Tokenizer Saved Successfully ✅
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260ms/step
Predicted Criticality Score: 2.0634224
