In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau

In [None]:
from google.colab import drive

# Mount Google Drive; the dataset CSV, the GloVe vectors and the exported
# artifacts used by the cells below all live under /content/drive/MyDrive.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Load the dataset
file_path = "/content/drive/MyDrive/450k_stars.csv"
df = pd.read_csv(file_path)

# ========================
# Step 1: Map sentiment to integer labels
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}
sentiment_codes = df['sentiment'].map(label_mapping)

# Series.map() turns every value that is not a key of label_mapping (typos,
# different casing, blanks) into NaN without complaint. Such rows would later
# be fed to the loss as labels, so stop here and name the offending values.
unmapped_rows = sentiment_codes.isna()
if unmapped_rows.any():
    unexpected_values = df.loc[unmapped_rows, 'sentiment'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows have a sentiment outside "
        f"{sorted(label_mapping)}; examples: {unexpected_values[:10]}"
    )

# Every row mapped cleanly, so the labels can be stored as plain integers.
df['sentiment'] = sentiment_codes.astype('int64')

# ========================
# Step 2: Tokenize the reviews
max_words = 20000  # Limit the vocabulary size
max_sequence_length = 200  # Max sequence length to pad/truncate

# The Keras Tokenizer calls string methods on every text, so a missing (NaN)
# or numeric review would abort with an AttributeError. Normalise to str
# first; a missing review becomes an empty string, i.e. an all-padding row.
review_texts = df['reviews'].fillna("").astype(str)

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(review_texts)

# Convert reviews to integer sequences, then pad/truncate them to one length
review_sequences = tokenizer.texts_to_sequences(review_texts)
X = pad_sequences(review_sequences, maxlen=max_sequence_length)

# Extract labels
y = df['sentiment'].values

# Save the fitted tokenizer as JSON next to the notebook (a later cell copies
# tokenizer.json to Drive). The encoding is explicit so the file does not
# depend on the platform default.
with open("tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer.to_json())
# ========================
# Step 3: Optional sample weights (downweight 2★ and 4★)
def compute_sample_weight(row):
    """Return the training weight for one review row.

    Args:
        row: Anything indexable by the key ``'rating'`` (a DataFrame row when
            used through ``df.apply(..., axis=1)``).

    Returns:
        0.7 when the rating equals 2 or 4 (the down-weighted star ratings),
        otherwise 1.0.
    """
    rating = row['rating']
    is_downweighted = rating == 2 or rating == 4
    if is_downweighted:
        return 0.7
    return 1.0

# Vectorised equivalent of applying compute_sample_weight to every row: one
# boolean mask over the 'rating' column instead of one Python call per row.
# Ratings 2 and 4 get weight 0.7, everything else 1.0 (float64 array).
sample_weights = np.where(df['rating'].isin([2, 4]), 0.7, 1.0)

# ========================
# Step 4: Train/val/test split
# 80% of the rows go to training; the remaining 20% is then halved into
# validation and test (10% each). Features, labels and sample weights are
# split together so they stay aligned row by row.
# Stratifying on the labels keeps the class proportions identical in all three
# splits, so validation/test metrics do not depend on a lucky random draw.
X_train, X_temp, y_train, y_temp, sw_train, sw_temp = train_test_split(
    X, y, sample_weights, test_size=0.2, random_state=42, stratify=y)

X_val, X_test, y_val, y_test, sw_val, sw_test = train_test_split(
    X_temp, y_temp, sw_temp, test_size=0.5, random_state=42, stratify=y_temp)

# ========================
# Step 5: Load GloVe embeddings
glove_path = "/content/drive/MyDrive/glove.6B.100d.txt"
embedding_dim = 100  # Number of dimensions in each GloVe vector

# Build a dictionary mapping every GloVe word to its coefficient vector
embeddings_index = {}
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        # Split from the right so exactly `embedding_dim` numeric fields are
        # peeled off; whatever remains is the token, even if it contains
        # whitespace. Blank or truncated lines are skipped instead of raising.
        fields = line.rstrip().rsplit(' ', embedding_dim)
        if len(fields) != embedding_dim + 1:
            continue
        word = fields[0]
        embeddings_index[word] = np.asarray(fields[1:], dtype='float32')

word_index = tokenizer.word_index  # Vocabulary learned by the tokenizer
# Bound the table by max_words (the Tokenizer's num_words) instead of repeating
# the literal 20000, so changing the vocabulary size in Step 2 cannot leave
# the embedding table out of sync with the indices the tokenizer emits.
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i >= num_words:
        continue  # index outside the embedding table
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Use the GloVe embedding; words without one keep their all-zero row
        embedding_matrix[i] = embedding_vector


# ========================
# Step 6: Build the LSTM model
# Seed TensorFlow so weight initialisation and dropout masks are repeatable
# from run to run (GPU kernels may still add small non-determinism).
tf.random.set_seed(42)

model = Sequential([
    # An explicit Input replaces Embedding(input_length=200): that argument is
    # deprecated in Keras 3, and using max_sequence_length keeps the model in
    # step with the length pad_sequences produced in Step 2.
    tf.keras.Input(shape=(max_sequence_length,), dtype="int32"),
    # Embedding table initialised from the GloVe matrix and fine-tuned
    # during training (trainable=True).
    Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True,
    ),
    # First recurrent layer returns the full sequence so the second
    # recurrent layer can consume it.
    Bidirectional(LSTM(64, return_sequences=True)),
    BatchNormalization(),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation="relu"),
    # Three output units: one per sentiment class (0, 1, 2).
    Dense(3, activation="softmax")
])

# Labels are integer class ids, hence the sparse variant of cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# ========================
# Step 7: Train the model
# restore_best_weights=True rolls the model back to the epoch with the highest
# val_accuracy. Without it, the weights that are evaluated and saved afterwards
# are simply those of the last epoch run, which can be up to `patience` epochs
# past the best one.
early_stopping = EarlyStopping(
    monitor='val_accuracy', patience=4, mode='max', restore_best_weights=True)
# Cut the learning rate by 5x after 2 epochs without val_loss improvement
# (val_loss is the Keras default monitor, spelled out here for clarity).
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)

# sample_weight only affects the training loss; validation_data carries no
# weights, so validation metrics are computed unweighted.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    sample_weight=sw_train,
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# ========================
# Step 8: Evaluate
# evaluate() returns the loss followed by the metrics passed to compile(),
# here a single accuracy value. The test set is scored without sample weights.
test_scores = model.evaluate(X_test, y_test, verbose=1)
loss, accuracy = test_scores
print(f"✅ Final Test Accuracy: {accuracy:.2f}")

# ========================
# Step 9: Save model
# Relative path: the file lands in the current working directory. A later cell
# copies it from /content/sentiment_LSTM_model.h5 to Drive, so keep the name.
model_filename = "sentiment_LSTM_model.h5"
model.save(model_filename)





Epoch 1/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 34ms/step - accuracy: 0.6300 - loss: 0.6969 - val_accuracy: 0.7874 - val_loss: 0.5258 - learning_rate: 0.0010
Epoch 2/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 34ms/step - accuracy: 0.8107 - loss: 0.4125 - val_accuracy: 0.8403 - val_loss: 0.4110 - learning_rate: 0.0010
Epoch 3/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 34ms/step - accuracy: 0.8630 - loss: 0.3102 - val_accuracy: 0.8741 - val_loss: 0.3381 - learning_rate: 0.0010
Epoch 4/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 34ms/step - accuracy: 0.8963 - loss: 0.2421 - val_accuracy: 0.8922 - val_loss: 0.2987 - learning_rate: 0.0010
Epoch 5/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 34ms/step - accuracy: 0.9183 - loss: 0.1950 - val_accuracy: 0.9081 - val_loss: 0.2655 - learning_rate: 0.0010
Epoch 6/10
[1m5625/5625[0m [32m━━━━━━━━━━━



✅ Final Test Accuracy: 0.93


In [None]:
import os
import shutil

# Define the source files
source_files = ["/content/sentiment_LSTM_model.h5", "/content/tokenizer.json"]

# Define the destination directory in Google Drive
destination_dir = "/content/drive/MyDrive/"


def copy_artifact(src_path, dst_dir):
    """Copy one file into ``dst_dir`` and report the outcome on stdout.

    The copy is best-effort: problems are printed rather than raised so the
    remaining files are still attempted.

    Args:
        src_path: Path of the file to copy.
        dst_dir: Existing directory that receives the copy.

    Returns:
        True when the file was copied, False when the source file is missing,
        the destination directory is missing, or the copy itself failed.
    """
    if not os.path.isfile(src_path):
        print(f"Error: {src_path} not found.")
        return False
    # Checked separately from the source: a missing destination (e.g. Drive
    # not mounted) also surfaces as FileNotFoundError from shutil.copy and
    # would otherwise be reported as if the source file were missing.
    if not os.path.isdir(dst_dir):
        print(f"Error: destination directory {dst_dir} not found.")
        return False
    try:
        shutil.copy(src_path, dst_dir)
    except OSError as e:
        print(f"An error occurred while copying {src_path}: {e}")
        return False
    print(f"Copied {src_path} to {dst_dir}")
    return True


# Copy each file to the destination directory. The loop variable is not named
# `file_path` so it does not overwrite the dataset path defined earlier.
for artifact_path in source_files:
    copy_artifact(artifact_path, destination_dir)

Copied /content/sentiment_LSTM_model.h5 to /content/drive/MyDrive/
Copied /content/tokenizer.json to /content/drive/MyDrive/
