# Sentiment Analysis using Word2Vec + LSTM

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:

# Load and preprocess dataset
data = pd.read_csv('data/Reviews.csv')
data = data[['Text', 'Score']].dropna()

# Convert scores to sentiment labels
def to_sentiment(score):
    if score >= 4:
        return 1  # Positive
    elif score <= 2:
        return 0  # Negative
    else:
        return None

data['Sentiment'] = data['Score'].apply(to_sentiment)
data = data.dropna(subset=['Sentiment'])
data['Sentiment'] = data['Sentiment'].astype(int)

# Clean text
def clean_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text.lower())
    return text

data['Clean_Text'] = data['Text'].apply(clean_text)

# Word Tokens
data['Tokens'] = data['Clean_Text'].apply(word_tokenize)
"""
We first use nltk.word_tokenize() to split each sentence into a list of words like:

"This is good" → ["This", "is", "good"]

This gives us clean input for training a Word2Vec embedding model.
"""

In [None]:

# Train Word2Vec model
"""Word2Vec learns a dense vector representation (embedding) for each word based on context.
Each word becomes a 300-dimensional numeric vector capturing meaning (similar words have similar vectors).
The embeddings will later be used to initialize the LSTM model’s embedding layer."""
w2v_model = Word2Vec(sentences=data['Tokens'], vector_size=300, window=5, min_count=2, workers=4)
vocab = w2v_model.wv.key_to_index
embedding_dim = 300

# Tokenize text for LSTM
"""The LSTM model can’t take words directly — it needs integer-encoded sequences (like [5, 23, 67, 1, ...]).

The Keras Tokenizer builds a word-index mapping and converts each tokenized review into a list of word IDs.
"""
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Tokens'])
sequences = tokenizer.texts_to_sequences(data['Tokens'])
word_index = tokenizer.word_index

max_len = 300
X = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
y = data['Sentiment'].values


In [None]:

# Create embedding matrix from Word2Vec
"""We link the Word2Vec vectors to the corresponding token indices used by Keras.

The LSTM’s embedding layer will use this matrix so that each word ID gets the right 300-D vector from Word2Vec."""
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]


In [9]:

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights))
print("Class Weights:", class_weights)


Class Weights: {0: 3.2033491729872976, 1: 0.592477879845518}


## Summary for the model pipline

| Layer                    | Purpose                                     | Key Benefit                      |
| ------------------------ | ------------------------------------------- | -------------------------------- |
| **Embedding (Word2Vec)** | Map word indices to semantic vectors        | Leverage pretrained word meaning |
| **BiLSTM (64 units)**    | Learn sequence context from both directions | Understand full sentence meaning |
| **Dropout (0.4)**        | Regularization                              | Prevent overfitting              |
| **Dense (64, ReLU)**     | Learn nonlinear combinations                | Add model capacity               |
| **Dropout (0.3)**        | Regularization                              | Improve generalization           |
| **Dense (1, Sigmoid)**   | Output probability                          | Binary classification            |
| **Adam Optimizer**       | Efficient gradient updates                  | Fast & stable convergence        |
| **Binary Crossentropy**  | Measure prediction error                    | Ideal for binary targets         |
| **Accuracy Metric**      | Evaluate performance                        | Easy to interpret                |


In [None]:

# Build LSTM model
model = Sequential([
    Embedding(input_dim=len(word_index)+1, output_dim=embedding_dim, weights=[embedding_matrix],
              input_length=max_len, trainable=False),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()




In [11]:

# Train model
history = model.fit(
    X_train, y_train,
    epochs=8,
    batch_size=64,
    validation_split=0.2,
    class_weight=class_weights
)


Epoch 1/8
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 28ms/step - accuracy: 0.7290 - loss: 0.4972 - val_accuracy: 0.9004 - val_loss: 0.2495
Epoch 2/8
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 28ms/step - accuracy: 0.8804 - loss: 0.2852 - val_accuracy: 0.8949 - val_loss: 0.2541
Epoch 3/8
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 28ms/step - accuracy: 0.9057 - loss: 0.2309 - val_accuracy: 0.9110 - val_loss: 0.2281
Epoch 4/8
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 28ms/step - accuracy: 0.9208 - loss: 0.1972 - val_accuracy: 0.9432 - val_loss: 0.1509
Epoch 5/8
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 28ms/step - accuracy: 0.9292 - loss: 0.1783 - val_accuracy: 0.9330 - val_loss: 0.1823
Epoch 6/8
[1m5259/5259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 28ms/step - accuracy: 0.9355 - loss: 0.1626 - val_accuracy: 0.9415 - val_loss: 0.1588
Epoc

In [12]:

# Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.3f}")

# Check prediction distribution
preds = (model.predict(X_test) > 0.5).astype(int)
print(pd.Series(preds.flatten()).value_counts(normalize=True))


[1m3287/3287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 11ms/step - accuracy: 0.9467 - loss: 0.1403
Test Accuracy: 0.946
[1m3287/3287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 9ms/step
1    0.809676
0    0.190324
Name: proportion, dtype: float64


In [None]:
# Function to predict with the model
def predict_sentiment(text):
    # The model expects sequences of token indices, not vector sequences from preprocess_text
    # So, we need to use the tokenizer and pad_sequences as done for training data

    # Clean text
    cleaned_text = clean_text(text)
    # Tokenize text
    tokens = word_tokenize(cleaned_text)
    # Convert tokens to sequences
    sequence = tokenizer.texts_to_sequences([tokens])
    # Pad sequences
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

    prediction = model.predict(padded_sequence)
    print(prediction)
    if prediction[0][0] >0.5:
        return "Positive"
    else:
        return "Negative"

In [41]:
test_positive = predict_sentiment("The smell isn't bad at all and only mildly stings on sensitive areas, but it doesn't last long (60-120 seconds) and the 'burning' usually stops. As for the hair, it gets annihilated... I've never seen anything like it, it was effortless and clean-up is pretty easy all things considered. I'm in love with this product... Works for men too!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[[0.92120713]]


In [42]:
test_negative = predict_sentiment("""I used this product several years ago and it was quite effective so I decided to try again.

Unfortunately, this product is not as effective. Despite following the long list of directions and preparation of my skin, it took many times to get most of the hair removed and it still left some stranglers.

In addition, somehow this product is messier than before.

Formula may have been changed and I missed that part but it was not at expected.

Lastly, because of the multiple applications needed it took nearly half the container for one session.""")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[[0.01586174]]


In [38]:
test_positive, test_negative

('Positive', 'Negative')

In [None]:
# save the model, word2vec model abd tokenizer
if test_positive == 'Positive' and test_negative == 'Negative':
  print('Model saved')
  model.save('model/sentiment_analysis_model.h5')
  w2v_model.save('model/word2vec_model.bin')
  import pickle
  with open('model/tokenizer.pickle', 'wb') as handle:
      pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)



Model saved
