<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/NLP-Projects/blob/main/SentimentAnalysis-Pretrained-Word2Vec-Weights/sentiment_analysis_word2vec_embd_weights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!pip install -q gensim

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import numpy as np
import tensorflow as tf
import gensim.downloader as api
import pandas as pd
import requests
import zipfile
import io
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout

# Download & Load Dataset

In [6]:
print("Downloading Dataset...")
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"
# Bound the request time and fail loudly on an HTTP error; without this an
# error page would be handed to ZipFile and surface as a confusing BadZipFile.
r = requests.get(url, timeout=60)
r.raise_for_status()
# The context manager closes the in-memory archive once the data is loaded.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    # Keep an extracted copy of every archive member in the working directory.
    z.extractall()
    with z.open('sentiment labelled sentences/imdb_labelled.txt') as f:
        # Read as CSV (tab separated). quoting=3 is csv.QUOTE_NONE, so quote
        # characters inside a review are kept as literal text.
        df = pd.read_csv(f, sep='\t', names=['sentence', 'label'], quoting=3)
print("Dataset Downloaded!")

Downloading Dataset...
Dataset Downloaded!


In [8]:
# Display data samples
print(f"Shape: {df.shape}")
# Print the first review found for each label value (1, then 0).
for heading, target in (("Sample Positive Review:", 1), ("Sample Negative Review:", 0)):
    print(heading)
    print(df.loc[df['label'] == target, 'sentence'].iloc[0])

Shape: (1000, 2)
Sample Positive Review:
The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  
Sample Negative Review:
A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  


In [9]:
# Pull the review texts and their labels out of the frame as two parallel
# arrays, which is the form the preprocessing and training cells consume.
sentences, labels = (df[column].values for column in ('sentence', 'label'))

# Preprocessing

In [10]:
# Tokenization & Padding
print("Preprocessing....")

# Sizes shared by the tokenizer, the embedding matrix and the model.
MAX_VOCAB_SIZE, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = 5000, 50, 300

# Fit the vocabulary on the corpus; words outside it map to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)

# Encode every sentence as a list of integer ids, then pad/truncate at the
# end so that all rows have exactly MAX_SEQUENCE_LENGTH entries.
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="post",
    truncating="post",
)

# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")
print(f"Data shape after padding: {padded_sequences.shape}")

Preprocessing....
Found 3134 unique tokens.
Data shape after padding: (1000, 50)


# Load Pre-trained Word2Vec

In [11]:
print("Loading Google Word2Vec...")
# Load the 300-dimensional Google News vectors through gensim's downloader.
# NOTE(review): presumably downloaded on first use and cached afterwards - confirm.
w2v_model = api.load(name="word2vec-google-news-300")
print("Word2Vec Loaded!")

Loading Google Word2Vec...
Word2Vec Loaded!


# Creating Embedding Matrix

In [12]:
print("Injecting knowledge....")
# One row per token id the model can receive. Keras Tokenizer ids start at 1
# (0 is the padding value used by pad_sequences), hence the +1; ids at or
# above MAX_VOCAB_SIZE are skipped below, so no rows are allocated for them.
num_words = min(MAX_VOCAB_SIZE, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

hits = 0
misses = 0

for word, i in word_index.items():
    # Ignore words whose id falls outside the vocabulary cap.
    if i >= MAX_VOCAB_SIZE:
        continue

    if word in w2v_model:
        # Copy the pre-trained vector into the row for this token id.
        embedding_matrix[i] = w2v_model[word]
        hits += 1
    else:
        # No pre-trained vector: the row stays all zeros.
        misses += 1

# Guard the percentage: with no in-vocabulary words the original division
# raised ZeroDivisionError instead of reporting 0%.
considered = hits + misses
coverage = hits / considered * 100 if considered else 0.0
print(f"Converted {hits} words ({coverage:.1f}%)")
print(f"Missed {misses} words (will be learned from scratch or ignored)")

Injecting knowledge....
Converted 2872 words (91.6%)
Missed 262 words (will be learned from scratch or ignored)


# Build & Train Keras Model

In [13]:
print("Training Model....")
model = Sequential([
    # Declaring the input up front replaces the deprecated `input_length`
    # argument and builds the model immediately, so summary() can report
    # output shapes and parameter counts.
    tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32"),
    Embedding(
        input_dim=num_words,
        output_dim=EMBEDDING_DIM,
        # Inject the Word2Vec weights through the documented initializer
        # argument instead of the legacy `weights=` constructor kwarg.
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        # Keep the pre-trained vectors frozen during training.
        trainable=False,
    ),
    # Average the token vectors of a sequence into one fixed-size vector.
    GlobalAveragePooling1D(),
    Dense(24, activation="relu"),
    Dropout(0.5),
    # Single sigmoid unit: one score in [0, 1] for the binary label.
    Dense(1, activation="sigmoid"),
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

Training Model....




In [14]:
# Train
# A fraction of the data is held out through validation_split so that
# validation loss/accuracy are reported next to the training metrics.
history = model.fit(
    x=padded_sequences, y=labels,
    validation_split=0.2,
    batch_size=32, epochs=30,
    verbose=1,
)


Epoch 1/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5137 - loss: 0.6933 - val_accuracy: 0.4700 - val_loss: 0.6910
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5526 - loss: 0.6889 - val_accuracy: 0.5200 - val_loss: 0.6870
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6189 - loss: 0.6817 - val_accuracy: 0.5450 - val_loss: 0.6808
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6327 - loss: 0.6791 - val_accuracy: 0.6850 - val_loss: 0.6714
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6633 - loss: 0.6679 - val_accuracy: 0.5800 - val_loss: 0.6682
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6966 - loss: 0.6538 - val_accuracy: 0.7300 - val_loss: 0.6545
Epoch 7/30
[1m25/25[0m [32m━━━━━━━━━

# Testing the Model

In [15]:
print("Testing....")

def predict_sentiment(text):
    """Print the model's sentiment verdict for a single piece of text.

    The text is tokenized and padded with the same tokenizer and settings as
    the training data, scored by the model, and reported as POSITIVE when the
    score is above 0.5 and NEGATIVE otherwise. Nothing is returned.
    """
    # Process the new text exactly like training data
    encoded = pad_sequences(
        tokenizer.texts_to_sequences([text]),
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
        truncating='post',
    )
    # predict() yields one row per input with a single score column.
    score = model.predict(encoded, verbose=0)[0][0]

    if score > 0.5:
        label = "POSITIVE"
    else:
        label = "NEGATIVE"
    print(f"Text: '{text}'")
    print(f"Sentiment: {label} ({score:.4f})\n")

# Try the classifier on a clearly positive, a clearly negative, an ambiguous
# and a mixed review, in that order.
for review in (
    "This movie was a masterpiece and I loved every second.",
    "Total waste of time, do not watch this garbage.",
    "It was okay, not great but not terrible.",  # Ambiguous
    "The actor was good but the script was boring.",  # Mixed
):
    predict_sentiment(review)

Testing....
Text: 'This movie was a masterpiece and I loved every second.'
Sentiment: POSITIVE (0.8173)

Text: 'Total waste of time, do not watch this garbage.'
Sentiment: NEGATIVE (0.1525)

Text: 'It was okay, not great but not terrible.'
Sentiment: NEGATIVE (0.2765)

Text: 'The actor was good but the script was boring.'
Sentiment: NEGATIVE (0.2899)

