<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/NLP-Projects/blob/main/SentimentAnalysis-Pretrained-Word2Vec-Weights/sentiment_analysis_word2vec_embd_weights_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!pip install -q gensim

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Imports
import numpy as np
import pandas as pd
import tensorflow as tf
import gensim.downloader as api
import requests
import zipfile
import io
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout

# Sentiment Analysis Pipeline

In [9]:
class SentimentPipeline:
    """
    End-to-end sentiment-analysis pipeline that feeds pre-trained Word2Vec
    vectors into a frozen Keras Embedding layer (transfer learning).

    The stages are individual methods chained in order by `run()`:
    load_data -> preprocess_data -> load_embeddings ->
    create_embedding_matrix -> build_model -> train.

    `config` is indexed with string keys; the keys read by this class are:
        'max_vocab_size'  - Tokenizer `num_words` and upper bound on embedding rows
        'max_seq_length'  - length every sequence is padded/truncated to
        'embedding_dim'   - width of the embedding matrix (must match the
                            dimensionality of the loaded word vectors)
        'epochs', 'batch_size' - forwarded to `model.fit`
    """

    # Location of the UCI "Sentiment Labelled Sentences" archive and the
    # member file inside it that this pipeline trains on.
    DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"
    DATA_MEMBER = "sentiment labelled sentences/amazon_cells_labelled.txt"
    # Seconds to wait on the download before giving up instead of hanging.
    REQUEST_TIMEOUT = 30

    def __init__(self, config):
        """
        Store the configuration and declare every attribute that later
        stages populate, so the object's full state is visible up front.
        """
        self.config = config
        self.tokenizer = None
        self.word_index = None        # set by preprocess_data
        self.embedding_matrix = None  # set by create_embedding_matrix
        self.model = None             # set by build_model
        self.w2v_model = None         # set by load_embeddings
        self.history = None           # set by train

        # Data
        self.sentences = None
        self.labels = None
        self.padded_sequences = None
        print("Pipeline Initialized")

    def load_data(self):
        """
        Download the UCI archive and load the Amazon reviews file into
        `self.sentences` / `self.labels`.

        Raises:
            Exception: any download, HTTP-status, zip or parsing error is
                reported on stdout and then re-raised, so the pipeline stops
                here instead of failing later on missing data.
        """
        print("[1/6] Downloading and Loading Data...")

        try:
            r = requests.get(self.DATA_URL, timeout=self.REQUEST_TIMEOUT)
            # Fail on 4xx/5xx instead of handing an error page to ZipFile.
            r.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                with z.open(self.DATA_MEMBER) as f:
                    # quoting=3 is csv.QUOTE_NONE: quote characters in the
                    # review text are kept as ordinary characters.
                    df = pd.read_csv(f, sep='\t', names=['sentence', 'label'], quoting=3)
            self.sentences = df['sentence'].values
            self.labels = df['label'].values
            print(f"Data Loaded successfully. Shape: {df.shape}")
        except Exception as e:
            print(f"Error loading data: {e}")
            # Re-raise: continuing with sentences/labels left as None only
            # produces a confusing error in a later stage.
            raise


    def preprocess_data(self):
        """
        Fit a Tokenizer on `self.sentences`, convert the sentences to integer
        sequences and pad/truncate them (at the end) to 'max_seq_length'.

        Populates `self.tokenizer`, `self.padded_sequences` and
        `self.word_index`.
        """
        print("[2/6] Preprocessing Data...")
        self.tokenizer = Tokenizer(num_words=self.config['max_vocab_size'], oov_token="<OOV>")
        self.tokenizer.fit_on_texts(self.sentences)

        # Convert to sequences
        sequences = self.tokenizer.texts_to_sequences(self.sentences)

        # Pad sequences
        self.padded_sequences = pad_sequences(
            sequences,
            maxlen=self.config['max_seq_length'],
            padding='post', truncating='post')

        self.word_index = self.tokenizer.word_index
        print(f"Preprocessing complete. Unique tokens: {len(self.word_index)}")


    def load_embeddings(self):
        """
        Load the 'word2vec-google-news-300' vectors through gensim's
        downloader into `self.w2v_model`.
        """
        print("[3/6] Loading Embeddings...")
        self.w2v_model = api.load('word2vec-google-news-300')
        print("Embeddings Loaded")


    def create_embedding_matrix(self):
        """
        Build the weight matrix for the Keras Embedding layer.

        Row `i` holds the pre-trained vector of the word whose tokenizer
        index is `i`; words missing from `self.w2v_model` (and row 0, which
        no word index maps to here) stay all-zero. Words whose index is
        >= 'max_vocab_size' are skipped entirely. Prints hit/miss counts and
        the share of in-range words that received a pre-trained vector.
        """
        print("[4/6] Creating Embedding Matrix...")
        num_words = min(self.config['max_vocab_size'], len(self.word_index) + 1)
        embedding_dim = self.config['embedding_dim']

        self.embedding_matrix = np.zeros((num_words, embedding_dim))
        hits, misses = 0, 0

        for word, i in self.word_index.items():
            if i >= self.config['max_vocab_size']:
                continue

            if word in self.w2v_model:
                self.embedding_matrix[i] = self.w2v_model[word]
                hits += 1
            else:
                misses += 1

        print(f"Matrix created. Hits: {hits}, Misses: {misses}")
        # Guard against an empty vocabulary, where hits + misses == 0.
        total = hits + misses
        transfer_rate = hits / total * 100 if total else 0.0
        print(f"Knowledge transfer rate: {transfer_rate:.2f}%")


    def build_model(self):
        """
        Define and compile the Keras model: frozen pre-trained embedding ->
        global average pooling -> Dense(24, relu) -> Dropout(0.5) ->
        Dense(1, sigmoid), trained with Adam on binary cross-entropy.
        """
        print("[5/6] Building Neural Network...")

        num_words = self.embedding_matrix.shape[0]

        self.model = Sequential([
            # Pre-trained Embedding Layer (Frozen)
            # NOTE(review): `input_length` is reported as deprecated by recent
            # Keras releases; kept as in the original - confirm against the
            # installed Keras version before removing.
            Embedding(
                input_dim=num_words,
                output_dim=self.config['embedding_dim'],
                input_length=self.config['max_seq_length'],
                weights=[self.embedding_matrix],
                trainable=False
            ),
            # NOTE(review): no mask is configured on the embedding, so padded
            # positions presumably contribute to this average - confirm.
            GlobalAveragePooling1D(),
            Dense(24, activation='relu'),
            Dropout(0.5),
            Dense(1, activation='sigmoid')
        ])

        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        self.model.summary()


    def train(self):
        """
        Fit the model on the padded sequences and labels, holding out 20% of
        the data for validation, and keep the result in `self.history`.
        """
        print("[6/6] Starting Training....")
        self.history = self.model.fit(
            self.padded_sequences,
            self.labels,
            epochs=self.config['epochs'],
            batch_size=self.config['batch_size'],
            validation_split=0.2,
            verbose=1
        )


    def run(self):
        """
        Execute the full training pipeline, one stage after another.
        """
        self.load_data()
        self.preprocess_data()
        self.load_embeddings()
        self.create_embedding_matrix()
        self.build_model()
        self.train()


    def predict(self, text):
        """
        Classify a single piece of text with the current model.

        Args:
            text: the sentence to score.

        Returns:
            (label, score) where `score` is the model output for the text and
            `label` is "POSITIVE" when score > 0.5, otherwise "NEGATIVE".
            Returns None (after printing an error) when no model exists yet.
        """
        if self.model is None:
            print("Error: Model is not trained yet.")
            return

        # Apply the same tokenization and padding used for the training data.
        seq = self.tokenizer.texts_to_sequences([text])
        pad = pad_sequences(seq, maxlen=self.config['max_seq_length'], padding='post', truncating='post')

        score = self.model.predict(pad, verbose=0)[0][0]
        label = "POSITIVE" if score > 0.5 else "NEGATIVE"

        return label, score

In [10]:
# Main Execution
# Hyper-parameters read by SentimentPipeline
config = {
    'max_vocab_size': 5000,
    'max_seq_length': 50,
    'embedding_dim': 300,
    'epochs': 20,
    'batch_size': 32
}

# Build the pipeline and execute every stage end to end
pipeline = SentimentPipeline(config)
pipeline.run()

# Try the trained model on a few hand-written sentences
print("\n" + "="*30)
print("TESTING THE MODEL")
print("="*30)

test_sentences = [
    "I absolutely loved this movie, it was fantastic!",
    "It was a complete waste of time and money.",
    "The acting was okay but the plot was boring.",
    "Highly recommended for everyone."
]

for sentence in test_sentences:
    # predict() yields the class label together with the raw model score
    prediction = pipeline.predict(sentence)
    label, score = prediction
    print(f"Text: '{sentence}'")
    print(f"Prediction: {label} ({score:.4f})\n")

Pipeline Initialized
[1/6] Downloading and Loading Data...
Data Loaded successfully. Shape: (1000, 2)
[2/6] Preprocessing Data...
Preprocessing complete. Unique tokens: 1879
[3/6] Loading Embeddings...
Embeddings Loaded
[4/6] Creating Embedding Matrix...
Matrix created. Hits: 1758, Misses: 121
Knowledge transfer rate: 93.56%
[5/6] Building Neural Network...




[6/6] Starting Training....
Epoch 1/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.5136 - loss: 0.6937 - val_accuracy: 0.6700 - val_loss: 0.6895
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.6189 - loss: 0.6876 - val_accuracy: 0.6600 - val_loss: 0.6854
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6628 - loss: 0.6814 - val_accuracy: 0.7050 - val_loss: 0.6794
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.6844 - loss: 0.6737 - val_accuracy: 0.6850 - val_loss: 0.6735
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.7169 - loss: 0.6625 - val_accuracy: 0.7200 - val_loss: 0.6646
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7115 - loss: 0.6536 - val_accuracy: 0.7050 - val_loss: 0.6570
Epoch 