<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Sentiment%20Analysis%20-%20Sentiment140%20Dataset%20LSTM%20Model/Sentiment_Analysis_Sentiment140_Dataset_LSTM_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libs & Setup Enviorment

In [9]:
# Import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM ,Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
import matplotlib.pyplot as plt
import re

# Setup warnings
import warnings
warnings.filterwarnings('ignore')

# Setup logger
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.ERROR,
    handlers=[
        logging.FileHandler('app.log'),
        logging.StreamHandler()
    ])
logger = logging.getLogger(__name__)
logger.info("Logger started...")

# Download dataset
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kazanova/sentiment140")
print("Path to dataset files:", path)
logger.info(f"Dataset download to {path}")

Path to dataset files: /kaggle/input/sentiment140


# Define Parameters

In [12]:
DATASET_PATH = '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv'
VOCAB_SIZE = 15000          # Number of most frequent words (used as input_dim for Embedding)
EMBEDDING_DIM = 128         # Size of word vector (output_dim for Embedding)
MAX_LENGTH = 30             # Max tweet length (input_length for Embedding, timesteps for LSTM)
BATCH_SIZE = 256            # Number of samples per batch
EPOCHS = 15                 # Number of traning epochs
TRAIN_SAMPLES = 100000      # Limit training to 100,000 tweets for faster training

# Load & Preprocess Dataset

In [13]:
# Dataset contains: polarity(0 = negative, 4=positive), text (tweet text)
# Load only a subset for faster training
try:
    logger.info("Loading Sentiment140 Dataset...")
    data = pd.read_csv(DATASET_PATH,
                       encoding='latin-1',
                       names=['polarity', 'id', 'date', 'query', 'user', 'text']
                       ).sample(TRAIN_SAMPLES)
    logger.info(f"{TRAIN_SAMPLES} Loaded from dataset.")
except Exception as e:
    logger.error(f"Error loading dataset: {e}")

In [15]:
# Convert & correct labels of target
try:
    logger.info("Encoding target variables.")
    data['polarity'] = data['polarity'].map({
                                                0: 1,
                                                4: 1
                                                })
except Exception as e:
    logger.error(f"Error encoding target variables: {e}")
    raise

In [16]:
# Cleaning text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)     # remove mentions
    text = re.sub(r'#\w+', '', text)     # remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text.lower()
data['text_processed'] = data['text'].apply(clean_text)

# Spliting Train & Test Sets

In [20]:
try:
    logger.info("Splitting Train/Test.")
    train_size = int(0.8 * len(data))
    train_data = data[:train_size]
    test_data = data[train_size:]

    # train-test
    X_train = train_data['text_processed']
    y_train = train_data['polarity']
    X_test = test_data['text_processed']
    y_test = test_data['polarity']
    logger.info(f"Data Splitted: X_train={len(X_train)}\t y_train={len(y_train)}\t X_test={len(X_test)}\t y_test={len(y_test)}")
except Exception as e:
    logger.error(f"Error splitting data: {e}")


print(f"Training tweets: {len(X_train)}")
print(f"Testing tweets: {len(y_train)}")
print(f"y_train shape: {y_train.shape}")
print(f"y_train shape: {y_test.shape}")

Training tweets: 80000
Testing tweets: 80000
y_train shape: (80000,)
y_train shape: (20000,)


# Tokenization & Text-to-Seq conversion

In [31]:
try:
    logger.info("Converting words to IDs")
    # Convert words to token IDs using tensorflow Tokenizer
    tokenizer = Tokenizer(num_words=VOCAB_SIZE,
                        oov_token="<oov>")
    tokenizer.fit_on_texts(X_train)

    # Convert texts to integer sequences
    logger.info("Converting texts to integer sequences")
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    logger.info("Conversion successfully done.")
except Exception as e:
    logger.error(f"Error converting words to IDs: {e}")
    raise

In [32]:
# Show example
print(f"Sample tweet: {X_train.iloc[0]}")
print(f"Tokenized sequence: {X_train_seq[0]}")
# Each word is mapped to an ID between 1 and VOCAB_SIZE-1
# <OOV> (out-of-vocabulary) words are mapped to ID=1

# Explanation:
# - 80000/20000: number of tweets
# - 30: MAX_LENGTH
# Each element in x_train_padded is a token ID between 0 and VOCAB_SIZE-1 (14999)

Sample tweet:  you are probably very interesting in real life just a guess but ill give you the benefit of the doubt 
Tokenized sequence: [8, 34, 398, 110, 685, 11, 405, 176, 21, 5, 255, 20, 101, 316, 8, 4, 5474, 13, 4, 1196]


In [38]:
#  Padding sequences to uniform length
# All tweets should be of MAX_LENGTH
# If shorter, pad with 0s (padding='pre'); if longer, truncate from start (truncating='pre')
try:
    logger.info("Padding sequences to uniform length")
    X_train_padded = pad_sequences(X_train_seq,
                                maxlen=MAX_LENGTH,
                                padding='pre', truncating='pre')
    X_test_padded = pad_sequences(X_test_seq,
                                maxlen=MAX_LENGTH,
                                padding='pre', truncating='pre')
    logger.info("Padding successfully done.")
except Exception as e:
    logger.error(f"Error padding sequences: {e}")
    raise

# Check padded shapes
print(f"x_train_padded shape: {X_train_padded.shape}")  # e.g., (80000, 30)
print(f"x_test_padded shape: {X_test_padded.shape}")    # e.g., (20000, 30)

x_train_padded shape: (80000, 30)
x_test_padded shape: (20000, 30)


# Define Model

In [39]:
try:
    logger.info("Starting to build a LSTM Model.")
    model = Sequential([
        # Embedding:
        # Input: [batch_size, input_length] ==> e.g [256, 30]
        # Output: [batch_size, input_length, embedding_dim] ==> e.g. [256, 30, 128]
        Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH),

        # LSTM:
        # Input: [batch_size, timesteps, featuers] ==> e.g. [256, 30, 128]
        # Output: [batch_size, timesteps, units] = [256, 30, 100] (returen_sequences=True)
        LSTM(units=100, return_sequences=True, recurrent_dropout=0.2),
        Dropout(0.2),

        # LSTM 2:
        LSTM(units=100, recurrent_dropout=0.2),
        Dropout(0.2),

        # Dense:
        # for binary classification
        # Output: [batch_size, 1], activation='sigmoid'
        Dense(1, activation='sigmoid')
    ])

    logger.info("Model created successfully.")

    # Compile Model
    optimizer = tf.keras.optimizers.Adam(clipnorm=0.2)
    loss = tf.keras.losses.BinaryCrossentropy()
    logger.info(f"Model's Optimizer: {optimizer}\t Loss: {loss}")

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    logger.info("Model compiled successfully.")
except Exception as e:
    logger.error(f"Error building model: {e}")
    raise

# Check summary
model.summary()

In [35]:
# Configure callbacks
# ReduceLROnPlateau: reduce learning rate if val_loss doesn't improve
reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.2,
                              patience=3,
                              min_lr=0.0001)
# ModelCheckpoint: save the best model based on val_loss
checkpoint = ModelCheckpoint('sentiment140_model.h5',
                             monitor='val_loss',
                             save_best_only=True)

In [41]:
# Train the model
# Input: x_train_padded with shape [80000, 30]
# Output: y_train with shape [80000,] (binary labels)
# validation_split=0.2 means 20% of training data is used for validation
try:
    logger.info(f"Starting to train the model for {EPOCHS} epochs")
    history = model.fit(
        X_train_padded, y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.2,
        callbacks=[reduce_lr, checkpoint],
        verbose=1
    )
except Exception as e:
    logger.error(f"Error occured during training: {str(e)}")
    raise

Epoch 1/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 508ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan - learning_rate: 0.0010
Epoch 2/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 553ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan - learning_rate: 0.0010
Epoch 3/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 585ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan - learning_rate: 0.0010
Epoch 4/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 566ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan - learning_rate: 2.0000e-04
Epoch 5/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 537ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan - learning_rate: 2.0000e-04
Epoch 6/15
[1m250/250[0m [32m

# Evaluation

In [None]:
# Loss & Accuracy plot to evaluate learning process
plt.figure(figsize=(12, 4))

#  Loss
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss Curve')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

#  Accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy Curve')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.savefig('sentiment140_metrics.png')

In [43]:
# Evaluating the model
# Test trained model on test set
# Input: X_test_padded ==> [20000, 30]
# Output: y_test ==> [, 20000]
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


Test Loss: nan, Test Accuracy: 0.0000


In [None]:
# Predicting with model
# Sample tweet
sample_tweet = X_test[:1]  # متن خام
sample_seq = tokenizer.texts_to_sequences(sample_tweet)
sample_padded = pad_sequences(sample_seq, maxlen=MAX_LENGTH, padding='pre', truncating='pre')

# sample_padded ==> [1, 30]
prediction = model.predict(sample_padded)
print(f"Tweet: {sample_tweet[0]}")
print(f"Prediction: {'Positive' if prediction[0] > 0.5 else 'Negative'}")
print(f"True Label {'Positive' if y_test[0] == 1 else 'Negative'}")