In [2]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import re
import time

# Load and preprocess the data
# The CSV is read without a header row; column names are supplied via `names=`.
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
# NOTE(review): absolute, machine-specific path. The value is kept unchanged so the
# notebook still runs where it was authored; edit this one constant to relocate the data.
DATASET_PATH = 'C:/Users/IDEH/Desktop/Sentimental/Data/Sentiment140.csv'
dataset = pd.read_csv(DATASET_PATH, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

# Map sentiment labels to integers (0: negative, 4: positive)
sentiment_map = {0: 0, 4: 1}  # 0: negative, 1: positive
mapped_sentiment = dataset['sentiment'].map(sentiment_map)

# Series.map() yields NaN for any label missing from sentiment_map. Fail loudly here
# instead of letting NaN targets reach model.fit().
unmapped_labels = dataset.loc[mapped_sentiment.isna(), 'sentiment'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected sentiment labels (not in sentiment_map): {unmapped_labels.tolist()}"
    )

# Replace the column with an explicitly typed integer column (values are only 0 or 1 here).
dataset['sentiment'] = mapped_sentiment.astype('int64')

# Preprocess the text data
def preprocess(textdata):
    """Normalize raw tweets into lemmatized, space-separated token strings.

    Each tweet goes through these steps, in order:
      1. lowercase;
      2. replace URLs (http://, https://, or www. at the start of the tweet or
         after a space) with ' URL';
      3. replace @mentions with ' USER';
      4. replace every character that is not a letter or digit with a space;
      5. shorten any run of three or more identical characters to two;
      6. lemmatize each whitespace-separated token with WordNetLemmatizer.

    Args:
        textdata: Iterable of tweet strings.

    Returns:
        list[str]: One cleaned string per input tweet, in input order. Tokens
        are joined by single spaces with no trailing space; a tweet with no
        tokens left yields ''.
    """
    # NOTE(review): WordNetLemmatizer needs the NLTK 'wordnet' corpus to be installed;
    # no nltk.download(...) call is visible in this notebook -- confirm the environment.
    lemmatizer = WordNetLemmatizer()

    # Compile once: the loop below runs once per tweet.
    # `(?:^| )www\.` also catches a tweet that *starts* with "www.", which the
    # space-prefixed form alone would miss.
    url_re = re.compile(r"(?:https?://|(?:^| )www\.)[^ ]*")
    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    user_re = re.compile(r"@[^\s]+")
    non_alnum_re = re.compile(r"[^a-zA-Z0-9]")
    repeat_re = re.compile(r"(.)\1\1+")
    repeat_replacement = r"\1\1"

    processed_text = []
    for tweet in tqdm(textdata, desc="Processing tweets", unit="tweet"):
        tweet = tweet.lower()
        # Placeholders are inserted after lowercasing, so they stay uppercase in the output.
        tweet = url_re.sub(' URL', tweet)
        tweet = user_re.sub(' USER', tweet)
        tweet = non_alnum_re.sub(" ", tweet)
        tweet = repeat_re.sub(repeat_replacement, tweet)
        # join() avoids rebuilding the string once per word.
        processed_text.append(
            ' '.join(lemmatizer.lemmatize(word) for word in tweet.split())
        )

    return processed_text

# Clean every tweet in the 'text' column; preprocess returns one string per input, in order.
processed_text = preprocess(dataset['text'].to_numpy())

Processing tweets: 100%|██████████| 1600000/1600000 [04:32<00:00, 5877.98tweet/s]


In [3]:

# Tokenization and Padding
max_features = 10000  # passed to Tokenizer(num_words=...) and used as the Embedding input_dim
max_len = 50          # every sequence is padded/truncated to this many tokens

# Split data
# Row indices are split first so the tokenizer vocabulary can be learned from the
# training tweets only; fitting it on the full corpus would leak test-set word
# statistics into the features.
# For a given sample count, test_size and random_state, train_test_split selects rows
# independently of the array contents, so this is the same partition as
# train_test_split(X, Y, test_size=0.2, random_state=42).
Y = dataset['sentiment'].values
row_ids = np.arange(len(processed_text))
train_ids, test_ids = train_test_split(row_ids, test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts([processed_text[i] for i in train_ids])

# X still covers every row (later cells read X.shape[1]); words the tokenizer did not
# keep are simply dropped from the sequences.
X = tokenizer.texts_to_sequences(processed_text)
X = pad_sequences(X, maxlen=max_len)

X_train, X_test = X[train_ids], X[test_ids]
Y_train, Y_test = Y[train_ids], Y[test_ids]

In [4]:
# Build model
# Architecture: token ids -> 128-d embeddings -> spatial dropout -> bidirectional LSTM(64)
# -> single sigmoid unit (binary sentiment probability).
model = Sequential()
# An explicit Input declares the sequence length; Embedding(input_length=...) is
# deprecated in Keras 3.
model.add(tf.keras.Input(shape=(X.shape[1],)))
model.add(Embedding(max_features, 128))
model.add(SpatialDropout1D(0.2))
# NOTE(review): a non-zero recurrent_dropout is expected to rule out the fused cuDNN LSTM
# kernel, which makes GPU training much slower -- confirm whether it is worth the cost.
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Callbacks
# Stop after 3 epochs without val_loss improvement and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# min_lr must sit below the optimizer's starting rate (Adam defaults to 1e-3);
# with min_lr=0.001 the callback could never lower the learning rate.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5)

# Train model
# validation_split holds out the last 10% of the (already shuffled) training rows.
history = model.fit(
    X_train,
    Y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping, reduce_lr],
)

Epoch 1/10

KeyboardInterrupt: 

In [None]:
# Evaluate model
# The sigmoid output is a probability per row; rounding turns it into a 0/1 class label.
Y_pred = model.predict(X_test)
Y_pred = np.round(Y_pred).astype(int).flatten()

accuracy = accuracy_score(Y_test, Y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(Y_test, Y_pred, average='binary')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Plot training history
# NOTE: `history` only exists if the training cell ran to completion (an interrupted
# model.fit never assigns it).
# Accuracy and loss get separate panels: they live on different scales, so one shared
# y-axis makes both curves hard to read.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 6))

ax_acc.plot(history.history['accuracy'], label='train_accuracy')
ax_acc.plot(history.history['val_accuracy'], label='val_accuracy')
ax_acc.set(title='Accuracy', xlabel='Epochs', ylabel='Accuracy')
ax_acc.legend()

ax_loss.plot(history.history['loss'], label='train_loss')
ax_loss.plot(history.history['val_loss'], label='val_loss')
ax_loss.set(title='Loss', xlabel='Epochs', ylabel='Loss')
ax_loss.legend()

fig.suptitle('Model Performance')
fig.tight_layout()
plt.show()


Processing tweets: 100%|████████████████████████████████████████████████| 1600000/1600000 [04:57<00:00, 5375.64tweet/s]
