## 1. CNN (with GloVe Embeddings)

In [13]:
#Dataframe
import pandas as pd

#Matplotlib.pyplot
import matplotlib.pyplot as plt

#Datasets
from datasets import load_dataset

#Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

#Preprocessing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


#Tensor flow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.initializers import Constant

In [31]:
import nltk as nltk
import numpy as np

In [16]:
# Fetch the NLTK resources used by the text preprocessing further down:
#   punkt / punkt_tab -> tokenizer models behind word_tokenize
#   stopwords         -> stop-word lists (the English list is used)
#   wordnet           -> data used by WordNetLemmatizer
# Recent NLTK releases (3.9+) load 'punkt_tab' instead of the pickled 'punkt'
# models, so both are requested to keep the notebook working across versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/katarinalitricin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/katarinalitricin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/katarinalitricin/nltk_data...


True

In [17]:
# Fetch the "sentiment140" dataset; trust_remote_code lets its builder script run
dataset = load_dataset(
    "sentiment140",
    trust_remote_code=True,
)

Downloading builder script: 100%|██████████| 4.03k/4.03k [00:00<00:00, 7.34MB/s]
Downloading readme: 100%|██████████| 6.84k/6.84k [00:00<00:00, 9.60MB/s]
Downloading data: 100%|██████████| 81.4M/81.4M [00:14<00:00, 5.58MB/s]  
Generating train split: 100%|██████████| 1600000/1600000 [00:38<00:00, 41370.58 examples/s]
Generating test split: 100%|██████████| 498/498 [00:00<00:00, 36113.41 examples/s]


In [18]:
# Convert the train split to a pandas DataFrame. Dataset.to_pandas() converts
# the underlying Arrow table directly, instead of iterating over every row as
# a Python dict the way pd.DataFrame(dataset['train']) does.
df = dataset['train'].to_pandas()

In [19]:
# Preview the loaded frame (pandas abbreviates the middle rows of a large frame)
df

Unnamed: 0,text,date,user,sentiment,query
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he can't update his Facebook by ...,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,@Kenichan I dived many times for the ball. Man...,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"@nationwideclass no, it's not behaving at all....",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
...,...,...,...,...,...
1599995,Just woke up. Having no school is the best fee...,Tue Jun 16 08:40:49 PDT 2009,AmandaMarie1028,4,NO_QUERY
1599996,TheWDB.com - Very cool to hear old Walt interv...,Tue Jun 16 08:40:49 PDT 2009,TheWDBoards,4,NO_QUERY
1599997,Are you ready for your MoJo Makeover? Ask me f...,Tue Jun 16 08:40:49 PDT 2009,bpbabe,4,NO_QUERY
1599998,Happy 38th Birthday to my boo of alll time!!! ...,Tue Jun 16 08:40:49 PDT 2009,tinydiamondz,4,NO_QUERY


In [20]:
# Call the label column 'target', then discard any row that is missing
# either its label or its tweet text.
df = (
    df
    .rename(columns={"sentiment": "target"})
    .dropna(subset=['target', 'text'])
)


In [21]:
# Convert to binary classification: label 4 becomes 1, every other label 0.
# Accepting 1 alongside 4 makes the cell idempotent: with a plain `== 4` test,
# running the cell a second time would turn every label into 0.
# The vectorised isin() also avoids a Python-level call per row.
df['target'] = df['target'].isin([1, 4]).astype(int)

In [22]:
# Shared helpers for preprocess_text: a set gives fast membership checks
# for the English stop words, and one lemmatizer instance is reused.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one raw tweet into a space-separated string of lemmas.

    Steps: lowercase the input; replace @mentions, http(s) links and every
    run of non-alphanumeric characters with a single space; tokenize; drop
    English stop words; lemmatize what is left.

    Args:
        text: The value to clean. It is coerced with ``str()`` first.

    Returns:
        str: The cleaned tokens joined by single spaces (possibly empty).
    """
    # Raw string: "\S" inside a normal literal is an invalid escape sequence
    # and triggers a warning on current Python versions. The former
    # "http?:\S" alternative was a typo'd duplicate of "https?:\S+" and is
    # dropped.
    text_cleaning_re = r"@\S+|https?:\S+|[^A-Za-z0-9]+"
    text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()

    # No separate punctuation pass is needed: the substitution above already
    # replaced every character that is not a letter or a digit.

    # Tokenization
    words = word_tokenize(text)

    # Remove stopwords, then lemmatize the surviving tokens
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Clean every tweet element-wise with preprocess_text
df["text"] = df["text"].map(preprocess_text)

In [23]:
# Vocabulary cap passed to the Tokenizer, and first dimension of the embedding matrix
max_words = 10_000
# Padding length for the tokenised tweets
max_len = 100

In [24]:
# Hold out 20% of the cleaned tweets as a test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)


In [25]:
# Fit the vocabulary on the training tweets only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Tweets -> lists of integer word indices
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Bring every sequence to exactly max_len entries
X_train_pad = pad_sequences(sequences=X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(sequences=X_test_seq, maxlen=max_len)

### GloVe embeddings

In [29]:
import urllib.request
import zipfile
import os

# Location of the GloVe 6B archive (HTTPS so the download is not sent in the clear)
url = "https://nlp.stanford.edu/data/glove.6B.zip"
file_name = "glove.6B.zip"

# The only extracted file this notebook reads later on
glove_file = "glove.6B.100d.txt"

if os.path.exists(glove_file):
    # Skip the large download when re-running the notebook
    print("GloVe embeddings already present; skipping download.")
else:
    # Download the file
    urllib.request.urlretrieve(url, file_name)

    # Extract the zip file into the working directory
    with zipfile.ZipFile(file_name, 'r') as zip_ref:
        zip_ref.extractall(".")

    # Remove the zip file to clean up
    os.remove(file_name)

    print("Download and extraction complete.")

Download and extraction complete.


 Load GloVe Embeddings into a Matrix:

Pre-trained GloVe embeddings capture semantic relationships between words based on vast amounts of text data, providing a richer starting point for the model compared to random initialization.

In [32]:
embedding_dim = 100
embedding_index = {}

# Parse the GloVe text file: each line is a token followed by its vector components.
# 'glove.6B.100d.txt' must be present in the working directory.
with open('glove.6B.100d.txt', encoding='utf-8') as glove_fh:
    for raw_line in glove_fh:
        token, *components = raw_line.split()
        embedding_index[token] = np.asarray(components, dtype='float32')

# Row i holds the GloVe vector of the word with tokenizer index i.
# Rows for words without a GloVe vector stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, idx in tokenizer.word_index.items():
    if idx >= max_words:
        continue
    vector = embedding_index.get(token)
    if vector is None:
        continue
    embedding_matrix[idx] = vector

In [33]:
max_len = 150  # Increased sequence length (previously 100)

# Only the padding length changes here, so re-pad the integer sequences that
# the already-fitted tokenizer produced. Repeating the train/test split and
# the tokenizer fit would redo expensive work for the same result, and keeping
# the existing tokenizer means embedding_matrix stays tied to the tokenizer
# it was built from.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)


In [34]:
# CNN over GloVe vectors: three Conv1D stages followed by a dense classifier
cnn_layers = [
    # Embedding initialised from embedding_matrix; trainable=False keeps it fixed
    Embedding(
        max_words,
        embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=max_len,
        trainable=False,
    ),
    # Convolution stage 1
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    Dropout(rate=0.2),
    # Convolution stage 2
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    Dropout(rate=0.2),
    # Convolution stage 3, collapsed to one value per filter
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    # Classifier head with a single sigmoid output
    Dense(units=128, activation='relu'),
    Dropout(rate=0.5),
    Dense(units=1, activation='sigmoid'),
]
model1 = Sequential(cnn_layers)

model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model1.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 150, 100)          1000000   
                                                                 
 conv1d_2 (Conv1D)           (None, 146, 128)          64128     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 29, 128)           0         
 g1D)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 29, 128)           0         
                                                                 
 conv1d_3 (Conv1D)           (None, 25, 128)           82048     
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 5, 128)            0         
 g1D)                                                 

Training the CNN model with GloVe embeddings for up to 10 epochs:

In [35]:
# Stop when val_loss has not improved for 3 epochs and restore the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.2 after 2 epochs without val_loss improvement,
# never going below 0.0001
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=2,
    min_lr=0.0001,
)

# Train for up to 10 epochs, using 20% of the training data for validation
history = model1.fit(
    X_train_pad,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
# Score the trained model on the held-out test set (returns loss, then accuracy)
test_metrics = model1.evaluate(X_test_pad, y_test)
final_loss, final_accuracy = test_metrics
print(f'Final Test Accuracy: {final_accuracy}')

Final Test Accuracy: 0.7290499806404114


### Model architecture:

- Embedding Layer

The embedding layer transforms input words into dense vector representations using pre-trained GloVe embeddings. GloVe (Global Vectors for Word Representation) embeddings are pre-trained on large corpora and capture semantic relationships between words. Using these embeddings helps our model leverage this rich, pre-existing knowledge, improving its performance even with limited training data. The layer takes sequences of integers (each representing a word) as input and produces dense vector representations for each word in the sequence.

- Convolutional Layers

The convolutional layers apply convolution operations to capture local patterns and features in the text data. CNNs are effective at capturing local dependencies and n-gram features in text, making them suitable for text classification tasks. The choice of filters and kernel size is crucial:

-- Number of Filters (128): A larger number of filters can capture more diverse features. The choice of 128 filters balances computational efficiency with the ability to learn rich features.

-- Kernel Size (5): A kernel size of 5 allows the model to consider 5-gram sequences at a time, which helps in capturing meaningful patterns that span multiple words.

- Max Pooling Layers

Pooling layers reduce the dimensionality of the feature maps by taking the maximum value over a specified window. This helps in reducing the computation cost and controlling overfitting by providing a form of down-sampling. A pooling size of 5 is used, which shrinks the feature map length by a factor of five (e.g. from 146 to 29 after the first convolution), reducing the computational load while retaining the most important features.

- Global Max Pooling Layer

A global max pooling layer reduces each feature map to a single value by taking the maximum value. This is useful to get a fixed-length output regardless of the input size. It further downsamples the feature maps to a fixed-length vector, which is essential for feeding into dense layers.

- Dense Layers

Dense (fully connected) layers take the features extracted by the convolutional and pooling layers and perform the actual classification. Dense layers are used to combine the features learned by previous layers and make the final classification decision. We use a moderate number of units (128) to balance complexity and performance, and the ReLU (Rectified Linear Unit) activation function introduces non-linearity, enabling the model to learn more complex patterns.

- Dropout Layer

Dropout layers help prevent overfitting by randomly setting a fraction of input units to 0 during training. Dropout is a regularization technique that ensures the model does not rely too heavily on any single feature. A lighter dropout rate of 0.2 is applied after each of the first two convolution/pooling stages, and a rate of 0.5, a common choice that balances regularization and performance, is applied after the dense layer.

- Output Layer

The output layer produces the final binary classification output. The sigmoid activation function is used, as it outputs a probability value between 0 and 1, which is suitable for binary classification tasks.