# NLP Exercise 1: Sentiment Analysis
---

## Loading necessary libraries.

In [1]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from nltk.corpus import twitter_samples
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import numpy as np
import re
import nltk
import string
import pandas as pd
from nltk.corpus import twitter_samples, stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

##  Traditional Sentiment Analysis approach


### Preprocessing the dataset



Tokenization after normalization

In [2]:
# Import nltk samples, stopwords
nltk.download('twitter_samples')
nltk.download('stopwords')

# Read corpus package
print(twitter_samples.fileids())

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']


[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\vpming\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vpming\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
print(stopwords)

<WordListCorpusReader in 'C:\\Users\\vpming\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>


In [4]:
def process_tweet(tweet):
    """
    Process tweet function.
    
    Input:
        tweet: a string containing a tweet.
    Output:
        tweets_clean: a list of words containing the processed tweet.

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [5]:
# Extract positive and negative tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

all_tweets = positive_tweets + negative_tweets

# Create labels: 1 for positive, 0 for negative
positive_labels = [1] * len(positive_tweets)
negative_labels = [0] * len(negative_tweets)

all_labels = positive_labels + negative_labels


Create DataFrame for visualization

In [6]:
def create_dataframe(tweets, labels):
    """
    Create DataFrame to visualize the dataset.
    
    Input:
        tweet: a list containing tweet texts.
        labels: list containing labels for each tweet (1 for positve, 0 for negative).
    Output:
        A DataFrame with 2 columns of tweets and labels.

    """
    if len(tweets) != len(labels):
        raise ValueError

    df = pd.DataFrame({'tweets': tweets, 'labels': labels})

    return df

In [7]:
# Create DataFrame for tweets and labels
df = create_dataframe(all_tweets, all_labels)
print(df.head())

                                              tweets  labels
0  #FollowFriday @France_Inte @PKuchly57 @Milipol...       1
1  @Lamb2ja Hey James! How odd :/ Please call our...       1
2  @DespiteOfficial we had a listen last night :)...       1
3                               @97sides CONGRATS :)       1
4  yeaaaah yippppy!!!  my accnt verified rqst has...       1


In [8]:
# Preparing data for the Word2Vec model
cleaned_tweets = []
for tweet in all_tweets:
    cleaned_tweet = process_tweet(tweet)
    cleaned_tweets.append(cleaned_tweet)

In [9]:
# Word Embeddings using Word2Vec model
word2vec_model = Word2Vec(cleaned_tweets, vector_size=20,
                          window=5, min_count=5, workers=4)

word_embeddings = word2vec_model.wv

In [11]:
# Example using word embeddings
print(word_embeddings['listen'])

[ 0.37327787  0.23462231  0.05815687  0.4243192  -0.16658585 -0.03578897
  0.34700608  0.8236061  -0.3567064   0.20356919  0.298717   -0.25387537
  0.5562919  -0.05993766  0.32543904  0.13163735  0.7409497  -0.2601367
 -0.11768366 -0.69812775]


### Create tweet embeddings

Since each tweet consists of multiple words, convert each tweet into a single vector representation by averaging the Word2Vec embeddings of all the words in the tweet. This averaged vector will represent the tweet in a fixed-dimensional space, suitable for input into a classifier.

In [12]:
def get_tweet_embedding(tweet_tokens, word2vec_model):
    """
    Generate the embedding for a tweet by averaging word vectors.
    
    Input: 
        tweet_tokens: a list of tokens from processed tweet.
        word2vec_model: a trained Word2Vec model that contains word embeddings.
    Output:
        tweet_embedding: a numpy array representing the averaged embedding 
                        vector for a tweet. The dimension of the array is 
                        equal to the vector_size of the Word2Vec model.

    """
    tweet_vecs = []

    for word in tweet_tokens:
        if word in word2vec_model.wv:
            tweet_vecs.append(word2vec_model.wv[word])

    if len(tweet_vecs) == 0:
        return np.zeros(word2vec_model.vector_size)

    tweet_embedding = np.mean(tweet_vecs, axis=0)

    return tweet_embedding

In [13]:
# Example for using get_tweet_embeddings function
tweet1 = df['tweets'][4]
tweet1_cleaned = process_tweet(tweet1)
print(tweet1_cleaned)
tweet1_embedding = get_tweet_embedding(tweet1, word2vec_model)
print(tweet1_embedding)

['yeaaah', 'yipppi', 'accnt', 'verifi', 'rqst', 'succeed', 'got', 'blue', 'tick', 'mark', 'fb', 'profil', ':)', '15', 'day']
[ 0.35383517  0.22469711  0.05685812  0.3446301  -0.12399756 -0.00763755
  0.27155995  0.654489   -0.33020473  0.19128002  0.24953935 -0.15964511
  0.48445526 -0.09397851  0.24069586  0.1165326   0.60909647 -0.20767407
 -0.16720642 -0.53667706]


Prepare and split the data into training and testing sets

In [14]:
# Convert labels to a numpy array
labels = np.array(all_labels)

# Generate embeddings for all tweets in the dataset
tweet_embeddings = np.array([get_tweet_embedding(
    tweet, word2vec_model) for tweet in cleaned_tweets])

# Split the dataset into training and test sets (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(
    tweet_embeddings, labels, test_size=0.2, random_state=42)


### Train Classifier

Train a Classifier using Logistic Regression as a baseline classifier, which works well for binary classification tasks like sentiment analysis

In [15]:
# Initialize and train the logistic regression classifier
clf = LogisticRegression(random_state=42)
clf.fit(x_train, y_train)

# Predict on the test set
y_pred = clf.predict(x_test)

### Evaluate the model

In [16]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: ", accuracy)

# Detailed classification report
print(classification_report(y_test, y_pred))

Test Accuracy:  0.9185
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       988
           1       0.92      0.91      0.92      1012

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000



Example prediction by using the trained model

In [17]:
def predict_tweet(tweet, word2vec_model, clf):
    """
    Predict sentiment of a tweet using a trained Word2Vec model and classifier.
    
    Input:
        tweet: raw text to predict sentiment.
        word2vec_model: a trained Word2Vec model containing word embeddings.
        clf: a trained classifier for sentiment predictions.
    Output:
        Returns "Positive" if the predicted sentiment is positive, 
                otherwise returns "Negative".
                
    """
    processed_tweet = process_tweet(tweet)
    tweet_embedding = get_tweet_embedding(processed_tweet, word2vec_model)
    prediction = clf.predict([tweet_embedding])

    return "Positive" if prediction == 1 else "Negative"

In [18]:
# Example prediction
new_tweet = "I like to study NLP <3"
print("Sentiment: ", predict_tweet(new_tweet, word2vec_model, clf))

Sentiment:  Positive


## Multi-Layer Perceptrons

Multi-Layer Perceptrons (MLP) can enhance the predictive power of sentiment analysis model by allowing it to capture more complex patterns in the data.

### Outline for building a Multi-Layer Perceptrons for Sentiment Analysis

#### 1. Data Preparation: 
- Use the embeddings generated for each tweet as input features for the MLP.

#### 2. Model Architecture:
- Design a simple MLP with several fully connected layers (dense layers), an activation function (ReLu) for non-linearity, and dropout layers to prevent overfitting.

- Use a final ouput layer with a sigmoid activation function for binary classification.

#### 3. Training and Evaluation:
- Train the MLP on the training set, validate on the test set, and evaluate performance using accuracy and a classification report.

#### 4. Hyperparameter Tuning:
- Experiment with the number of layers, number of neurons, dropout rates, and learning rate to optimize performance.

### Import necessary libraries

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim

### Implement the Multi-Layer Perceptron (MLP) class in PyTorch

In [21]:
class SentimentMLP(nn.Module):
    def __init__(self, embedding_dim: int, 
                 hidden_dim1: int, 
                 hidden_dim2: int, 
                 dropout: float):
        """
        Initialize the Multi-Layer Perceptrons (MLP) model for sentiment analysis.

        Parameters:
            input_dim (int): dimension of the input features (tweet embedding size).
            hidden_dim1 (int): Number of neurons in the first hidden layer.
            hidden_dim2 (int): Number of neurons in the second hidden layer.
            dropout_rate (float): Dropout rate to prevent overfitting.

        """
        super(SentimentMLP, self).__init__()

        # Using nn.Sequential to stack layers
        self.network = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim1),  # First hidden layer
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim1, hidden_dim2),  # Second hidden layer
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim2, 1),  # Output layer
            nn.Sigmoid()  # Sigmoid activation for binary classification
        )
    
    def forward(self, x):
        return self.network(x)

### Prepare the data

In [22]:
X = np.array(tweet_embeddings)  # tweet_embeddings generated from Word2Vec
y = np.array(labels)  # labels for the tweets (1 for positive, 0 for negative)

# Convert data to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

# Split the data into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
# Initialize the parameters for the model
embedding_dim = X_train.shape[1]
hidden_dim1 = 64
hidden_dim2 = 32
dropout_rate = 0.3

Initialize the model

In [24]:
mlp_model = SentimentMLP(
    embedding_dim=embedding_dim,
    hidden_dim1=hidden_dim1,
    hidden_dim2=hidden_dim2,
    dropout=dropout_rate
)

Implementation for training stage

In [25]:
def train(model, x_train, y_train, criterion, optimizer, num_epochs, batch_size, print_every):
    """
    Train a model with the given dataset, loss function, and optimizer.

    Parameters:
        model: the neural network model to train.
        x_train: training features.
        y_train: training labels.
        criterion: loss function.
        optimizer: optimizer to update model parameters.
        num_epochs: number of epochs to train.
        batch_size: size of each batch to training.
        print_every: frequency of printing loss (e.g, every 5 epochs)
    
    Returns:
        A list of loss values for each epoch.
        
    """

    epoch_losses = []
    
    model.train()

    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        permutation = torch.randperm(x_train.size(0))
        epoch_loss = 0

        # Mini-batch training
        for i in range(0, x_train.size(0), batch_size):
            # Select mini-batch
            index = permutation[i:i+ batch_size]
            batch_x, batch_y = x_train[index], y_train[index]

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(batch_x).squeeze()
            loss = criterion(outputs, batch_y)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Accumulate loss for each epoch
            epoch_loss += loss.item()
        
        # Average loss for each epoch
        avg_epoch_loss = epoch_loss / len(permutation)
        epoch_losses.append(avg_epoch_loss)

        # Print progress for each epoch
        if (epoch + 1) % print_every == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}, Loss: {avg_epoch_loss}]")
    
    return model, epoch_losses

Define the loss function and optimizer

In [26]:
criterion = nn.BCELoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001, weight_decay=1e-5)

Train model

In [27]:
model, losses = train(mlp_model, X_train, y_train, criterion, optimizer, num_epochs=20, batch_size=64, print_every=5)

Epoch [5/20, Loss: 0.004931228272616863]
Epoch [10/20, Loss: 0.003983619369566441]
Epoch [15/20, Loss: 0.003323741973377764]
Epoch [20/20, Loss: 0.0029450216004624965]


Evaluate the model

In [28]:
def evaluate(model, x_test, y_test):
    """
    Evaluate the model on the test dataset and print performance metrics.
    
    Parameters:
        model (nn.Module): Trained model to evaluate.
        X_test (torch.Tensor): Test features.
        y_test (torch.Tensor): Test labels.
        
    Returns:
        Accuracy of the model on the test set.

    """

    model.eval()

    with torch.no_grad():
        outputs = model(x_test)
        predictions = (outputs > 0.5).int()

    accuracy = accuracy_score(y_test, predictions)

    print('Test Accuracy:', accuracy)
    print(classification_report(y_test, predictions))

In [29]:
evaluate(model, X_test, y_test)

Test Accuracy: 0.925
              precision    recall  f1-score   support

         0.0       0.92      0.93      0.92       988
         1.0       0.93      0.92      0.93      1012

    accuracy                           0.93      2000
   macro avg       0.93      0.93      0.92      2000
weighted avg       0.93      0.93      0.93      2000



Predict Sentiment for new data

In [30]:
def preprocess_text(text):
    """
    Preprocessing the new data using process tweet function and Word2Vec model defined above.
    
    """
    processed_tokens = process_tweet(text)
    
    tweet_embedding = get_tweet_embedding(processed_tokens, word2vec_model)
    
    # Convert to a tensor and reshape to match the model's expected input shape
    tweet_embedding_tensor = torch.tensor(tweet_embedding, dtype=torch.float32).unsqueeze(0)
    return tweet_embedding_tensor

In [31]:
def predict_sentiment(model, text):
    """
    Predict the sentiment of a given text using the trained model.
    
    Parameters:
        model: the trained MLP model.
        text: input text to analyze.
    
    Returns:
        "Positive" if sentiment is positive, otherwise "Negative".
    """
    # Set the model to evaluation mode
    model.eval()
    
    # Preprocess the text and get the embedding
    input_tensor = preprocess_text(text)
    
    # Disable gradient computation for inference
    with torch.no_grad():
        output = model(input_tensor)  # Model outputs probability due to Sigmoid activation
    
    # Interpret the output
    prediction = (output.item() > 0.5)  # Threshold at 0.5 for binary classification
    sentiment = "Positive" if prediction else "Negative"
    
    return sentiment

In [32]:
# Example text to analyze
new_text = "Oh great, it's raining again!"

In [33]:
# Predict the sentiment
sentiment = predict_sentiment(model, new_text)
print(sentiment)

Positive
