# Sentiment analysis with BERT

Using transformers with the distilled bert-base model on the IMDB dataset, to perform continuous score sentiment analysis.

Written by Luc Bijl.

Retrieving IMDB training and testing dataset from datasets directory.

In [1]:
import os
import pandas as pd

train_dataset = "../datasets/aciimdb/train"
test_dataset = "../datasets/aciimdb/test"

train_reviews = []
train_scores = []
test_reviews = []
test_scores = []

for dataset, reviews, scores in [(train_dataset, train_reviews, train_scores), (test_dataset, test_reviews, test_scores)]:
    for sentiment in ['pos','neg']:
        sentiment_dir = os.path.join(dataset,sentiment)

        for filename in os.listdir(sentiment_dir):
            if filename.endswith('.txt'):
                with open(os.path.join(sentiment_dir,filename),'r',encoding='utf-8') as file:
                    review = file.read()
                    sentiment_score = int(filename[:-4].split('_')[1])

                    scores.append(sentiment_score)
                    reviews.append(review)

train_data = {'Review': train_reviews, 'Sentiment': train_scores}
test_data = {'Review': test_reviews, 'Sentiment': test_scores}

df_train_data = pd.DataFrame(train_data)
df_test_data = pd.DataFrame(test_data)

print("Train Data:")
print(df_train_data.head())
print("\nTest Data:")
print(df_test_data.head())

Train Data:
                                              Review  Sentiment
0  In my opinion, the best movie ever. I love whe...         10
1  I have seen The Running Man several times as I...          9
2  actually... that "video camera" effect, is jus...          8
3  The year 1995, when so many people talked abou...          9
4  Bravo! Morgan Freeman is an actor, who researc...         10

Test Data:
                                              Review  Sentiment
0  Alex North (John Cassavetes) has problems in r...          7
1  I won't go to a generalization, and say it's t...         10
2  Movie about two Australian girls--Debbie (Nell...          7
3  A bland title disguises this solidly-carpenter...          7
4  I was laying in bed, flicking through the chan...          8


Normalizing the training and testing dataset to a range of -1 to 1.

In [2]:
def normalize(n):
    normal_n = (n - 5) / 5
    return normal_n

df_train_data['Normal sentiment'] = normalize(df_train_data['Sentiment'])
df_test_data['Normal sentiment'] = normalize(df_test_data['Sentiment'])

print("Train Data:")
print(df_train_data.head())
print("\nTest Data:")
print(df_test_data.head())

Train Data:
                                              Review  Sentiment  \
0  In my opinion, the best movie ever. I love whe...         10   
1  I have seen The Running Man several times as I...          9   
2  actually... that "video camera" effect, is jus...          8   
3  The year 1995, when so many people talked abou...          9   
4  Bravo! Morgan Freeman is an actor, who researc...         10   

   Normal sentiment  
0               1.0  
1               0.8  
2               0.6  
3               0.8  
4               1.0  

Test Data:
                                              Review  Sentiment  \
0  Alex North (John Cassavetes) has problems in r...          7   
1  I won't go to a generalization, and say it's t...         10   
2  Movie about two Australian girls--Debbie (Nell...          7   
3  A bland title disguises this solidly-carpenter...          7   
4  I was laying in bed, flicking through the chan...          8   

   Normal sentiment  
0               

Creating a validation sample.

In [3]:
samples = 500
df_validation_data = df_test_data.sample(n=samples,random_state=42)
df_validation_data.reset_index(drop=True,inplace=True)

Preparing the data for BERT, this includes tokenization, encoding and creating dataloaders for both training and testing datasets.

In [4]:
import torch
from transformers import DistilBertTokenizerFast
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Tokenize and encode the text data
train_encodings = tokenizer(df_train_data['Review'].tolist(), truncation=True, padding=True, return_tensors='pt')
validation_encodings = tokenizer(df_validation_data['Review'].tolist(), truncation=True, padding=True, return_tensors='pt')

# Create data loaders
train = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(df_train_data['Normal sentiment']))
train_dataloader = DataLoader(train, batch_size=16, shuffle=True)

validation = torch.utils.data.TensorDataset(validation_encodings['input_ids'], validation_encodings['attention_mask'], torch.tensor(df_validation_data['Normal sentiment']))
validation_dataloader = DataLoader(validation, batch_size=16, shuffle=False)

Defining the model: BERT.

In [5]:
model = DistilBertForSequenceClassification.from_pretrained(model_name,num_labels=1)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining the optimizer and loss function.

In [6]:
import torch
from torch.optim import AdamW
from torch.nn import MSELoss

optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = MSELoss()

Training loop. Here BERT will be trained with the training dataset.

In [7]:
from torch.utils.tensorboard import SummaryWriter

log_dir = 'logs'
writer = SummaryWriter(log_dir)
global_step = 0

num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        global_step += 1

        input_ids, attention_mask, scores = batch

        optimizer.zero_grad()
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_scores = output.logits.view(-1)

        loss = loss_fn(predicted_scores, scores.float())
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        writer.add_scalar('Loss-train', loss, global_step)

        model.eval()
        validation_total_loss = 0
        num_batches = 0

        for batch in validation_dataloader:
            with torch.no_grad():
                input_ids, attention_mask, scores = batch
                output = model(input_ids=input_ids, attention_mask=attention_mask)
                predicted_scores = output.logits.view(-1)

                loss = loss_fn(predicted_scores, scores.float())
                validation_total_loss += loss.item()
                num_batches += 1

        if num_batches > 0:
            validation_loss = validation_total_loss / num_batches
        else:
            validation_loss = 0.0   

        writer.add_scalar('Loss-validation', validation_loss,global_step)

        model.train()

writer.close()

KeyboardInterrupt: 

Creating a testing sample.

In [8]:
samples = 500
df_test_sample = df_test_data.sample(n=samples,random_state=42)
df_test_sample.reset_index(drop=True,inplace=True)

Performing sentiment analysis with BERT.

In [9]:
max_seq_length = 512
bert_sentiment_score = []

for index, row in df_test_sample.iterrows():
    text = row['Review']

    chunks = [text[i:i + max_seq_length] for i in range(0, len(text), max_seq_length)]
    chunk_scores = []

    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, padding=True)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits

        chunk_scores.append(logits.item())

    combined_score = sum(chunk_scores) / len(chunk_scores)
    bert_sentiment_score.append(combined_score)

df_test_sample['BERT sentiment'] = bert_sentiment_score

df_test_sample.head()

Unnamed: 0,Review,Sentiment,Normal sentiment,BERT sentiment
0,"I can not say this movie was a hilarious, but ...",7,0.4,0.164758
1,How do stories this bad get made. That's not a...,3,-0.4,-0.457671
2,The most beautiful film. If one is looking for...,10,1.0,0.678035
3,This film was really terrible.<br /><br />Howe...,1,-0.8,-0.473068
4,"Writer & director Jay Andrews, a.k.a. Jim Wyno...",1,-0.8,-0.220075


Determining the accuracy of BERT.

In [10]:
from scipy.stats import pearsonr

correlation, _ = pearsonr(df_test_sample['BERT sentiment'],df_test_sample['Normal sentiment'])

print(f"Sample correlation: {correlation:.2f}")

Sample correlation: 0.78


Saving the model for later use.

In [11]:
torch.save(model, 'bert-imdb.pth')

A better method of evaluation. With as output both the MAE and R-value.

In [12]:
test_encodings = tokenizer(df_test_data['Review'].tolist(), truncation=True, padding=True, return_tensors='pt')

test = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(df_test_data['Normal sentiment']))
test_dataloader = DataLoader(test, batch_size=16, shuffle=False)

model.eval()
total_mae = 0
total_samples = 0
list_predicted_scores = []

with torch.no_grad():

    for batch in test_dataloader:
        input_ids, attention_mask, scores = batch
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_scores = output.logits.view(-1)

        list_predicted_scores.extend(predicted_scores.tolist())

        mae = torch.abs(predicted_scores - scores.float()).sum().item()
        total_mae += mae
        total_samples += scores.size(0)

mean_mae = total_mae / total_samples
print(f"Mean Absolute Error (MAE): {mean_mae:.4f}")

df_test_data['BERT sentiment'] = list_predicted_scores

correlation, _ = pearsonr(df_test_data['Normal sentiment'],df_test_data['BERT sentiment'])
print(f"Pearson Correlation Coefficient (R-value): {correlation:.4f}")

Mean Absolute Error (MAE): 0.2965
