# Sentiment Analysis with transformers

Here we perform sentiment analysis on our essays dataset using transformer models available on Hugging Face.

In [1]:
import numpy as np
import pandas as pd

import torch
import transformers
import torch.nn.functional as F

from transformers import pipeline

In [46]:
# Read data
essays = pd.read_csv('essays.csv')

# Create a column for essay lenght
def calculate_length(wordlist):
    return len(wordlist)

essays['Essay Length'] = essays['Preprocessed Text'].apply(calculate_length)
essays.head()

# Is there an outlier or a too large value in the problematic batch
max10 = np.max(essays['Essay Length'][1001:1500])

# 1001 to 1500 too large sequence of words?
sum00 = np.sum(essays['Essay Length'][:500])
sum05 = np.sum(essays['Essay Length'][501:1000])
sum10 = np.sum(essays['Essay Length'][1001:1500])
# there seems to be no problem with sequence size in batch10

# Drop length column
essays.drop('Essay Length', axis=1)

Unnamed: 0,ID,Essay Text,Preprocessed Text
0,N28280Y,"I am happily married, we are grand-parents. Ou...","['happily', 'married', 'grandparent', 'two', '..."
1,N13960Q,"I am retired, not living in London, probably i...","['retired', 'living', 'london', 'probably', 'n..."
2,N23786Z,I imagine I'll still be teaching french at Pri...,"['imagine', 'ill', 'still', 'teaching', 'frenc..."
3,N17606R,I am retired from work. I enjoy leisurely time...,"['retired', 'work', 'enjoy', 'leisurely', 'tim..."
4,N19466F,"Retired and moved further away from London, Su...","['retired', 'moved', 'away', 'london', 'sussex..."
...,...,...,...
14747,N11272P,STILL LIVING WITH ALL THE FAMILY MEMBERS AROUN...,"['still', 'living', 'family', 'member', 'aroun..."
14748,N11272P,My interests are my husband and children. They...,"['interest', 'husband', 'child', 'always', 'pr..."
14749,N11272P,"My family & friends are very important to me, ...","['family', 'friend', 'important', 'hopefuly', ..."
14750,N11272P,When I do reach 60yrs I hope my health is stil...,"['reach', '60yrs', 'hope', 'health', 'still', ..."


In [3]:
# Instantiate a sentiment analysis pipeline
sentiment_classifier = pipeline('sentiment-analysis', 
                                model="siebert/sentiment-roberta-large-english")

# Sample sentences to try the model out
res = sentiment_classifier(['I like it.', 
                           'We hope you do not hate it.'])

for re in res:
    print(f"Sentiment: {re['label']}")
    print(f"Score: {re['score']}")

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Sentiment: POSITIVE
Score: 0.9988269209861755
Sentiment: POSITIVE
Score: 0.9900906085968018


In [47]:
# Perform sentiment analysis on the 'text_column' column (in batches of 500 observations)
results = sentiment_classifier(essays['Preprocessed Text'][:500].tolist())

# indexing with pandas: [a:b], a/excluding, b/including

In [48]:
results05 = sentiment_classifier(essays['Preprocessed Text'][500:1000].tolist())

In [None]:
# Perform sentiment analysis and print a sample of results
samples = sentiment_classifier(essays['Preprocessed Text'][11:20].tolist())

# Print first few observations
for sample in samples:
    print(f"Sentiment: {sample['label']}")
    print(f"Score: {sample['score']}")
    print()
    
len(samples)

### Add the sentiment analysis results into our essays dataset

In [49]:
# Initiliaze empty columns for sentiment analysis results
essays['Sentiment'] = ''
essays['Score'] = ''

In [50]:
# Add the sentiment analysis results into our dataset (batch 00)
sentiments = [result['label'] for result in results]
scores = [result['score'] for result in results]

# Fill in the results columns with sentiment and confidence score
essays.loc[:499, 'Sentiment'] = sentiments
essays.loc[:499, 'Score'] = scores

In [52]:
# Add the sentiment analysis results into our dataset (batch 05)
sentiments05 = [result['label'] for result in results05]
scores05 = [result['score'] for result in results05]

# Create columns with sentiment and confidence score
essays.loc[500:998, 'Sentiment'] = sentiments05[:499]
essays.loc[500:998, 'Score'] = scores05[:499]

In [53]:
# Frequencies of positive and negative sentiment
essays['Sentiment'][:999].value_counts(normalize=True)

Sentiment
POSITIVE    0.915916
NEGATIVE    0.084084
Name: proportion, dtype: float64

In [None]:
# Add the sentiment analysis results into our dataset (batch 10)
sentiments10 = [result['label'] for result in results10]
scores10 = [result['score'] for result in results10]

# Create columns with sentiment and confidence score
essays.loc['Sentiment'][1001:1500] = sentiments10
essays.loc['Score'][1001:1500] = scores10

In [54]:
# Save updated sentiment analysis results
essays_sentiment = essays.to_csv('essays_sentiment.csv', index=False)