# Sentiment analysis with VADER

Using the VADER sentiment analysis tool on a dataset.

Written by Luc Bijl.

Importing relevant packages and retrieving IMDB testing dataset and taking a sample.

In [1]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os
import random
import re
import nltk
import pandas as pd

test_dataset = "../datasets/aciimdb/test"

reviews = []
labels = []

for sentiment in ['pos','neg']:
    sentiment_dir = os.path.join(test_dataset,sentiment)

    for filename in os.listdir(sentiment_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(sentiment_dir,filename),'r',encoding='utf-8') as file:
                review = file.read()
                sentiment_score = int(filename[:-4].split('_')[1])

                labels.append(sentiment_score)
                reviews.append(review)

data = {'Review': reviews, 'Sentiment': labels}
df_data = pd.DataFrame(data)

df_data

Unnamed: 0,Review,Sentiment
0,Alex North (John Cassavetes) has problems in r...,7
1,"I won't go to a generalization, and say it's t...",10
2,Movie about two Australian girls--Debbie (Nell...,7
3,A bland title disguises this solidly-carpenter...,7
4,"I was laying in bed, flicking through the chan...",8
...,...,...
24995,The first 2/3 of this film wasn't that dissimi...,2
24996,the movie is simply horrible (2/10). Although ...,2
24997,I don't recommend watching this movie. It's a ...,1
24998,*** Possable spoiler but probably not ***<br /...,3


Creating a testing sample and normalizing the sentiment to a range of -1 to 1.

In [2]:
samples = 50
df_test = df_data.sample(n=samples,random_state=42)
df_test.reset_index(drop=True,inplace=True)

def normalize(n):
    normal_n = (n - 5) / 5
    return normal_n

df_test['Normal sentiment'] = normalize(df_test['Sentiment'])

df_test.head()

Unnamed: 0,Review,Sentiment,Normal sentiment
0,"I can not say this movie was a hilarious, but ...",7,0.4
1,How do stories this bad get made. That's not a...,3,-0.4
2,The most beautiful film. If one is looking for...,10,1.0
3,This film was really terrible.<br /><br />Howe...,1,-0.8
4,"Writer & director Jay Andrews, a.k.a. Jim Wyno...",1,-0.8


Performing sentiment analysis.

In [3]:
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment['compound']

df_test['VADER sentiment'] = df_test['Review'].apply(analyze_sentiment)

df_test.head()

Unnamed: 0,Review,Sentiment,Normal sentiment,VADER sentiment
0,"I can not say this movie was a hilarious, but ...",7,0.4,0.8321
1,How do stories this bad get made. That's not a...,3,-0.4,-0.9664
2,The most beautiful film. If one is looking for...,10,1.0,0.9944
3,This film was really terrible.<br /><br />Howe...,1,-0.8,0.9646
4,"Writer & director Jay Andrews, a.k.a. Jim Wyno...",1,-0.8,0.9853


In [4]:
df_test

Unnamed: 0,Review,Sentiment,Normal sentiment,VADER sentiment
0,"I can not say this movie was a hilarious, but ...",7,0.4,0.8321
1,How do stories this bad get made. That's not a...,3,-0.4,-0.9664
2,The most beautiful film. If one is looking for...,10,1.0,0.9944
3,This film was really terrible.<br /><br />Howe...,1,-0.8,0.9646
4,"Writer & director Jay Andrews, a.k.a. Jim Wyno...",1,-0.8,0.9853
5,I think together all the reviewers have captur...,10,1.0,0.9829
6,I came across this movie while channel surfing...,9,0.8,0.974
7,I initially tuned in to Paranormal State becau...,1,-0.8,-0.8487
8,"Like the first one,the team of JACKASS are bac...",7,0.4,-0.8377
9,The Coen's strike again. I had no presuppositi...,8,0.6,-0.0516


In [5]:
from scipy.stats import pearsonr


correlation, _ = pearsonr(df_test['VADER sentiment'],df_test['Normal sentiment'])

print(f"Sample correlation: {correlation:.2f}")

Sample correlation: 0.46
