# Sentiment analysis with openAI

Using the openai API to perform sentiment analysis on a dataset.

Written by Luc Bijl.

Importing relevant packages and retrieving openAI api key from credentials file.

In [1]:
import os
import random
import re
import openai
import pandas as pd

openai_key = None

with open("../.credentials", "r") as file:
    for line in file:
        if 'openai-key' in line:
            openai_key = line.split('openai-key=')[1].strip()
            break

if openai_key is not None:
    openai.api_key = openai_key
else:
    print('Could not retrieve openAI key, do you have it in your credentials file?')

Retrieving IMDB testing dataset and taking a sample.

In [2]:
test_dataset = "../datasets/aciimdb/test"

reviews = []
labels = []

for sentiment in ['pos','neg']:
    sentiment_dir = os.path.join(test_dataset,sentiment)

    for filename in os.listdir(sentiment_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(sentiment_dir,filename),'r',encoding='utf-8') as file:
                review = file.read()
                sentiment_score = int(filename[:-4].split('_')[1])

                labels.append(sentiment_score)
                reviews.append(review)

data = {'Review': reviews, 'Sentiment': labels}
df_data = pd.DataFrame(data)

df_data

Unnamed: 0,Review,Sentiment
0,Alex North (John Cassavetes) has problems in r...,7
1,"I won't go to a generalization, and say it's t...",10
2,Movie about two Australian girls--Debbie (Nell...,7
3,A bland title disguises this solidly-carpenter...,7
4,"I was laying in bed, flicking through the chan...",8
...,...,...
24995,The first 2/3 of this film wasn't that dissimi...,2
24996,the movie is simply horrible (2/10). Although ...,2
24997,I don't recommend watching this movie. It's a ...,1
24998,*** Possable spoiler but probably not ***<br /...,3


Creating a testing sample.

In [3]:
samples = 50
df_test = df_data.sample(n=samples,random_state=42)
df_test.reset_index(drop=True,inplace=True)

df_test.head()

Unnamed: 0,Review,Sentiment
0,"I can not say this movie was a hilarious, but ...",7
1,How do stories this bad get made. That's not a...,3
2,The most beautiful film. If one is looking for...,10
3,This film was really terrible.<br /><br />Howe...,1
4,"Writer & director Jay Andrews, a.k.a. Jim Wyno...",1


Normalizing the sentiment to a range of -1 to 1.

In [4]:
def normalize(n):
    normal_n = (n - 5) / 5
    return normal_n

df_test['Normal sentiment'] = normalize(df_test['Sentiment'])

df_test.head()

Unnamed: 0,Review,Sentiment,Normal sentiment
0,"I can not say this movie was a hilarious, but ...",7,0.4
1,How do stories this bad get made. That's not a...,3,-0.4
2,The most beautiful film. If one is looking for...,10,1.0
3,This film was really terrible.<br /><br />Howe...,1,-0.8
4,"Writer & director Jay Andrews, a.k.a. Jim Wyno...",1,-0.8


Performing sentiment analysis.

In [11]:
def analyze_sentiment(text):
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=f"Please provide only a sentiment score between -1 (most negative) and 1 (most positive) for the following text: '{text}'",
        temperature=0.5,
        max_tokens=10
    )
    
    sentiment_score_match = re.search(r'-?\d+(\.\d+)?', response.choices[0].text)
    
    if sentiment_score_match:
        sentiment_score = float(sentiment_score_match.group())

        if sentiment_score >= -1 and sentiment_score <= 1:
            return sentiment_score
        else:
            return None

    else:
         return None

df_test['AI sentiment'] = df_test['Review'].apply(analyze_sentiment)

df_test.head()

Unnamed: 0,Review,Sentiment,Normal sentiment,AI sentiment
0,"I can not say this movie was a hilarious, but ...",7,0.4,0.6
1,How do stories this bad get made. That's not a...,3,-0.4,-0.8
2,The most beautiful film. If one is looking for...,10,1.0,1.0
3,This film was really terrible.<br /><br />Howe...,1,-0.8,0.5
4,"Writer & director Jay Andrews, a.k.a. Jim Wyno...",1,-0.8,-1.0


In [12]:
df_test

Unnamed: 0,Review,Sentiment,Normal sentiment,AI sentiment
0,"I can not say this movie was a hilarious, but ...",7,0.4,0.6
1,How do stories this bad get made. That's not a...,3,-0.4,-0.8
2,The most beautiful film. If one is looking for...,10,1.0,1.0
3,This film was really terrible.<br /><br />Howe...,1,-0.8,0.5
4,"Writer & director Jay Andrews, a.k.a. Jim Wyno...",1,-0.8,-1.0
5,I think together all the reviewers have captur...,10,1.0,1.0
6,I came across this movie while channel surfing...,9,0.8,0.75
7,I initially tuned in to Paranormal State becau...,1,-0.8,-1.0
8,"Like the first one,the team of JACKASS are bac...",7,0.4,1.0
9,The Coen's strike again. I had no presuppositi...,8,0.6,0.8


Determining accuracy of the sentiment analysis.

In [13]:
from scipy.stats import pearsonr

df_test_clean = df_test.dropna()

correlation, _ = pearsonr(df_test_clean['AI sentiment'],df_test_clean['Normal sentiment'])

print(f"Sample correlation: {correlation:.2f}")

Sample correlation: 0.87
