In [16]:
import pandas as pd
from sklearn.metrics import accuracy_score
import spacy
from afinn import Afinn
import time
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn

# # Ensure you have the necessary NLTK datasets downloaded
# nltk.download('sentiwordnet')
# nltk.download('omw-1.4')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')
# nltk.download('punkt')  # For tokenization
# nltk.download('averaged_perceptron_tagger')  # For POS tagging

!python3 -m nltk.downloader -d /Users/moiz/nltk_data sentiwordnet
!python3 -m nltk.downloader -d /Users/moiz/nltk_data omw-1.4
!python3 -m nltk.downloader -d /Users/moiz/nltk_data wordnet
!python3 -m nltk.downloader -d /Users/moiz/nltk_data vader_lexicon
!python3 -m nltk.downloader -d /Users/moiz/nltk_data punkt
!python3 -m nltk.downloader -d /Users/moiz/nltk_data averaged_perceptron_tagger

nltk.data.path.append('/Users/moiz/nltk_data')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/moiz/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/moiz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/moiz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/moiz/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/moiz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/moiz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [17]:
# Load spacy's English language model
nlp = spacy.load('en_core_web_sm')

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    doc = nlp(text)
    clean_tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    return ' '.join(clean_tokens)

# Load dataset
train_df = pd.read_csv('../Data/train.csv')
test_df = pd.read_csv('../Data/test.csv')

# Convert sentiments to numerical format
y_train = train_df['sentiment'].map({'positive': 2, 'negative': 1}).astype(int)
y_test = test_df['sentiment'].map({'positive': 2, 'negative': 1}).astype(int)

# Define X_test
X_test = test_df['review'] 

# Improved SentiWordNet analysis considering POS tags
def get_wordnet_pos(spacy_token):
    if spacy_token.pos_ in ('NOUN', 'PROPN'):
        return wn.NOUN
    elif spacy_token.pos_ == 'VERB':
        return wn.VERB
    elif spacy_token.pos_ == 'ADJ':
        return wn.ADJ
    elif spacy_token.pos_ == 'ADV':
        return wn.ADV
    else: 
        return None

In [18]:
def sentiwordnet_sentiment(review):
    review = preprocess_text(review)
    tokens = nlp(review)
    pos_score = neg_score = 0
    for token in tokens:
        wn_tag = get_wordnet_pos(token)
        if wn_tag:
            synsets = list(swn.senti_synsets(token.text, wn_tag))
            if synsets:
                synset = synsets[0]
                pos_score += synset.pos_score()
                neg_score += synset.neg_score()
    return 2 if pos_score > neg_score else 1

# Afinn sentiment analysis
afn = Afinn()
def afinn_sentiment(review):
    review = preprocess_text(review)
    score = afn.score(review)
    return 2 if score > 0 else 1

# VADER sentiment analysis with adjusted threshold
analyzer = SentimentIntensityAnalyzer()
def vader_sentiment(review):
    review = preprocess_text(review)
    scores = analyzer.polarity_scores(review)
    return 2 if scores['compound'] > 0.05 else 1

In [19]:
# Applying SentiWordNet sentiment analysis
start = time.time()
y_pred_swn = [sentiwordnet_sentiment(review) for review in X_test]
end = time.time()
accuracy_swn = accuracy_score(y_test, y_pred_swn)
print(f'SentiWordNet Accuracy: {accuracy_swn}')
print(f'SentiWordNet Time: {end-start} seconds')

SentiWordNet Accuracy: 0.66455
SentiWordNet Time: 1331.7760210037231 seconds


In [20]:
# Applying Afinn sentiment analysis
start = time.time()
y_pred_afn = [afinn_sentiment(review) for review in X_test]
end = time.time()
accuracy_afn = accuracy_score(y_test, y_pred_afn)
print(f'Afinn Accuracy: {accuracy_afn}')
print(f'Afinn Time: {end-start} seconds')

Afinn Accuracy: 0.7148
Afinn Time: 985.3021380901337 seconds


In [21]:
# Applying VADER sentiment analysis
start = time.time()
y_pred_vader = [vader_sentiment(review) for review in X_test]
end = time.time()
accuracy_vader = accuracy_score(y_test, y_pred_vader)
print(f'VADER Accuracy: {accuracy_vader}')
print(f'VADER Time: {end-start} seconds')

VADER Accuracy: 0.6772
VADER Time: 935.612398147583 seconds
