In [1]:
# The sentiment analysis dataset is taken from kaggle. You can find it here https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset
import pandas as pd 
import numpy as np
data = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv',encoding='ISO-8859-1')
data.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [2]:
data = data[['text', 'sentiment']]
_neutral_ = data[(data['sentiment']=='neutral')].index
data.drop(_neutral_, inplace=True)
data['sentiment'].value_counts()

positive    8582
negative    7781
Name: sentiment, dtype: int64

In [3]:
data.loc[data['sentiment']=='positive', 'sentiment_score'] = 1
data.loc[data['sentiment']=='negative', 'sentiment_score'] = 0
data.head()

Unnamed: 0,text,sentiment,sentiment_score
1,Sooo SAD I will miss you here in San Diego!!!,negative,0.0
2,my boss is bullying me...,negative,0.0
3,what interview! leave me alone,negative,0.0
4,"Sons of ****, why couldn`t they put them on t...",negative,0.0
6,2am feedings for the baby are fun when he is a...,positive,1.0


## Preprocessing

In [4]:
import re
def trim_length(text):
    '''
    This method removes the repeating characters that are repeated more than 2 times
    '''
    pat = re.compile(r"(.)\1{2,}")
    return pat.sub(r"\1\1", text)

def word_correct(myStr):
    '''
    This method accepts a string and removes the meta characters from it
    '''
    myStr = str(myStr)
    pat = re.compile(r'[^a-zA-Z1-9]+')
    corrected_str = ''
    splits = myStr.split()
    for word in splits:
        word = word.strip()
        word = re.sub(pat, '', word).lower()
        word = trim_length(word)
        corrected_str = corrected_str+word+' '
    corrected_str = corrected_str.strip()
    return corrected_str

In [5]:
data['text'] = data['text'].apply(lambda i:word_correct(i))

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def get_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
              "N": wordnet.NOUN,
              "V": wordnet.VERB,
              "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_word(myStr):
    myStr = str(myStr)
    lemmatizer = WordNetLemmatizer()
    lst_word = nltk.word_tokenize(myStr)
    final_str=''
    for word in lst_word:
        word = lemmatizer.lemmatize(word, get_pos(word))
        final_str = final_str+word+' '

    final_str = final_str.strip()
    return final_str

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
data['text'] = data['text'].apply(lambda i: lemmatize_word(i))

In [8]:
data = data.replace(r'^s*$', float('NaN'), regex=True)
data = data.dropna()
X = data['text']
y = data['sentiment_score']
print(X.isna().sum())
print(y.isna().sum())

0
0


## VADER

In [9]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def vader_sentiment_result(data):
    data = str(data)
    scores = analyzer.polarity_scores(data)
    if scores["neg"] > scores["pos"]:
        return 0
    else:
        return 1

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [10]:
score_vader = X.apply(lambda i: vader_sentiment_result(i))

In [11]:
score_vader.value_counts()

1    11073
0     5289
Name: text, dtype: int64

## Flair

In [12]:
!pip install flair

[0m

In [13]:
from flair.models import TextClassifier
from flair.data import Sentence
classifier = TextClassifier.load('en-sentiment')

def flair_sentiment_score(data):
    data = str(data)
    data = Sentence(data)
    classifier.predict(data)
    if(data.labels[0].to_dict()['value']=='NEGATIVE'):
        return 0
    else:
        return 1

2023-01-21 13:36:43,048 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


In [14]:
score_flair = X.apply(lambda i:flair_sentiment_score(i))

In [15]:
score_flair.value_counts()

1    8454
0    7908
Name: text, dtype: int64

## Comparison

In [17]:
from sklearn.metrics import accuracy_score
accu_score_vader = accuracy_score(y, score_vader)
print("Accuracy of VADER:", accu_score_vader)
accu_score_flair = accuracy_score(y, score_flair)
print("Accuracy of Flair:", accu_score_flair)

Accuracy of VADER: 0.7934849040459602
Accuracy of Flair: 0.8079696858574746
