<h1> Sentence-level Sentiment Analysis

<h4> Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
import re

import warnings
# Silence every warning category for the rest of the session; note that this
# also hides deprecation/runtime warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

from wordcloud import STOPWORDS
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, RobertaTokenizer, RobertaForSequenceClassification, pipeline

# Do not truncate column contents when DataFrames are displayed, so whole
# tweets stay readable in the rendered tables.
pd.set_option('display.max_colwidth', None)




<h4> Loading raw dataset

In [2]:
# Raw dataset, kept untouched; all cleaning happens on a copy.
df = pd.read_csv(filepath_or_buffer='../data/(A) data.csv')

In [3]:
# Work on an independent (deep) copy so the raw frame `df` stays intact.
df_clean = df.copy(deep=True)

<h4> Indexing

In [4]:
# Move tweet_id out of the columns and into the index.
df_clean = df_clean.set_index('tweet_id')

<h4> Dropping unused columns

In [5]:
# Columns that are not used in the rest of the notebook.
df_clean = df_clean.drop(columns=[
    'airline_sentiment_gold',
    'name',
    'negativereason_gold',
    'tweet_coord',
    'tweet_created',
    'tweet_location',
    'user_timezone',
])

<h4> Renaming columns

In [6]:
# Prefix the label columns shipped with the dataset with 'given_' and call the
# text column 'tweet'.
df_clean = df_clean.rename(columns={
    'airline_sentiment': 'given_sentiment',
    'airline_sentiment_confidence': 'given_sentiment_confidence',
    'negativereason': 'given_negative_reason',
    'negativereason_confidence': 'given_negative_reason_confidence',
    'text': 'tweet',
})

<h4> Dropping duplicated rows

In [7]:
# drop_duplicates compares column values only, so the index is ignored here.
df_clean = df_clean.drop_duplicates()

<h4> Dropping index

In [8]:
# Discard the current index and fall back to a fresh 0..n-1 range index.
df_clean = df_clean.reset_index(drop=True)

<h4> Type Conversion

In [9]:
# Convert the text-like columns to pandas' dedicated 'string' dtype in one call.
df_clean = df_clean.astype({
    'given_sentiment': 'string',
    'given_negative_reason': 'string',
    'airline': 'string',
    'tweet': 'string',
})

<h4> Preserving letters

In [10]:
# Keep only ASCII letters and whitespace; digits, punctuation and symbols are removed.
df_clean['tweet'] = df_clean['tweet'].map(
    lambda raw_tweet: re.sub(r'[^a-zA-Z\s]', '', raw_tweet)
)

<h4> Decapitalization (lowercasing)

In [11]:
# Lowercase every tweet.
df_clean = df_clean.assign(tweet=df_clean['tweet'].str.lower())

<h4> Removing stopwords

In [12]:
# wordcloud's stop-word list plus two extra words added for this dataset.
stopwords = set(STOPWORDS) | {'flight', 'will'}

def remove_stop_words(text):
    """Return `text` with stop words removed.

    The text is split with `word_tokenize`; a token is dropped when its
    lowercase form is in the module-level `stopwords` set. Remaining tokens
    are joined back together with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

# Strip stop words from every tweet (element-wise).
df_clean['tweet'] = df_clean['tweet'].map(remove_stop_words)

<h4> Lemmatization

In [13]:
# Single shared lemmatizer instance; lemmatize_text() calls it for every token.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma.

    Tokens come from `word_tokenize`; each one is passed to
    `lemmatizer.lemmatize` with no extra arguments, and the results are
    joined with single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Lemmatize every tweet (element-wise).
df_clean['tweet'] = df_clean['tweet'].map(lemmatize_text)

<h4> SentimentIntensityAnalyzer

In [14]:
# Single shared VADER analyzer instance used by sia_sentiment() below.
sia = SentimentIntensityAnalyzer()

def sia_sentiment(text):
    """Label `text` using the polarity scores returned by `sia`.

    The compound score decides the label: >= 0.05 is 'positive', <= -0.05 is
    'negative', anything in between is 'neutral'. The value reported as
    confidence is the matching 'pos' / 'neg' / 'neu' entry of the same
    score dictionary.

    Returns a two-element pd.Series: [label, confidence].
    """
    polarity = sia.polarity_scores(text)
    compound = polarity['compound']

    if compound >= 0.05:
        label, score_key = 'positive', 'pos'
    elif compound <= -0.05:
        label, score_key = 'negative', 'neg'
    else:
        label, score_key = 'neutral', 'neu'

    return pd.Series([label, polarity[score_key]])

# sia_sentiment returns a two-element Series per tweet, so apply() produces a
# two-column frame that is assigned to the two new columns in order.
df_clean[['sia_sentiment', 'sia_sentiment_confidence']] = df_clean['tweet'].apply(sia_sentiment)

<h4> BERT

In [15]:
# Load the BERTweet sentiment checkpoint and wrap it in a classification pipeline.
bert_model_name = "finiteautomata/bertweet-base-sentiment-analysis"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_name)
bert_sentiment_model = pipeline(
    task="sentiment-analysis",
    tokenizer=bert_tokenizer,
    model=bert_model,
)

# Maps the short labels used by bert_sentiment() to the label names used
# elsewhere in this notebook.
bert_label_mapping = dict(POS='positive', NEG='negative', NEU='neutral')

def bert_sentiment(text):
    """Classify one tweet with `bert_sentiment_model`.

    Parameters
    ----------
    text : str
        Tweet text to classify.

    Returns
    -------
    pd.Series
        Two values: the sentiment label (translated through
        `bert_label_mapping` and lowercased) and the score the pipeline
        reported for that label.
    """
    # truncation=True is forwarded to the tokenizer so an input longer than the
    # model's maximum sequence length is cut instead of raising.
    result = bert_sentiment_model(text, truncation=True)[0]
    # Labels missing from the mapping pass through unchanged (only lowercased).
    sentiment = bert_label_mapping.get(result['label'], result['label']).lower()
    confidence = result['score']
    return pd.Series([sentiment, confidence])

# Calls the BERT pipeline once per tweet; the two-element Series returned for
# each row fills the two new columns in order.
df_clean[['bert_sentiment', 'bert_sentiment_confidence']] = df_clean['tweet'].apply(bert_sentiment)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


<h4> roBERTa

In [16]:
# Load the Twitter roBERTa sentiment checkpoint and wrap it in a classification pipeline.
roberta_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_model_name)
roberta_model = RobertaForSequenceClassification.from_pretrained(roberta_model_name)
roberta_sentiment_model = pipeline(
    task="sentiment-analysis",
    tokenizer=roberta_tokenizer,
    model=roberta_model,
)

def roberta_sentiment(text):
    """Classify one tweet with `roberta_sentiment_model`.

    Parameters
    ----------
    text : str
        Tweet text to classify.

    Returns
    -------
    pd.Series
        Two values: the lowercased sentiment label and the score the
        pipeline reported for that label.
    """
    # truncation=True is forwarded to the tokenizer so an over-length input is
    # cut instead of raising.
    result = roberta_sentiment_model(text, truncation=True)[0]
    # Lowercase the label so it matches bert_sentiment's output and the
    # lowercase label keys that weighted_ensemble indexes by.
    sentiment = result['label'].lower()
    confidence = result['score']
    return pd.Series([sentiment, confidence])

# Calls the roBERTa pipeline once per tweet; the two-element Series returned
# for each row fills the two new columns in order.
df_clean[['roberta_sentiment', 'roberta_sentiment_confidence']] = df_clean['tweet'].apply(roberta_sentiment)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<h4> Multilingual distilBERT

In [17]:
# Load the multilingual distilBERT sentiment checkpoint and wrap it in a classification pipeline.
distilbert_model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
distilbert_tokenizer = AutoTokenizer.from_pretrained(distilbert_model_name)
distilbert_model = AutoModelForSequenceClassification.from_pretrained(distilbert_model_name)
distilbert_sentiment_model = pipeline(
    task="sentiment-analysis",
    tokenizer=distilbert_tokenizer,
    model=distilbert_model,
)

def distilbert_sentiment(text):
    """Classify one tweet with `distilbert_sentiment_model`.

    Parameters
    ----------
    text : str
        Tweet text to classify.

    Returns
    -------
    pd.Series
        Two values: the lowercased sentiment label and the score the
        pipeline reported for that label.
    """
    # truncation=True is forwarded to the tokenizer so an over-length input is
    # cut instead of raising.
    result = distilbert_sentiment_model(text, truncation=True)[0]
    # Lowercase the label so all model columns share one label vocabulary,
    # as bert_sentiment already does.
    sentiment = result['label'].lower()
    confidence = result['score']
    return pd.Series([sentiment, confidence])

# Calls the distilBERT pipeline once per tweet; the two-element Series
# returned for each row fills the two new columns in order.
df_clean[['distilbert_sentiment', 'distilbert_sentiment_confidence']] = df_clean['tweet'].apply(distilbert_sentiment)

<h4> Comparison of Sentiment Confidence

In [18]:
# Summary statistics of the confidence reported by each labelling source.
df_clean.loc[:, [
    'given_sentiment_confidence',
    'sia_sentiment_confidence',
    'bert_sentiment_confidence',
    'roberta_sentiment_confidence',
    'distilbert_sentiment_confidence',
]].describe()

Unnamed: 0,given_sentiment_confidence,sia_sentiment_confidence,bert_sentiment_confidence,roberta_sentiment_confidence,distilbert_sentiment_confidence
count,14584.0,14584.0,14584.0,14584.0,14584.0
mean,0.899786,0.482629,0.874637,0.779011,0.575477
std,0.163025,0.288256,0.13526,0.139074,0.152964
min,0.335,0.0,0.351074,0.347661,0.335754
25%,0.6922,0.25775,0.816801,0.676624,0.455013
50%,1.0,0.384,0.940087,0.812208,0.531459
75%,1.0,0.636,0.971652,0.894637,0.672984
max,1.0,1.0,0.993168,0.989277,0.986638


-> None of the approaches tried yields a higher mean sentiment confidence than the given labels.

-> BERT's mean confidence (0.875) is quite close to that of the given labels (0.900), assuming the given labels also come from a model.

<h4> Ensemble (0.5*Given + 0.3*BERT + 0.2*roBERTa)

In [20]:
# Relative weight of each labelling source in the ensemble vote.
weights = dict(given=0.5, bert=0.3, roberta=0.2)

def weighted_ensemble(row):
    """Combine the given, BERT and roBERTa labels of one row by weighted vote.

    Each source adds `weight * confidence` to the tally of the label it
    predicted; the label with the largest tally wins (ties go to the label
    listed first: positive, neutral, negative).

    The returned confidence is NOT the weighted tally: it is the highest
    individual confidence among the sources that agree with the winning
    label (0 if none agrees).

    Returns a two-element pd.Series: [winning label, confidence].
    """
    # (weight key, label column, confidence column) for each source, in vote order.
    sources = (
        ('given', 'given_sentiment', 'given_sentiment_confidence'),
        ('bert', 'bert_sentiment', 'bert_sentiment_confidence'),
        ('roberta', 'roberta_sentiment', 'roberta_sentiment_confidence'),
    )

    tally = dict.fromkeys(('positive', 'neutral', 'negative'), 0)
    for weight_key, label_col, conf_col in sources:
        tally[row[label_col]] += weights[weight_key] * row[conf_col]

    winner = max(tally, key=tally.get)

    # Sources that disagree with the winner contribute 0.
    best_agreeing = max(
        row[conf_col] if row[label_col] == winner else 0
        for _, label_col, conf_col in sources
    )

    return pd.Series([winner, best_agreeing])

# Row-wise vote (axis=1 passes each row to weighted_ensemble); the two-element
# Series returned for each row fills the two new columns in order.
df_clean[['ensemble_sentiment', 'ensemble_sentiment_confidence']] = df_clean.apply(weighted_ensemble, axis=1)

In [21]:
# Compare the confidence distribution of the given labels with the ensemble's.
df_clean.loc[:, [
    'given_sentiment_confidence',
    'ensemble_sentiment_confidence',
]].describe()

Unnamed: 0,given_sentiment_confidence,ensemble_sentiment_confidence
count,14584.0,14584.0
mean,0.899786,0.959044
std,0.163025,0.090851
min,0.335,0.3503
25%,0.6922,0.971787
50%,1.0,1.0
75%,1.0,1.0
max,1.0,1.0


In [22]:
# Rows where the ensemble label differs from the given label.
df_clean.loc[
    df_clean['given_sentiment'] != df_clean['ensemble_sentiment'],
    ['tweet', 'given_sentiment', 'ensemble_sentiment'],
]

Unnamed: 0,tweet,given_sentiment,ensemble_sentiment
1,virginamerica plus youve added commercial experience tacky,positive,negative
2,virginamerica didnt today must mean need take another trip,neutral,negative
6,virginamerica yes nearly every time fly vx ear worm wont go away,positive,negative
7,virginamerica really missed prime opportunity men without hat parody httpstcomwpggrezp,neutral,negative
10,virginamerica know suicide second leading cause death among teen,neutral,negative
...,...,...,...
14509,americanair cant ahold aadvantage reservation need ticket reservation cancelled flight soon help,neutral,negative
14531,americanair ill play ear know best buy chewey oatmeal cooky customer care folk,negative,neutral
14539,americanair shannonbloom wheres dm wheres voucher who paying cab car back jfk tomorrow,negative,neutral
14576,americanair tilleymonsta george doesnt look good please follow link start refund process httptcogrsdl,neutral,negative


<h4> Save pre-processed dataset

In [25]:
# Persist every column (all model outputs included), without the index.
df_clean.to_csv(
    path_or_buf='../data/(B) sentence_level_full_data.csv',
    index=False,
)

In [24]:
# Final dataset: drop the given labels and every per-model output column,
# leaving the ensemble columns alongside the remaining ones.
df_final = df_clean.drop(columns=[
    'given_sentiment', 'given_sentiment_confidence',
    'sia_sentiment', 'sia_sentiment_confidence',
    'bert_sentiment', 'bert_sentiment_confidence',
    'roberta_sentiment', 'roberta_sentiment_confidence',
    'distilbert_sentiment', 'distilbert_sentiment_confidence',
])
df_final.to_csv(
    path_or_buf='../data/(C) sentence_level_final_data.csv',
    index=False,
)

In [None]:
# Two-column (text, label) extract written out for classification.
df_class = df_final.loc[:, ['tweet', 'ensemble_sentiment']]
df_class.to_csv(
    path_or_buf='../data/(D) classification.csv',
    index=False,
)