## Import Libraries

In [1]:
## Pre-processing Packages
import pandas as pd
import re
import nltk
import nlpaug.augmenter.word as naw
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# download necessary NLTK data (only need to run this once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/nnerella/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nnerella/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nnerella/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score,KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

# FOR VADER
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nnerella/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Import Data and Pre-Processing (DON'T RUN IT)

In [10]:
# load the data from the csv file
train_data = pd.read_csv("../../data/raw/reviews.csv")
# train_label = train_data["Sentiment"]
original_data = pd.DataFrame(train_data["Text"])

# define a function to preprocess the text data
def preprocess_text(text):
    # convert to lowercase
    text = text.lower()
    # remove non-alphabetic characters
    text = re.sub(r'[^a-z]', ' ', text)
    # tokenize the text into words
    tokens = word_tokenize(text)
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

# remove the html symbol
def remove_html(text):
    regex = r"<[^>]+>"
    text_new = re.sub(regex, " ", text)
    return text_new

# apply the preprocessing function to the text data
train_data['Text'] = train_data['Text'].apply(remove_html)
train_data['Text'] = train_data['Text'].apply(preprocess_text)

# define an NLPAug data augmentation function
def augment_text(text):
    # define an augmentation method
    aug = naw.SynonymAug(aug_src='wordnet', lang='eng')
    # apply the augmentation method to the text
    augmented_text = aug.augment(text)
    return augmented_text

# apply the augmentation function to the preprocessed text data
train_data['Text'] = train_data['Text'].apply(augment_text)

# over sampling
ros = RandomOverSampler(sampling_strategy='minority')
X = train_data['Text'].values.reshape(-1, 1)
y = train_data['Sentiment']
X_resampled, y_resampled = ros.fit_resample(X, y)
train_data = pd.DataFrame({'Text': X_resampled.ravel(), 'Sentiment': y_resampled})

# save the data to a new csv file
train_data.to_csv("oversampling_reviews.csv", index=False)

## Features
# train a Word2Vec model on the preprocessed text data
word2vec_model = Word2Vec(train_data['Text'], min_count=1)

# create a function to generate the word embedding vectors for each sentence
def generate_word_embedding(sentence):
    # initialize an empty array for the sentence vector
    sentence_vector = []
    # loop through each word in the sentence
    for word in sentence:
        try:
            # add the vector representation of the word to the sentence vector
            word_vector = word2vec_model.wv[word]
            sentence_vector.append(word_vector)
        except KeyError:
            # ignore words that are not in the vocabulary
            pass
    # take the mean of the word vectors to get the sentence vector
    sentence_vector = np.mean(sentence_vector, axis=0)
    return sentence_vector

# apply the generate_word_embedding() function to the preprocessed text data
train_data['embedding'] = train_data['Text'].apply(generate_word_embedding)

# create a new DataFrame for the feature matrix
embedding_size = word2vec_model.vector_size
features_df = pd.DataFrame(train_data['embedding'].tolist(), columns=[f'embedding_{i}' for i in range(embedding_size)])

# perform PCA with n_components set to retain 98% of variance
pca_emb = PCA(n_components=0.98)
features_emb_pca = pca_emb.fit_transform(features_df)

# create a new DataFrame for the PCA features
pca_emb_cols = [f"PC_emb{i+1}" for i in range(features_emb_pca.shape[1])]
pca_df_emb = pd.DataFrame(features_emb_pca, columns=pca_emb_cols)


# create a TF-IDF vectorizer object
tfidf_vectorizer = TfidfVectorizer()

# fit and transform the vectorizer on the preprocessed text data
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['Text'].apply(lambda x: ' '.join(x)))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_features_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_feature_names)
## standardize the features
#scaler = StandardScaler()
#features_std = scaler.fit_transform(features)

# perform PCA with n_components set to retain 95% of variance
pca = PCA(n_components=0.95)
features_tfidf_pca = pca.fit_transform(tfidf_features_df)

# create a new DataFrame for the PCA features
pca_tfidf_cols = [f"PC_tfidf{i+1}" for i in range(features_tfidf_pca.shape[1])]
pca_df_tfidf = pd.DataFrame(features_tfidf_pca, columns=pca_tfidf_cols)

# add the TF-IDF features to the feature matrix DataFrame
features_df = pd.concat([pca_df_tfidf, pca_df_emb], axis=1)


# add the number of characters, number of words, and number of capital characters as features
features_df['num_characters'] = train_data['Text'].apply(lambda x: len(' '.join(x)))
features_df['num_words'] = train_data['Text'].apply(lambda x: len(x))

# add the common features from the features.csv file
features_df['num_sentences'] = original_data["Text"].apply(lambda s: s.count('.'))
features_df['num_question_marks'] = original_data["Text"].apply(lambda s: s.count('?'))
features_df['num_exclamation_marks'] = original_data["Text"].apply(lambda s: s.count('!'))
features_df['num_unique_words'] = train_data["Text"].apply(lambda x: len(set(x)))


# add the label column to the feature matrix DataFrame
label = features_df.columns
features_df['Sentiment'] = train_data['Sentiment']

# weight the negative sentiment samples by 1.5
features_df.loc[features_df['Sentiment'] == 'negative',label] *= 2


# save the feature matrix to a CSV file
# pca_df_emb.to_csv("pca_df_emb.csv", index=False)
# pca_df_tfidf.to_csv("pca_df_tfidf.csv", index=False)
features_df.to_csv("features.csv", index=False)

## Train-Test Split

In [3]:
# Features obtained after over-sampling and PCA
features_df = pd.read_csv("features.csv")

In [4]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8060 entries, 0 to 8059
Columns: 3340 entries, PC_tfidf1 to Sentiment
dtypes: float64(3336), int64(3), object(1)
memory usage: 205.4+ MB


In [5]:
label_map = {
    'positive': 1,
    'negative': 0,
}

features_df['sentiment_label'] = features_df['Sentiment'].map(label_map)

# X
train_cols = features_df.iloc[: , :-7]
# y
label_col = features_df['sentiment_label']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(train_cols, label_col, test_size=0.3)

## Baseline: Naive Bayes' Classifier

In [7]:
NB = GaussianNB()
kf = KFold(n_splits=5)
score = cross_val_score(NB, train_cols, label_col,cv=kf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

Cross Validation Scores are [0.99937965 0.99813896 0.98511166 0.99441687 1.        ]
Average Cross Validation score :0.9954094292803969


In [8]:
NB.fit(X_train, y_train)
NB_pred= NB.predict(X_test)
#print(NB_pred)

print(accuracy_score(y_test, NB_pred))

0.9971050454921423


## VADER(Valence Aware Dictionary for Sentiment Reasoning)

#### Links for Reference
1. https://www.analyticsvidhya.com/blog/2022/10/sentiment-analysis-using-vader/#:~:text=VADER(%20Valence%20Aware%20Dictionary%20for,as%20either%20positive%20or%20negative.

2. https://stackoverflow.com/questions/45296897/is-there-a-way-to-improve-performance-of-nltk-sentiment-vader-sentiment-analyser

In [3]:
reviews_csv = pd.read_csv('../../data/raw/reviews.csv')

In [4]:
# Convert Sentiment into factor
# Positive: 1, Negative: 0

label_map = {
    'positive': 1,
    'negative': 0,
}
reviews_csv['sentiment_label'] = reviews_csv['Sentiment'].map(label_map)

#### Note: 
1. nltk+vader already do basic pre-processing (such as removing stop-words etc..)
2. Do not remove punctuation, as it helps improve score (Ex. ! -> +ve)

In [5]:
# Remove HTML characters
def remove_html(text):
    regex = r"<[^>]+>"
    text_new = re.sub(regex, " ", text)
    return text_new

reviews_csv['VADER_processed_Text'] = reviews_csv['Text'].apply(lambda para: remove_html(para))

In [6]:
reviews_csv

Unnamed: 0,Sentiment,Time,Text,sentiment_label,VADER_processed_Text
0,positive,18/6/21,This is a very healthy dog food. Good for thei...,1,This is a very healthy dog food. Good for thei...
1,positive,7/7/21,I've been very pleased with the Natural Balanc...,1,I've been very pleased with the Natural Balanc...
2,positive,18/6/21,"Before I was educated about feline nutrition, ...",1,"Before I was educated about feline nutrition, ..."
3,positive,7/7/21,"My holistic vet recommended this, along with a...",1,"My holistic vet recommended this, along with a..."
4,positive,1/7/21,I bought this coffee because its much cheaper ...,1,I bought this coffee because its much cheaper ...
...,...,...,...,...,...
5439,negative,26/2/21,"This is an okay gift box, only if you like med...",0,"This is an okay gift box, only if you like med..."
5440,negative,18/12/19,It looks llike I just walked into a raw deal. ...,0,It looks llike I just walked into a raw deal. ...
5441,negative,19/1/20,Thank god that i tasted the metal before i swa...,0,Thank god that i tasted the metal before i swa...
5442,negative,13/9/20,This product was very good when I began buying...,0,This product was very good when I began buying...


In [7]:
SIA = SentimentIntensityAnalyzer()
reviews_csv['VADER_dict'] = reviews_csv['VADER_processed_Text'].apply(lambda text: SIA.polarity_scores(text))

reviews_csv.head()

Unnamed: 0,Sentiment,Time,Text,sentiment_label,VADER_processed_Text,VADER_dict
0,positive,18/6/21,This is a very healthy dog food. Good for thei...,1,This is a very healthy dog food. Good for thei...,"{'neg': 0.0, 'neu': 0.705, 'pos': 0.295, 'comp..."
1,positive,7/7/21,I've been very pleased with the Natural Balanc...,1,I've been very pleased with the Natural Balanc...,"{'neg': 0.031, 'neu': 0.732, 'pos': 0.237, 'co..."
2,positive,18/6/21,"Before I was educated about feline nutrition, ...",1,"Before I was educated about feline nutrition, ...","{'neg': 0.017, 'neu': 0.795, 'pos': 0.188, 'co..."
3,positive,7/7/21,"My holistic vet recommended this, along with a...",1,"My holistic vet recommended this, along with a...","{'neg': 0.079, 'neu': 0.67, 'pos': 0.252, 'com..."
4,positive,1/7/21,I bought this coffee because its much cheaper ...,1,I bought this coffee because its much cheaper ...,"{'neg': 0.0, 'neu': 0.843, 'pos': 0.157, 'comp..."


##### The compound score is the sum of positive, negative & neutral scores which is then normalized between -1(most extreme negative) and +1 (most extreme positive).

In [8]:
# Convert the score to appropriate label {0,1}
reviews_csv['VADER_score'] = reviews_csv['VADER_dict'].apply(lambda sent_dict: sent_dict['compound'])
reviews_csv['VADER_label'] = 0

# If compound > 0 -> 1 else compund < 0 -> 0
reviews_csv.loc[reviews_csv['VADER_score'] > 0, 'VADER_label'] = 1
reviews_csv.loc[reviews_csv['VADER_score'] < 0, 'VADER_label'] = 0
reviews_csv.head()

Unnamed: 0,Sentiment,Time,Text,sentiment_label,VADER_processed_Text,VADER_dict,VADER_score,VADER_label
0,positive,18/6/21,This is a very healthy dog food. Good for thei...,1,This is a very healthy dog food. Good for thei...,"{'neg': 0.0, 'neu': 0.705, 'pos': 0.295, 'comp...",0.8313,1
1,positive,7/7/21,I've been very pleased with the Natural Balanc...,1,I've been very pleased with the Natural Balanc...,"{'neg': 0.031, 'neu': 0.732, 'pos': 0.237, 'co...",0.9273,1
2,positive,18/6/21,"Before I was educated about feline nutrition, ...",1,"Before I was educated about feline nutrition, ...","{'neg': 0.017, 'neu': 0.795, 'pos': 0.188, 'co...",0.9769,1
3,positive,7/7/21,"My holistic vet recommended this, along with a...",1,"My holistic vet recommended this, along with a...","{'neg': 0.079, 'neu': 0.67, 'pos': 0.252, 'com...",0.9678,1
4,positive,1/7/21,I bought this coffee because its much cheaper ...,1,I bought this coffee because its much cheaper ...,"{'neg': 0.0, 'neu': 0.843, 'pos': 0.157, 'comp...",0.8868,1


In [9]:
print("F1 score: ", f1_score(reviews_csv['sentiment_label'], reviews_csv['VADER_label']))
print("PR_AUC score: ", average_precision_score(reviews_csv['sentiment_label'], reviews_csv['VADER_label']))
print("ROC_AUC score: ", roc_auc_score(reviews_csv['sentiment_label'], reviews_csv['VADER_label']))
print("Accuracy: ", accuracy_score(reviews_csv['sentiment_label'], reviews_csv['VADER_label']))

F1 score:  0.8872441309124551
PR_AUC score:  0.826482206621329
ROC_AUC score:  0.6997636186872852
Accuracy:  0.8209037472446731


#### Let's try to improve scores - 
Instead of calculating polarity score on entire review, we calculate it for each sentatence in text and average it

In [10]:
# Function to calculate Average Vader score: by avergaing score for each sentence in input text
def avg_pol_score(text):
    compound_list = []
    for sent in text:
        dict_ = SIA.polarity_scores(sent)
        compound_list.append(dict_['compound'])

    return np.average(compound_list)

# Tokenize Text
reviews_csv['VADER_processed_Text'] = reviews_csv['VADER_processed_Text'].apply(lambda para: tokenize.sent_tokenize(para))
avg_VADER_score = reviews_csv['VADER_processed_Text'].apply(lambda text: avg_pol_score(text))

In [13]:
reviews_csv['avg_VADER_score'] = pd.DataFrame(avg_VADER_score)
reviews_csv['avg_VADER_score_label'] = 0
reviews_csv.loc[reviews_csv['avg_VADER_score'] > 0, 'avg_VADER_score_label'] = 1
reviews_csv.loc[reviews_csv['avg_VADER_score'] < 0, 'avg_VADER_score_label'] = 0
reviews_csv.head()

Unnamed: 0,Sentiment,Time,Text,sentiment_label,VADER_processed_Text,VADER_dict,VADER_score,VADER_label,avg_VADER_score,avg_VADER_score_label
0,positive,18/6/21,This is a very healthy dog food. Good for thei...,1,"[This is a very healthy dog food., Good for th...","{'neg': 0.0, 'neu': 0.705, 'pos': 0.295, 'comp...",0.8313,1,0.3346,1
1,positive,7/7/21,I've been very pleased with the Natural Balanc...,1,[I've been very pleased with the Natural Balan...,"{'neg': 0.031, 'neu': 0.732, 'pos': 0.237, 'co...",0.9273,1,0.459625,1
2,positive,18/6/21,"Before I was educated about feline nutrition, ...",1,"[Before I was educated about feline nutrition,...","{'neg': 0.017, 'neu': 0.795, 'pos': 0.188, 'co...",0.9769,1,0.277511,1
3,positive,7/7/21,"My holistic vet recommended this, along with a...",1,"[My holistic vet recommended this, along with ...","{'neg': 0.079, 'neu': 0.67, 'pos': 0.252, 'com...",0.9678,1,0.26184,1
4,positive,1/7/21,I bought this coffee because its much cheaper ...,1,[I bought this coffee because its much cheaper...,"{'neg': 0.0, 'neu': 0.843, 'pos': 0.157, 'comp...",0.8868,1,0.274083,1


In [14]:
print("F1 score: ", f1_score(reviews_csv['sentiment_label'], reviews_csv['avg_VADER_score_label']))
print("PR_AUC score: ", average_precision_score(reviews_csv['sentiment_label'], reviews_csv['avg_VADER_score_label']))
print("ROC_AUC score: ", roc_auc_score(reviews_csv['sentiment_label'], reviews_csv['avg_VADER_score_label']))
print("Accuracy: ", accuracy_score(reviews_csv['sentiment_label'], reviews_csv['avg_VADER_score_label']))

F1 score:  0.8907621247113163
PR_AUC score:  0.8290624565744616
ROC_AUC score:  0.7051979320583601
Accuracy:  0.8262307127112417


#### Drawback of VADER:
The main drawback with the rule-based approach for sentiment analysis is that the method only cares about individual words and completely ignores the context in which it is used. 

For example, “the party was savage” will be negative when considered by any token-based algorithms.

## Flair

In [17]:
from flair.models import TextClassifier
from flair.data import Sentence

classifier = TextClassifier.load('en-sentiment')
sentence = Sentence('The food was great!')
classifier.predict(sentence)

# print sentence with predicted labels
print('Sentence above is: ', sentence.labels)

  from .autonotebook import tqdm as notebook_tqdm


2023-03-22 18:22:18,713 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /var/folders/vs/ybxh3hyn7_7bpjzkl7xwj4700000gn/T/tmpsa_klk_x


100%|████████████████████████████████████████| 253M/253M [02:15<00:00, 1.96MB/s]

2023-03-22 18:24:35,286 copying /var/folders/vs/ybxh3hyn7_7bpjzkl7xwj4700000gn/T/tmpsa_klk_x to cache at /Users/nnerella/.flair/models/sentiment-en-mix-distillbert_4.pt
2023-03-22 18:24:35,365 removing temp file /var/folders/vs/ybxh3hyn7_7bpjzkl7xwj4700000gn/T/tmpsa_klk_x



Downloading: 100%|███████████████████████████| 28.0/28.0 [00:00<00:00, 5.51kB/s]
Downloading: 100%|██████████████████████████████| 483/483 [00:00<00:00, 109kB/s]
Downloading: 100%|███████████████████████████| 232k/232k [00:36<00:00, 6.36kB/s]
Downloading: 100%|████████████████████████████| 466k/466k [00:01<00:00, 465kB/s]


Sentence above is:  ['Sentence[5]: "The food was great!"'/'POSITIVE' (0.9961)]


In [24]:
sentence.labels[0]

'Sentence[5]: "The food was great!"'/'POSITIVE' (0.9961)

In [34]:
sentence.to_dict()['all labels']

[{'value': 'POSITIVE', 'confidence': 0.9961493015289307}]

In [61]:
sentences = reviews_csv.apply(lambda row: Sentence(row))
flair_predictions = reviews_csv.apply(lambda row: classifier.predict(row))

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

## Spacy

Refernce: https://www.section.io/engineering-education/sentiment-analysis-with-spacy-and-scikit-learn/

Using Spacy tokeiser and linear SVC (SVM)

### Results (vectorizer)
- F1 score:  0.9232273838630808

- PR_AUC score:  0.9089266836550011

- ROC_AUC score:  0.8446448118914558

- Accuracy : 0.884643644379133

### Results (tfvectorizer)
- F1 score:  0.9364161849710984

- PR_AUC score:  0.9131134755677366

- ROC_AUC score:  0.8539450941983062

- Accuracy : 0.9030124908155768