# Sentiment Analysis Using Various Approaches

In [None]:
import numpy as np 
import pandas as pd
import json
import time
import re
import seaborn as sns

# Text cleaning
from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.stem import WordNetLemmatizer
import spacy 
nlp = spacy.load('en_core_web_sm')

# NLTK Bing Liu Lexicon 
import nltk
# nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize 

# VADER 
import nltk
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, confusion_matrix

# Supervised learning 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.calibration import CalibrationDisplay
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline 
from sklearn.svm import LinearSVC

from scipy.sparse import csr_matrix
from scipy.sparse import hstack, vstack

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

## Loading a subset of reviews and meta data

In [None]:
# Module-level counters kept for backward compatibility with the rest of the
# notebook: `n` advances once per chunk read and `total_rows` accumulates the
# rows read across *all* calls to `process_chunks`.
n = 1
total_rows = 0


def process_chunks(file, chunksize=1000, max_chunks=10):
    """
    Reads the first chunks of a JSON Lines file into a single DataFrame.

    The file is read lazily in chunks of `chunksize` records. After each chunk the
    number of rows it contained is printed, and reading stops once `max_chunks`
    chunks have been collected (or the file is exhausted). The reader is closed
    when the function returns, including on early exit.

    Parameters:
    file (str): The path to the JSON Lines file to be processed.
    chunksize (int): The number of records per chunk. Default is 1000.
    max_chunks (int or None): Maximum number of chunks to read (expected >= 1).
        Default is 10, the limit that was previously hard-coded. None reads the whole file.

    Returns:
    pd.DataFrame: All processed chunks concatenated with a fresh 0..N-1 index.
        An empty DataFrame is returned when the file yields no records.

    Side effects:
    Increments the module-level `n` once per chunk and adds the rows read to the
    module-level `total_rows`.
    """
    global n, total_rows

    dfs = []
    rows_this_call = 0

    # The chunked reader holds an open file handle; the context manager makes
    # sure it is released even though we usually stop before the end of file.
    with pd.read_json(file, lines=True, chunksize=chunksize) as reader:
        for chunk in reader:
            dfs.append(chunk)
            print(len(chunk), " rows added")
            n += 1
            rows_this_call += len(chunk)
            if max_chunks is not None and len(dfs) >= max_chunks:
                break

    total_rows += rows_this_call

    print("Done")
    # Report the rows read by *this* call; the running total across calls stays
    # available in the module-level `total_rows`.
    print(f"Total rows: {rows_this_call}")

    if not dfs:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)

In [None]:
# Paths to the raw JSON Lines inputs (reviews and product metadata).
reviews = "../data/Home_and_Kitchen.jsonl"
meta = "../data/meta_Home_and_Kitchen.jsonl"

# time.perf_counter() measures elapsed wall-clock time. time.process_time()
# only counts CPU time of this process, so time spent waiting on disk reads
# would be left out of the reported "Execution time".
start = time.perf_counter()

reviews_subset = process_chunks(reviews)

end = time.perf_counter()
elapsed_time = end - start
print('Created a subset of the reviews dataset')
print('Execution time:', elapsed_time, 'seconds')

print('--------------')
start = time.perf_counter()

meta_subset = process_chunks(meta)

end = time.perf_counter()
elapsed_time = end - start
print('Created a subset of the meta dataset')
print('Execution time:', elapsed_time, 'seconds')

## Text Cleaning - spaCy

In [None]:
# Work on a private copy. STOP_WORDS is spaCy's shared module-level set, so
# applying |= / -= to it directly would modify the library's own stop-word list
# and make this cell print different counts every time it is re-run.
stop_words = set(spacy.lang.en.stop_words.STOP_WORDS)
print('Original stopwords count:', len(stop_words))

# Include/ exclude certain words
# 'would' is forced into the list; the excluded words carry sentiment or
# negation and therefore must survive cleaning.
include_stopwords = {'would'}
exclude_stopwords = {'well', 'off', 'very', 'not', 'few', 'much'}

stop_words |= include_stopwords
stop_words -= exclude_stopwords

# Remove adjectives from my stopwords using spaCy: each stop word is tagged in
# isolation and dropped from the list when spaCy labels it as an adjective.
exclude_adjectives = {word for word in stop_words if nlp(word)[0].pos_ == "ADJ"}
print(exclude_adjectives)

stop_words -= exclude_adjectives
print('Stopwords count:', len(stop_words))

In [None]:
def clean_data(doc):
    """
    Normalizes one review text with spaCy and returns it as a single string.

    The text is lowercased and run through the module-level `nlp` pipeline. Each
    token is replaced by its lemma; a lemma is kept only when it is purely
    alphabetic and is not in the module-level `stop_words` set. The kept lemmas
    are joined with single spaces (an empty string when nothing is kept).
    """
    parsed = nlp(doc.lower())

    kept = []
    for token in parsed:
        lemma = token.lemma_
        # Drop numbers, punctuation and anything else that is not letters only.
        if not lemma.isalpha():
            continue
        # Drop stopwords (checked on the lemma, not the surface form).
        if lemma in stop_words:
            continue
        kept.append(lemma)

    return " ".join(kept)

# Work on a copy so `reviews_subset` keeps the raw data, then store the cleaned
# version of each review's `text` in a new `spacy_text` column.
# `clean_data` is applied row by row, i.e. the spaCy pipeline runs once per review.
cleaned_text = reviews_subset.copy()
cleaned_text['spacy_text'] = cleaned_text['text'].apply(clean_data)

In [None]:
# Remove observations that are empty after the cleaning step
cleaned_text = cleaned_text[cleaned_text['spacy_text'].str.len() != 0]
print('Record count:', len(cleaned_text))

In [None]:
# Show one review before and after cleaning.
# Positional lookup (`iloc`) is used because empty rows were filtered out above
# without resetting the index, so the *label* 2 is not guaranteed to exist.
i = 2
row = cleaned_text.iloc[i]

print(f'Title: {row["title"]}\n')

print(f'Text: {row["text"]}\n')

# Labelled separately so the raw and cleaned versions can be told apart.
print(f'Cleaned text: {row["spacy_text"]}\n')

## Export Subset Dataset with Cleaned Text Column

In [None]:
# Persist both subsets as CSV files without writing the DataFrame index.
# NOTE: at this point `cleaned_text` only has the `spacy_text` column added;
# the features and sentiment scores computed further down are not in this export.
cleaned_text.to_csv('../data/cleaned_subset.csv', index = False)
meta_subset.to_csv('../data/meta_subset.csv', index = False)

## Feature Engineering 

Create new features: word count, average word length, exclamation mark count, and sentiment.

In [None]:
def word_count(text):
    """
    Returns how many whitespace-separated words the text contains.

    Runs of whitespace count as a single separator, so an empty or
    whitespace-only string yields 0.
    """
    return len(text.split())


def avg_word_length(text):
    """
    Returns the mean number of characters per whitespace-separated word.

    Punctuation attached to a word counts towards its length. An empty or
    whitespace-only text has no words and yields 0.
    """
    tokens = text.split()

    # No words at all (empty or whitespace-only input): avoid dividing by zero.
    if len(tokens) == 0:
        return 0

    total_chars = 0
    for token in tokens:
        total_chars += len(token)

    return total_chars / len(tokens)


def exclamation_count(text):
    """
    Returns the number of exclamation marks ('!') in the text.

    The characters are counted directly on the string. This avoids running the
    whole spaCy pipeline on every review just to count one character, and it
    does not depend on how the tokenizer happens to split runs of punctuation.
    """
    return text.count('!')

In [None]:
# Derive the surface-level features from the raw `text` column (not from the
# cleaned `spacy_text`), one new column per helper function.
cleaned_text['word_count'] = cleaned_text['text'].apply(word_count)
cleaned_text['avg_word_length'] = cleaned_text['text'].apply(avg_word_length)
cleaned_text['exclamation_count'] = cleaned_text['text'].apply(exclamation_count)

In [None]:
# Preview the first three rows: raw text next to the engineered features.
cleaned_text[['text', 'word_count', 'avg_word_length', 'exclamation_count']].head(3)

In [None]:
# Summary statistics of the per-review word counts.
cleaned_text['word_count'].describe()

In [None]:
# Distribution of review lengths in words, drawn with 25 bins.
sns.displot(cleaned_text['word_count'],bins=25);

In [None]:
# Distribution of exclamation-mark counts per review, drawn with 25 bins.
sns.displot(cleaned_text['exclamation_count'],bins=25);

## Sentiment Analysis with Lexicons

An unsupervised approach that evaluates the sentiment of the words in a document against a predefined lexicon. A lexicon is a dictionary containing a collection of words that have been categorized as positive, negative, or neutral by experts. Each word's sentiment is looked up, and the scores are combined to calculate the overall sentiment of the sentence. 
- Disadvantages: words that are not in the lexicon will not be scored; some lexicons are better suited to a specific domain than others; simple word matching overlooks negation (the lexicon matches individual words, not phrases, e.g. "not bad" is scored as negative instead of neutral).

### Bing Liu Lexicon

The Bing Liu lexicon has a total of 6,786 words, with 2,005 classified as positive and 4,781 as negative. Classification is binary (positive or negative).

In [None]:
# Size of the lexicon and the first ten entries of its positive and negative word lists.
print('Total number of words in opinion lexicon', len(opinion_lexicon.words()))
print('Examples of positive words:', opinion_lexicon.positive()[:10])
print('Examples of negative words:', opinion_lexicon.negative()[:10])

In [None]:
# Polarity assigned to every positive / negative lexicon entry.
pos_score = 1
neg_score = -1

# Lookup table word -> polarity. Positive words are inserted first; the negative
# words are merged in afterwards, so a word present in both lists ends up with
# the negative score.
word_dict = dict.fromkeys(opinion_lexicon.positive(), pos_score)
word_dict.update(dict.fromkeys(opinion_lexicon.negative(), neg_score))

def bing_liu_score(text):
    """
    Returns the Bing Liu lexicon score of a text, normalized by its token count.

    The text is lowercased and tokenized with NLTK's `word_tokenize`. Every token
    found in the module-level `word_dict` contributes its polarity (+1 or -1);
    tokens not in the lexicon contribute nothing. The sum is divided by the total
    number of tokens. A text that produces no tokens scores 0.
    """
    tokens = word_tokenize(text.lower())

    # Nothing to score; also guards the division below.
    if not tokens:
        return 0

    polarity_sum = sum(word_dict.get(token, 0) for token in tokens)
    return polarity_sum / len(tokens)

In [None]:
# Score both the raw text and the spaCy-cleaned text with the Bing Liu lexicon
# so the two variants can be compared later.
cleaned_text['Bing_Liu_score'] = cleaned_text['text'].apply(bing_liu_score)
cleaned_text['Bing_Liu_spaCy'] = cleaned_text['spacy_text'].apply(bing_liu_score)

### VADER Lexicon
Rule-based lexicon. 
9,000 features with scales of [-4] Extremely Negative to [4] Extremely Positive with [0] for Neutral or Neither. 

In [None]:
# Single shared VADER analyzer; `vader_score` reads this module-level `model`.
model = SentimentIntensityAnalyzer()

In [None]:
def vader_score(text):
    """
    Returns VADER's compound polarity score for a text.

    The text is scored with the module-level analyzer `model`, and the
    'compound' entry of the resulting score dictionary is returned.
    """
    return model.polarity_scores(text)['compound']

In [None]:
# Score both the raw text and the spaCy-cleaned text with VADER, mirroring the
# two Bing Liu columns.
cleaned_text['Vader_score'] = cleaned_text['text'].apply(vader_score)
cleaned_text['Vader_spaCy'] = cleaned_text['spacy_text'].apply(vader_score)

### Lexicon Sentiment Accuracy Scores 

In [None]:
# Three randomly drawn rows of the four lexicon scores (no seed is set, so the
# rows differ between runs).
cleaned_text[['Bing_Liu_score', 'Bing_Liu_spaCy', 'Vader_score', 'Vader_spaCy']].sample(3)

In [None]:
# Calculate mean sentiment score for each rating category
mean_scores = cleaned_text.groupby('rating').agg({
    'Bing_Liu_score':'mean',
    'Bing_Liu_spaCy': 'mean',
    'Vader_score': 'mean',
    'Vader_spaCy': 'mean'
}).reset_index()

print(mean_scores)

## Sentiment Analysis - Supervised Learning Approach

Using supervised learning models, we will classify the sentiment of a review based on pre-processed text from spaCy. Reviews will be classified as positive or negative sentiment based on their ratings:
- Positive (1): records with ratings of 4 and 5.
- Negative (0): records with ratings of 1 and 2.
- Neutral: records with ratings of 3 are removed.

### Feature Engineering: Sentiment Classification
Create a new column in our dataset to classify records based on our ratings. 

In [None]:
# Drop the neutral reviews (rating of exactly 3) and renumber the rows from 0.
cleaned_text = cleaned_text.loc[cleaned_text['rating'] != 3].reset_index(drop=True)

# Binary target: 1 (positive) for ratings above 3, 0 (negative) for everything
# that is left.
cleaned_text['sentiment'] = (cleaned_text['rating'] > 3).astype('int64')

### Text Vectorization with TF-IDF
Use TF-IDF vectorizer to transform the text into vectors based on the frequency of words in the text:

In [None]:
# Two-step pipeline: TF-IDF vectorization followed by a Multinomial Naive Bayes
# classifier.
tfidf = TfidfVectorizer()
clf = MultinomialNB()

pipe = Pipeline([("tfidf", tfidf), ("clf", clf)])

# Hyperparameter search space. Keys follow the `<step name>__<parameter>`
# convention of the pipeline above; 3 x 5 x 2 = 30 combinations in total.
param_grid = {
    'tfidf__ngram_range':[(1,1), (1,2), (1,3)],
    'tfidf__min_df':[1, 2, 5, 10, 20],
    'clf__fit_prior':[False, True]
}

### Model training and Evaluation
The following models and techniques will be used in our analysis:
- Randomized Search Cross-Validation: A technique to tune hyperparameters for improving model performance; here it tunes the TF-IDF + Multinomial Naive Bayes pipeline defined above.
- Linear Support Vector Classification (SVC): A classifier that constructs a hyperplane to separate classes.
- Logistic Regression: a linear model for binary classification.

#### Randomized Search Cross Validation

In [None]:
# Features: the cleaned text, kept as a one-column DataFrame (double brackets).
# Target: the binary sentiment label.
X = cleaned_text[['spacy_text']]
y = cleaned_text['sentiment']

# 80/20 split with a fixed seed; `stratify = y` keeps the class proportions of
# `y` in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42, stratify = y)

In [None]:
# Randomized search over `param_grid` for the TF-IDF + Naive Bayes pipeline.
# `random_state = 42` fixes which parameter combinations are sampled, so the
# search is reproducible like the seeded train/test split used in this notebook.
rs = RandomizedSearchCV(estimator = pipe, param_distributions = param_grid, verbose = 2, n_jobs = -1,
                        random_state = 42)
# The pipeline vectorizes raw strings itself, so it is fitted on the text Series.
rs.fit(X_train['spacy_text'], y_train)

In [None]:
# Hard class predictions for the threshold-based metrics.
y_pred = rs.predict(X_test['spacy_text'])

# ROC-AUC is a ranking metric and needs continuous scores. Feeding it the 0/1
# predictions collapses the curve to a single operating point, so the predicted
# probability of the positive class (column 1) is used instead.
y_score = rs.predict_proba(X_test['spacy_text'])[:, 1]

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC-AUC Score: {roc_auc_score(y_test, y_score)}')
print(f'Classification Report: \n {classification_report(y_test,y_pred, zero_division = 0.0)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}')

In [None]:
# Finding the best parameters 
print(rs.best_params_)
print(rs.best_score_)

In [None]:
# Class probabilities from the tuned pipeline; column 1 is the probability of
# the positive class (sentiment 1).
sentiment_prob = rs.predict_proba(X_test['spacy_text'])
positive_class_prob = sentiment_prob[:, 1]

# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, positive_class_prob, pos_label=1)

# Plot ROC curve
# The dashed diagonal is the reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Randomized Search Cross Validation ROC Curve')
plt.show()

# Area under the curve, computed from the probabilities.
print(roc_auc_score(y_test, positive_class_prob))

#### Compare trained model to baseline 

In [None]:
# Baseline that always predicts the most frequent class seen in `y_train`.
dummy_clf = DummyClassifier(strategy = 'most_frequent')
dummy_clf.fit(X_train, y_train)
y_pred_baseline = dummy_clf.predict(X_test)

# Same four metrics as for the trained models, for a like-for-like comparison.
# NOTE(review): the prediction is a constant, so the ROC-AUC here is expected
# to be the chance level of 0.5.
print(f'Accuracy: {accuracy_score(y_test, y_pred_baseline)}')
print(f'ROC-AUC Score: {roc_auc_score(y_test, y_pred_baseline)}')
print(f'Classification Report: \n {classification_report(y_test,y_pred_baseline, zero_division = 0.0)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, y_pred_baseline)}')

The model tuned with randomized search cross-validation performs slightly better than the baseline model.  

#### Linear Support Vector Classification (SVC)

In [None]:
# New vectorizer with fixed settings (terms must occur in at least 5 documents;
# unigrams and bigrams). This rebinds the name `tfidf`; the `pipe` object built
# earlier still holds the vectorizer it was created with.
tfidf = TfidfVectorizer(min_df = 5, ngram_range = (1,2))

# The vocabulary and IDF weights are learned from the training text only; the
# test text is transformed with that fitted vocabulary.
X_train_tf = tfidf.fit_transform(X_train['spacy_text'])
X_test_tf = tfidf.transform(X_test['spacy_text'])

linear = LinearSVC(random_state = 42, tol = 1e-5, max_iter = 10000)
linear.fit(X_train_tf, y_train)

In [None]:
# Hard class predictions for the threshold-based metrics.
y_pred = linear.predict(X_test_tf)

# LinearSVC has no predict_proba; its signed distance to the separating
# hyperplane is the continuous score ROC-AUC needs. Passing the 0/1 predictions
# would collapse the curve to a single operating point.
y_score = linear.decision_function(X_test_tf)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC-AUC Score: {roc_auc_score(y_test, y_score)}')
print(f'Classification Report: \n {classification_report(y_test,y_pred, zero_division = 0.0)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}')

In [None]:
# Confusion matrix of the most recent `y_pred`, drawn as an annotated heatmap
# with integer cell labels. Rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(4, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

### Logistic Regression

In [None]:
# Separate vectorizer for logistic regression (terms must occur in at least 10
# documents; unigrams up to trigrams).
tfidf_log = TfidfVectorizer(min_df = 10, ngram_range = (1,3))

# NOTE: this overwrites the `X_train_tf` / `X_test_tf` matrices created for the
# LinearSVC model; from here on they hold the `tfidf_log` features.
X_train_tf = tfidf_log.fit_transform(X_train['spacy_text'])
X_test_tf = tfidf_log.transform(X_test['spacy_text'])

logreg = LogisticRegression(max_iter = 1000).fit(X_train_tf, y_train)

In [None]:
# Hard class predictions for the threshold-based metrics.
y_pred = logreg.predict(X_test_tf)

# ROC-AUC needs continuous scores: use the predicted probability of the
# positive class (column 1) rather than the 0/1 predictions, which would
# collapse the curve to a single operating point.
y_score = logreg.predict_proba(X_test_tf)[:, 1]

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC-AUC Score: {roc_auc_score(y_test, y_score)}')
print(f'Classification Report: \n {classification_report(y_test,y_pred, zero_division = 0.0)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}')

In [None]:
# Vocabulary learned by the logistic-regression vectorizer, in the same order
# as the columns of the TF-IDF matrix (and therefore of `coef_`).
features = tfidf_log.get_feature_names_out()

# Extract coefficients from the Logistic Regression model.
# For a binary target scikit-learn stores ONE row in `coef_` / `intercept_`,
# and that row describes the log-odds of `logreg.classes_[1]` (sentiment 1).
# Looking the row up by class position is therefore misleading: position 0
# would be read as "class 0" and position 1 does not exist.
intercept = logreg.intercept_[0]
coefficients = logreg.coef_[0]

coefficients_df = pd.DataFrame({
    'variable': ['intercept'] + list(features),
    'coefficient': [intercept] + list(coefficients)
})

# Largest coefficients first: the terms that push a review most strongly
# towards the positive class are at the top, the most negative at the bottom.
coefficients_df.sort_values(by = 'coefficient', ascending = False)


### Train-Test Split with Other Features

In [None]:
# List the available columns before choosing the additional model features.
cleaned_text.columns

In [None]:
# Wider feature set: review metadata, engineered text features, the cleaned
# text and the four lexicon scores.
# NOTE(review): `rating` is included here although `sentiment` is derived
# directly from it; the next cell narrows `variables` to a list without it.
variables = ['rating', 'helpful_vote', 'verified_purchase', 'word_count', 'avg_word_length', 
            'exclamation_count', 'spacy_text', 'Bing_Liu_score', 'Bing_Liu_spaCy', 'Vader_score', 'Vader_spaCy']
X = cleaned_text[variables]
y = cleaned_text['sentiment']

# Same split settings as before (80/20, seed 42, stratified on `y`).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42, stratify = y)

print ('Size of Training Data: ', X_train.shape[0])
print ('Size of Test Data: ', X_test.shape[0])

### Model Testing

In [None]:
# Features that will be stacked next to the TF-IDF matrix: `rating` (the source
# of the label) and the raw `spacy_text` column are left out.
variables = ['helpful_vote', 'verified_purchase', 'word_count', 'avg_word_length', 
            'exclamation_count', 'Bing_Liu_score', 'Bing_Liu_spaCy', 'Vader_score', 'Vader_spaCy']
# Display-only check that every selected column can be cast to float; the
# result is not assigned.
X_train[variables].astype(float)

In [None]:
# Selected non-text features as sparse matrices so they can be stacked with the
# TF-IDF output. Descriptive names replace the former `m` / `n`: `n` is also the
# module-level counter that `process_chunks` increments via `global n`, and
# rebinding it to a matrix would break any later call to that function.
train_numeric = csr_matrix(X_train[variables].astype(float))
test_numeric = csr_matrix(X_test[variables].astype(float))

# Column-wise concatenation: [selected features | TF-IDF features].
# NOTE(review): `X_train_tf` / `X_test_tf` were built from the earlier text-only
# split; the rows line up only because both splits use the same data, seed and
# stratification. Confirm if either split is changed.
X_train_stack = hstack((train_numeric, X_train_tf))
X_test_stack = hstack((test_numeric, X_test_tf))

In [None]:
# Refit the LinearSVC on the stacked features (this replaces the text-only fit
# stored in `linear`).
linear.fit(X_train_stack, y_train)

# Hard class predictions for the threshold-based metrics.
y_pred = linear.predict(X_test_stack)

# Continuous decision scores for ROC-AUC; the 0/1 predictions would collapse
# the curve to a single operating point.
y_score = linear.decision_function(X_test_stack)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC-AUC Score: {roc_auc_score(y_test, y_score)}')
print(f'Classification Report: \n {classification_report(y_test,y_pred, zero_division = 0.0)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}')