In [63]:
import pandas as pd
import time

file_name = 'selected_1000_reviews.csv'

# Determine the file type based on its extension and load data accordingly
if file_name.endswith('.csv'):
    df = pd.read_csv(file_name)
elif file_name.endswith('.json'):
    df = pd.read_json(file_name, lines=True)
else:
    raise ValueError("Unsupported file format. Please use a .csv or .json file.")

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,review_text,human_rating
0,"I bought both boxed sets, books 1-5. Really a...",5
1,I enjoyed this short book. But it was way way ...,3
2,I really enjoyed this adventure and look forwa...,4
3,It was a decent read.. typical story line. Not...,3
4,"This is the First book in the Trilogy, and I'm...",5


In [64]:
rating_df = df.copy()

In [65]:
# Filter rows where review_text has less than or equal to 512 characters
df = df[df['review_text'].str.len() <= 512]

# Select only 'review_text' and 'review_rating' columns
df = df[['review_text']]

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,review_text
0,"I bought both boxed sets, books 1-5. Really a..."
1,I enjoyed this short book. But it was way way ...
2,I really enjoyed this adventure and look forwa...
3,It was a decent read.. typical story line. Not...
4,"This is the First book in the Trilogy, and I'm..."


In [66]:
custom_slang_words = {'lol', 'rofl', 'brb', 'omg', 'btw', 'afk', 'imho', 'fyi', 'ttyl', 
                      'gr8', 'luv', 'xoxo', 'bff', 'smh', 'gtg', 'thx', 'ty', 'pls',
                      'thnx', 'yw', 'np', 'idc', 'ily', 'wtf', 'wth', 'jk', 'nvm',
                      'afaik', 'icymi', 'idk', 'tmi', 'fomo', 'yolo', 'ootd', 'tbh', 'tbt',
                      'ftw', 'fml', 'imo', 'irl', 'yass', 'baka', 'omw', 'rn', 'srs', 'imy',
                      'nbd', 'tfw', 'tldr', 'rip', 'bruh', 'oomf', 'fwiw',
                      'wbu', 'wb', 'lmao', 'lmfao', 'af', 'afaict', 'fud', 'ily2', 'tty', 
                      'ttys', 'stfu', 'omfg', 'otw', 'gtfo', 'gth', 'lms',
                      'lmk', 'smfh', 'bfn', 'ttyt', 'g2g', 'bbs', 'bbiab',
                      'cya', 'cys', 'cu', 'cul', 'cul8r', 'iow', 'l8', 'lolz', 'lmao', 
                      'lmfao', 'l8r', 'rofl', 'roflmao', 'rotfl', 'rotflmao'}

In [67]:
start_time = time.time()
import nltk
from nltk.tokenize import word_tokenize
import re
import string

def word_char_and_slang_count(text):
    tokens = word_tokenize(text)
    word_count = len(tokens)
    character_count = sum(len(word) for word in tokens)
    
    # Count the number of slang words
    slang_count = sum(1 for word in tokens if word.lower() in custom_slang_words)
    
    return word_count, character_count, slang_count

# Assuming 'df' is your DataFrame and 'review_text' is the column containing the text
df[['Word_Count', 'Character_Count', 'Slang_Count']] = pd.DataFrame(df['review_text'].apply(word_char_and_slang_count).tolist(), index=df.index)

In [68]:
# Define function to count emoticons
def count_emoticons(text):
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    return len(emoticons)

# Define function to count punctuations representing emotions
def count_emotion_punctuations(text):
    emotion_punctuations = re.findall(r'[!]+', text)
    return len(emotion_punctuations)

# Define function to count capital letters
def count_capital_letters(text):
    return sum(1 for char in text if char.isupper())

# Define function to count punctuation marks
def count_punctuation(text):
    punctuation_count = sum(1 for char in text if char in string.punctuation)
    return punctuation_count

df['Punctuation_Count'] = df['review_text'].apply(count_punctuation)
df['Emoticon_Count'] = df['review_text'].apply(count_emoticons)
df['Emotion_Punctuation_Count'] = df['review_text'].apply(count_emotion_punctuations)
df['Capital_Letter_Count'] = df['review_text'].apply(count_capital_letters)

df.head()

Unnamed: 0,review_text,Word_Count,Character_Count,Slang_Count,Punctuation_Count,Emoticon_Count,Emotion_Punctuation_Count,Capital_Letter_Count
0,"I bought both boxed sets, books 1-5. Really a...",87,343,0,10,0,3,9
1,I enjoyed this short book. But it was way way ...,28,99,0,6,0,0,3
2,I really enjoyed this adventure and look forwa...,34,138,0,3,0,0,5
3,It was a decent read.. typical story line. Not...,25,89,0,6,0,0,3
4,"This is the First book in the Trilogy, and I'm...",34,147,0,4,0,0,5


In [69]:
from afinn import Afinn

# Initialize AFINN lexicon
afinn = Afinn()

def count_sentiment_words(text):
    # Tokenize text
    words = text.lower().split()
    
    # Initialize counters
    positive_count = 0
    negative_count = 0
    neutral_count = 0
    
    # Count positive, negative, and neutral words
    for word in words:
        sentiment_score = afinn.score(word)
        if sentiment_score > 0:
            positive_count += 1
        elif sentiment_score < 0:
            negative_count += 1
        else:
            neutral_count += 1
    
    return positive_count, negative_count, neutral_count

# Assuming you have a DataFrame 'df' with a 'review_text' column
df[['Positive_Words_Count', 'Negative_Words_Count', 'Neutral_Words_Count']] = pd.DataFrame(df['review_text'].apply(count_sentiment_words).tolist(), index=df.index)

df.head()

Unnamed: 0,review_text,Word_Count,Character_Count,Slang_Count,Punctuation_Count,Emoticon_Count,Emotion_Punctuation_Count,Capital_Letter_Count,Positive_Words_Count,Negative_Words_Count,Neutral_Words_Count
0,"I bought both boxed sets, books 1-5. Really a...",87,343,0,10,0,3,9,3,1,74
1,I enjoyed this short book. But it was way way ...,28,99,0,6,0,0,3,1,0,24
2,I really enjoyed this adventure and look forwa...,34,138,0,3,0,0,5,4,0,27
3,It was a decent read.. typical story line. Not...,25,89,0,6,0,0,3,0,0,20
4,"This is the First book in the Trilogy, and I'm...",34,147,0,4,0,0,5,2,0,28


In [70]:
# Reorder the columns
df = df[['review_text', 'Word_Count', 'Slang_Count', 'Character_Count', 'Punctuation_Count', 'Emoticon_Count', 'Emotion_Punctuation_Count', 'Capital_Letter_Count', 'Positive_Words_Count',  'Negative_Words_Count',  'Neutral_Words_Count']]

df.head()

Unnamed: 0,review_text,Word_Count,Slang_Count,Character_Count,Punctuation_Count,Emoticon_Count,Emotion_Punctuation_Count,Capital_Letter_Count,Positive_Words_Count,Negative_Words_Count,Neutral_Words_Count
0,"I bought both boxed sets, books 1-5. Really a...",87,0,343,10,0,3,9,3,1,74
1,I enjoyed this short book. But it was way way ...,28,0,99,6,0,0,3,1,0,24
2,I really enjoyed this adventure and look forwa...,34,0,138,3,0,0,5,4,0,27
3,It was a decent read.. typical story line. Not...,25,0,89,6,0,0,3,0,0,20
4,"This is the First book in the Trilogy, and I'm...",34,0,147,4,0,0,5,2,0,28


In [71]:
from joblib import load
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
import pandas as pd

# Load the logistic regression model
model = load('final_model.joblib')

# Load the TF-IDF vectorizer model
tfidf_vectorizer = load('tfidf_vectorizer_model.joblib')

# Assuming you have your new dataset loaded into a DataFrame named 'df_new'
# Preprocess and vectorize the text data in the new dataset
X_text_new = df['review_text']
X_text_new_tfidf = tfidf_vectorizer.fit_transform(X_text_new)


# Combine text features with other features in the new dataset
X_new = sp.hstack([df.drop(columns=['review_text']).values, X_text_new_tfidf])

# Use the trained logistic regression model to predict the target variable for the new dataset
y_pred_new = model.predict(X_new)

# Add the predicted target column to the new dataset
df['Predicted_Target'] = y_pred_new

# Optionally, save the new dataset with the predicted target column to a CSV file
df.to_csv('new_dataset_predicted.csv', index=False)

end_time = time.time()
df

Unnamed: 0,review_text,Word_Count,Slang_Count,Character_Count,Punctuation_Count,Emoticon_Count,Emotion_Punctuation_Count,Capital_Letter_Count,Positive_Words_Count,Negative_Words_Count,Neutral_Words_Count,Predicted_Target
0,"I bought both boxed sets, books 1-5. Really a...",87,0,343,10,0,3,9,3,1,74,2
1,I enjoyed this short book. But it was way way ...,28,0,99,6,0,0,3,1,0,24,1
2,I really enjoyed this adventure and look forwa...,34,0,138,3,0,0,5,4,0,27,2
3,It was a decent read.. typical story line. Not...,25,0,89,6,0,0,3,0,0,20,2
4,"This is the First book in the Trilogy, and I'm...",34,0,147,4,0,0,5,2,0,28,2
...,...,...,...,...,...,...,...,...,...,...,...,...
995,"Exciting, riveting and well plotted. With each...",29,0,136,4,0,0,3,4,0,21,1
996,Found this book in a give-away area of my loca...,88,0,313,12,0,0,8,2,2,74,2
997,I love this series and i can't wait for the la...,57,0,204,5,0,0,6,6,0,46,2
998,this is a beautiful coming to age story. I lik...,32,0,127,4,0,0,4,4,0,24,2


In [72]:
pre_time = end_time - start_time
print(f"Elapsed time: {pre_time} seconds")

Elapsed time: 1.0396702289581299 seconds


In [73]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import time

In [74]:
# Check for CUDA
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [75]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

model = model.to(device)

In [76]:
def bert_sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors='pt')
    tokens = tokens.to(device)
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [77]:
# import SentimentIntensityAnalyzer class
# from vaderSentiment.vaderSentiment module.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# function to print sentiments
# of the sentence.
def vader_sentiment_score(sentence):

    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()

    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(sentence)
    # Example usage:
    vader_score = sentiment_dict['compound'] # Example sentiment score from VADER
    converted_score = convert_sentiment_score(vader_score)
        
    return converted_score

def convert_sentiment_score(score):
    scaled_score = (((score - (-1)) * (5 - 1)) / (1 - (-1))) + 1
    return round(scaled_score)

In [78]:
vader_start_time = time.time()
# Apply BERT sentiment analysis only to rows where Predicted_Target == 2
df.loc[df['Predicted_Target'] == 1, 'Hybrid_Sentiment'] = df.loc[df['Predicted_Target'] == 1, 'review_text'].apply(vader_sentiment_score)
vader_end_time = time.time()
vader_time = vader_end_time - vader_start_time
print(f"Elapsed time for VADER sentiment analysis: {vader_time} seconds")

Elapsed time for VADER sentiment analysis: 2.129478931427002 seconds


In [79]:
df.head()

Unnamed: 0,review_text,Word_Count,Slang_Count,Character_Count,Punctuation_Count,Emoticon_Count,Emotion_Punctuation_Count,Capital_Letter_Count,Positive_Words_Count,Negative_Words_Count,Neutral_Words_Count,Predicted_Target,Hybrid_Sentiment
0,"I bought both boxed sets, books 1-5. Really a...",87,0,343,10,0,3,9,3,1,74,2,
1,I enjoyed this short book. But it was way way ...,28,0,99,6,0,0,3,1,0,24,1,4.0
2,I really enjoyed this adventure and look forwa...,34,0,138,3,0,0,5,4,0,27,2,
3,It was a decent read.. typical story line. Not...,25,0,89,6,0,0,3,0,0,20,2,
4,"This is the First book in the Trilogy, and I'm...",34,0,147,4,0,0,5,2,0,28,2,


In [80]:
# Measure the start time
bert_start_time = time.time()

# Apply BERT sentiment analysis only to rows where Predicted_Target == 2
df.loc[df['Predicted_Target'] == 2, 'Hybrid_Sentiment'] = df.loc[df['Predicted_Target'] == 2, 'review_text'].apply(lambda x: bert_sentiment_score(x[:512]))

# Measure the end time
bert_end_time = time.time()

# Calculate the elapsed time
bert_time = bert_end_time - bert_start_time
print(f"Elapsed time for BERT sentiment analysis: {bert_time} seconds")

Elapsed time for BERT sentiment analysis: 83.41846203804016 seconds


In [81]:
elapsed_time = vader_time + bert_time + pre_time
print(f"Elapsed time for sentiment analysis: {elapsed_time} seconds")

Elapsed time for sentiment analysis: 86.5876111984253 seconds


In [82]:
df

Unnamed: 0,review_text,Word_Count,Slang_Count,Character_Count,Punctuation_Count,Emoticon_Count,Emotion_Punctuation_Count,Capital_Letter_Count,Positive_Words_Count,Negative_Words_Count,Neutral_Words_Count,Predicted_Target,Hybrid_Sentiment
0,"I bought both boxed sets, books 1-5. Really a...",87,0,343,10,0,3,9,3,1,74,2,5.0
1,I enjoyed this short book. But it was way way ...,28,0,99,6,0,0,3,1,0,24,1,4.0
2,I really enjoyed this adventure and look forwa...,34,0,138,3,0,0,5,4,0,27,2,5.0
3,It was a decent read.. typical story line. Not...,25,0,89,6,0,0,3,0,0,20,2,3.0
4,"This is the First book in the Trilogy, and I'm...",34,0,147,4,0,0,5,2,0,28,2,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"Exciting, riveting and well plotted. With each...",29,0,136,4,0,0,3,4,0,21,1,5.0
996,Found this book in a give-away area of my loca...,88,0,313,12,0,0,8,2,2,74,2,3.0
997,I love this series and i can't wait for the la...,57,0,204,5,0,0,6,6,0,46,2,5.0
998,this is a beautiful coming to age story. I lik...,32,0,127,4,0,0,4,4,0,24,2,4.0


In [83]:
sentence = "Its raining cats and dogs"
print(f"VADER score: {vader_sentiment_score(sentence)}")
print(f"BERT score: {bert_sentiment_score(sentence)}")

VADER score: 3
BERT score: 3


In [84]:
sentence = "omg, this book"
print(f"VADER score: {vader_sentiment_score(sentence)}")
print(f"BERT score: {bert_sentiment_score(sentence)}")

VADER score: 3
BERT score: 5


In [85]:
# Find Time Taken to Process
start_time = time.time()
df['BERT_Sentiment'] = df['review_text'].apply(lambda x: bert_sentiment_score(x[:512]))
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 110.93239307403564 seconds


In [86]:
# Apply your existing function
start_time = time.time()
df['VADER_Sentiment'] = df['review_text'].apply(vader_sentiment_score)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 10.007499933242798 seconds


In [87]:
# Merge the DataFrames based on the 'review_text' column
merged_df = pd.merge(df, rating_df, on='review_text', how='left')

In [88]:
merged_df

Unnamed: 0,review_text,Word_Count,Slang_Count,Character_Count,Punctuation_Count,Emoticon_Count,Emotion_Punctuation_Count,Capital_Letter_Count,Positive_Words_Count,Negative_Words_Count,Neutral_Words_Count,Predicted_Target,Hybrid_Sentiment,BERT_Sentiment,VADER_Sentiment,human_rating
0,"I bought both boxed sets, books 1-5. Really a...",87,0,343,10,0,3,9,3,1,74,2,5.0,5,5,5
1,I enjoyed this short book. But it was way way ...,28,0,99,6,0,0,3,1,0,24,1,4.0,3,4,3
2,I really enjoyed this adventure and look forwa...,34,0,138,3,0,0,5,4,0,27,2,5.0,5,5,4
3,It was a decent read.. typical story line. Not...,25,0,89,6,0,0,3,0,0,20,2,3.0,3,4,3
4,"This is the First book in the Trilogy, and I'm...",34,0,147,4,0,0,5,2,0,28,2,4.0,4,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"Exciting, riveting and well plotted. With each...",29,0,136,4,0,0,3,4,0,21,1,5.0,5,5,5
996,Found this book in a give-away area of my loca...,88,0,313,12,0,0,8,2,2,74,2,3.0,3,3,2
997,I love this series and i can't wait for the la...,57,0,204,5,0,0,6,6,0,46,2,5.0,5,5,5
998,this is a beautiful coming to age story. I lik...,32,0,127,4,0,0,4,4,0,24,2,4.0,4,5,3


In [89]:
from sklearn.metrics import classification_report

# Define true labels (y_true) and predicted labels for each approach
y_true = merged_df['human_rating']
y_pred_hybrid = merged_df['Hybrid_Sentiment']
y_pred_bert = merged_df['BERT_Sentiment']
y_pred_vader = merged_df['VADER_Sentiment']

# Calculate classification report for each approach
report_hybrid = classification_report(y_true, y_pred_hybrid)
report_bert = classification_report(y_true, y_pred_bert)
report_vader = classification_report(y_true, y_pred_vader)

# Print the reports
print("Hybrid Approach:")
print(report_hybrid)

print("\nBERT Approach:")
print(report_bert)

print("\nVADER Approach:")
print(report_vader)

Hybrid Approach:
              precision    recall  f1-score   support

           1       0.46      0.46      0.46        26
           2       0.27      0.44      0.34        32
           3       0.37      0.43      0.40        99
           4       0.35      0.43      0.38       216
           5       0.82      0.71      0.76       627

    accuracy                           0.61      1000
   macro avg       0.45      0.49      0.47      1000
weighted avg       0.65      0.61      0.62      1000


BERT Approach:
              precision    recall  f1-score   support

           1       0.41      0.46      0.44        26
           2       0.27      0.47      0.34        32
           3       0.36      0.45      0.40        99
           4       0.36      0.51      0.42       216
           5       0.86      0.66      0.75       627

    accuracy                           0.60      1000
   macro avg       0.45      0.51      0.47      1000
weighted avg       0.67      0.60      0.62 

In [90]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Calculate MAE and MSE between human_rating and final_sentiment columns
mae_final_sentiment = mean_absolute_error(merged_df['human_rating'], merged_df['Hybrid_Sentiment'])
mse_final_sentiment = mean_squared_error(merged_df['human_rating'], merged_df['Hybrid_Sentiment'])

# Calculate MAE and MSE between human_rating and BERT_Sentiment columns
mae_bert_sentiment = mean_absolute_error(merged_df['human_rating'], merged_df['BERT_Sentiment'])
mse_bert_sentiment = mean_squared_error(merged_df['human_rating'], merged_df['BERT_Sentiment'])

# Calculate MAE and MSE between human_rating and VADER_Sentiment columns
mae_vader_sentiment = mean_absolute_error(merged_df['human_rating'], merged_df['VADER_Sentiment'])
mse_vader_sentiment = mean_squared_error(merged_df['human_rating'], merged_df['VADER_Sentiment'])

# Print the results
print("human_rating vs. final_sentiment:")
print("MAE:", mae_final_sentiment)
print("MSE:", mse_final_sentiment)
print()

print("human_rating vs. BERT_Sentiment:")
print("MAE:", mae_bert_sentiment)
print("MSE:", mse_bert_sentiment)
print()

print("human_rating vs. VADER_Sentiment:")
print("MAE:", mae_vader_sentiment)
print("MSE:", mse_vader_sentiment)

human_rating vs. final_sentiment:
MAE: 0.491
MSE: 0.755

human_rating vs. BERT_Sentiment:
MAE: 0.502
MSE: 0.778

human_rating vs. VADER_Sentiment:
MAE: 0.647
MSE: 1.047


In [91]:
# Calculate Exact Match Accuracy for final sentiment
exact_match_accuracy_final = (merged_df['Hybrid_Sentiment'] == merged_df['human_rating']).mean()

# Calculate Off-by-1 Accuracy for final sentiment
off_by_1_accuracy_final = ((merged_df['Hybrid_Sentiment'] - merged_df['human_rating']).abs() <= 1).mean()

print("Final Sentiment:")
print("Exact Match Accuracy:", exact_match_accuracy_final)
print("Accuracy (Off-by-1):", off_by_1_accuracy_final)

# Calculate Exact Match Accuracy for BERT sentiment
exact_match_accuracy_bert = (merged_df['BERT_Sentiment'] == merged_df['human_rating']).mean()

# Calculate Off-by-1 Accuracy for BERT sentiment
off_by_1_accuracy_bert = ((merged_df['BERT_Sentiment'] - merged_df['human_rating']).abs() <= 1).mean()

print("\nBERT Sentiment:")
print("Exact Match Accuracy:", exact_match_accuracy_bert)
print("Accuracy (Off-by-1):", off_by_1_accuracy_bert)

# Calculate Exact Match Accuracy for VADER sentiment
exact_match_accuracy_vader = (merged_df['VADER_Sentiment'] == merged_df['human_rating']).mean()

# Calculate Off-by-1 Accuracy for VADER sentiment
off_by_1_accuracy_vader = ((merged_df['VADER_Sentiment'] - merged_df['human_rating']).abs() <= 1).mean()

print("\nVADER Sentiment:")
print("Exact Match Accuracy:", exact_match_accuracy_vader)
print("Accuracy (Off-by-1):", off_by_1_accuracy_vader)

Final Sentiment:
Exact Match Accuracy: 0.606
Accuracy (Off-by-1): 0.929

BERT Sentiment:
Exact Match Accuracy: 0.598
Accuracy (Off-by-1): 0.928

VADER Sentiment:
Exact Match Accuracy: 0.514
Accuracy (Off-by-1): 0.875
