In [19]:
import pandas as pd
import re
import string
from afinn import Afinn
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from joblib import load
import nltk
import torch
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Lower-case chat/internet abbreviations counted as slang tokens.
# Entries are grouped alphabetically, one initial letter per line.
custom_slang_words = {
    'af', 'afaict', 'afaik', 'afk',
    'baka', 'bbiab', 'bbs', 'bff', 'bfn', 'brb', 'bruh', 'btw',
    'cu', 'cul', 'cul8r', 'cya', 'cys',
    'fml', 'fomo', 'ftw', 'fud', 'fwiw', 'fyi',
    'g2g', 'gr8', 'gtfo', 'gtg', 'gth',
    'icymi', 'idc', 'idk', 'ily', 'ily2', 'imho', 'imo', 'imy', 'iow', 'irl',
    'jk',
    'l8', 'l8r', 'lmao', 'lmfao', 'lmk', 'lms', 'lol', 'lolz', 'luv',
    'nbd', 'np', 'nvm',
    'omfg', 'omg', 'omw', 'oomf', 'ootd', 'otw',
    'pls',
    'rip', 'rn', 'rofl', 'roflmao', 'rotfl', 'rotflmao',
    'smfh', 'smh', 'srs', 'stfu',
    'tbh', 'tbt', 'tfw', 'thnx', 'thx', 'tldr', 'tmi', 'tty', 'ttyl', 'ttys',
    'ttyt', 'ty',
    'wb', 'wbu', 'wtf', 'wth',
    'xoxo',
    'yass', 'yolo', 'yw',
}

# Load the trained model that predict_target() calls .predict() on.
# NOTE(review): joblib persistence is pickle-based, so loading can execute
# arbitrary code -- only load files from a trusted source.
model = load('final_model.joblib')

# Load AFINN lexicon for sentiment analysis (used to score individual tokens
# as positive / negative / neutral in predict_target()).
afinn = Afinn()

# Load BERT tokenizer and model for sentiment analysis.
# Both must come from the same checkpoint so the token ids match the model.
# NOTE(review): from_pretrained presumably downloads the checkpoint on first
# use unless it is already cached locally -- confirm for offline runs.
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model_bert = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# Load the TF-IDF vectorizer model used to transform the sentence text in
# predict_target(). Same pickle caveat as for final_model.joblib above.
tfidf_vectorizer = load('tfidf_vectorizer_model.joblib')

# Emoticon shape: eyes (":", ";" or "="), optional "-" nose, then one of
# ")", "(", "D", "P" as the mouth.
_EMOTICON_RE = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
# A run of consecutive exclamation marks counts as a single occurrence.
_EXCLAMATION_RUN_RE = re.compile(r'[!]+')


def _extract_handcrafted_features(sentence):
    """Return the ten hand-crafted numeric features for *sentence*.

    The sentence is tokenized once with NLTK's ``word_tokenize`` and the
    tokens are reused for the word, character, slang and AFINN counts.

    Returns:
        A list in this fixed order: word count, slang count, character
        count (summed over tokens), punctuation count, emoticon count,
        exclamation-run count, capital-letter count, and the numbers of
        tokens with a positive, negative and zero AFINN score.
    """
    tokens = word_tokenize(sentence)

    # Token-level counts.
    word_count = len(tokens)
    character_count = sum(len(token) for token in tokens)
    slang_count = sum(1 for token in tokens if token.lower() in custom_slang_words)

    # Character-level counts are taken over the raw sentence, not the tokens.
    punctuation_count = sum(1 for char in sentence if char in string.punctuation)
    emoticon_count = len(_EMOTICON_RE.findall(sentence))
    emotion_punctuation_count = len(_EXCLAMATION_RUN_RE.findall(sentence))
    capital_letter_count = sum(1 for char in sentence if char.isupper())

    # Classify every token by the sign of its AFINN score.
    positive_count = 0
    negative_count = 0
    neutral_count = 0
    for token in tokens:
        sentiment_score = afinn.score(token)
        if sentiment_score > 0:
            positive_count += 1
        elif sentiment_score < 0:
            negative_count += 1
        else:
            neutral_count += 1

    # The order must stay fixed: these values become the leading columns of
    # the matrix handed to the trained model.
    return [word_count, slang_count, character_count, punctuation_count, emoticon_count,
            emotion_punctuation_count, capital_letter_count, positive_count, negative_count,
            neutral_count]


def predict_target(sentence):
    """Predict the target label of *sentence* with the trained model.

    Ten hand-crafted counts are combined with the TF-IDF representation of
    the sentence and passed to ``model.predict``.

    Args:
        sentence: Raw input text.

    Returns:
        The first element of the array returned by ``model.predict``.
    """
    features = _extract_handcrafted_features(sentence)

    # Append spaces so that words + appended spaces total 2000 (nothing is
    # appended once the sentence has 2000 or more whitespace-separated words).
    # NOTE(review): this adds only whitespace; whether it changes the TF-IDF
    # output depends on how the persisted vectorizer was configured. Kept
    # as-is, presumably to mirror the preprocessing used at training time --
    # confirm before removing.
    padded_sentence = sentence + " " * (2000 - len(sentence.split()))

    # Vectorize the text and densify it for concatenation.
    X_text = tfidf_vectorizer.transform([padded_sentence])
    X_text_array = X_text.toarray()

    # Hand-crafted features come first, followed by the TF-IDF columns.
    X_combined = hstack((features, X_text_array))

    predicted_target = model.predict(X_combined)
    return predicted_target[0]

# Shared VADER analyzer, created lazily on the first call. Constructing a
# SentimentIntensityAnalyzer re-reads its lexicon, so it is built once and
# reused instead of being rebuilt for every sentence.
_vader_analyzer = None


# Function to get sentiment score using VADER
def vader_sentiment_score(sentence):
    """Return the VADER sentiment of *sentence* as an integer from 1 to 5.

    The analyzer's ``compound`` score is taken from ``polarity_scores`` and
    rescaled with ``convert_sentiment_score``.

    Args:
        sentence: Raw input text.

    Returns:
        The value returned by ``convert_sentiment_score`` for the compound
        score.
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()

    sentiment_dict = _vader_analyzer.polarity_scores(sentence)
    vader_score = sentiment_dict['compound']
    return convert_sentiment_score(vader_score)

# Function to convert VADER sentiment score to 1-5 scale
def convert_sentiment_score(score):
    """Linearly rescale *score* from [-1, 1] to [1, 5] and round it.

    Rounding uses Python's built-in ``round``, so exact halves go to the
    nearest even integer (for example -0.25 -> 2 and 0.75 -> 4).

    Args:
        score: Sentiment score on the -1 .. 1 scale.

    Returns:
        The rescaled score rounded to an integer.
    """
    source_low, source_high = -1, 1
    target_low, target_high = 1, 5

    distance_from_low = score - source_low
    stretched = distance_from_low * (target_high - target_low)
    rescaled = stretched / (source_high - source_low) + target_low
    return round(rescaled)

# Function to get sentiment score using BERT
def bert_sentiment_score(review):
    """Return the BERT-predicted sentiment of *review* as an integer from 1 up.

    The text is encoded with the module-level ``tokenizer``, run through
    ``model_bert``, and the index of the largest logit (plus one) is
    returned.

    Args:
        review: Raw input text.

    Returns:
        ``argmax(logits) + 1`` as a Python int.
    """
    # Truncate to the model's position-embedding limit; longer inputs would
    # otherwise fail inside the model instead of being scored.
    tokens = tokenizer.encode(
        review,
        return_tensors='pt',
        truncation=True,
        max_length=model_bert.config.max_position_embeddings,
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        result = model_bert(tokens)
    return int(torch.argmax(result.logits)) + 1

# Function to predict sentiment score based on target and select VADER or BERT
def hybrid_sentiment_score(sentence):
    """Score *sentence* with VADER or BERT, chosen by ``predict_target``.

    When ``predict_target(sentence)`` equals 1 the VADER score is returned;
    for any other predicted value the BERT score is returned.

    Args:
        sentence: Raw input text.

    Returns:
        The score produced by the selected scorer.
    """
    use_vader = predict_target(sentence) == 1
    scorer = vader_sentiment_score if use_vader else bert_sentiment_score
    return scorer(sentence)

In [20]:
# Smoke test: a plainly positive sentence; the recorded run prints 5.
sentence = "This is a great product!"
print("Predicted Sentiment:", hybrid_sentiment_score(sentence))

Predicted Sentiment: 5


In [21]:
# Comparative statement without explicit sentiment words: in the recorded run
# VADER gives 3 while BERT and the hybrid give 4.
sentence = "The laptop is lighter than I expected."
print(f"VADER score: {vader_sentiment_score(sentence)}")
print(f"BERT score: {bert_sentiment_score(sentence)}")
print(f"Hybrid score: {hybrid_sentiment_score(sentence)}")

VADER score: 3
BERT score: 4
Hybrid score: 4


In [22]:
# Sarcastic complaint (late delivery described as "Fantastic service"):
# in the recorded run all three scorers return 5.
sentence = "Wow, what a surprise, my package arrived two days late AGAIN. Fantastic service"
print(f"VADER score: {vader_sentiment_score(sentence)}")
print(f"BERT score: {bert_sentiment_score(sentence)}")
print(f"Hybrid score: {hybrid_sentiment_score(sentence)}")

VADER score: 5
BERT score: 5
Hybrid score: 5


In [27]:
# Sentiment carried only by emoji: in the recorded run VADER gives 1 while
# BERT and the hybrid give 5.
sentence = "the service is 😞😞😞😞😞"
print(f"VADER score: {vader_sentiment_score(sentence)}")
print(f"BERT score: {bert_sentiment_score(sentence)}")
print(f"Hybrid score: {hybrid_sentiment_score(sentence)}")

VADER score: 1
BERT score: 5
Hybrid score: 5


In [26]:
# Mixed English/Spanish input: in the recorded run VADER gives 3 while BERT
# and the hybrid give 5.
sentence = "The book is muy muy interesante"
print(f"VADER score: {vader_sentiment_score(sentence)}")
print(f"BERT score: {bert_sentiment_score(sentence)}")
print(f"Hybrid score: {hybrid_sentiment_score(sentence)}")

VADER score: 3
BERT score: 5
Hybrid score: 5
