Step 1: Setup and Read Data

In [1]:
import pandas as pd
import time

# Locations of the labelled train / test splits (relative to the notebook)
train_csv_path = 'Data/train.csv'
test_csv_path = 'Data/test.csv'

# Read both splits, train first, then test
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

# Pull the review text and the sentiment label out of each split as arrays
reviews_train, sentiments_train = (train_df[column].values for column in ('review', 'sentiment'))
reviews_test, sentiments_test = (test_df[column].values for column in ('review', 'sentiment'))


Step 2: Define Lexicon-based Sentiment Analysis Functions

In [2]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk import pos_tag, word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from afinn import Afinn
import nltk
# Fetch every NLTK resource the lexicon-based scorer below relies on.
# `wordnet` backs both `wn.synsets` and `WordNetLemmatizer`; it was previously
# assumed to be installed already, which breaks a run on a fresh machine.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' - confirm against the installed version.
nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Translate a tag produced by NLTK's POS tagger into the matching WordNet POS constant
def get_wordnet_pos(tag):
    """Return the WordNet part-of-speech constant for `tag`, or None if there is none.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag yields None.
    """
    # (tag prefix, name of the constant on `wn`), checked in this order
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_table:
        if tag.startswith(prefix):
            return getattr(wn, constant_name)
    return None

# Define function for SentiWordNet
def sentiwordnet_score(review):
    """Classify a review as 'positive' or 'negative' using SentiWordNet.

    The review is tokenized and POS-tagged. Every token whose tag maps to a
    WordNet part of speech is lemmatized and looked up in WordNet; the first
    synset returned is scored as (positive score - negative score) and the
    scores are summed over the whole review.

    Args:
        review: The review text to classify.

    Returns:
        'positive' if the summed score is strictly greater than zero,
        otherwise 'negative' (a score of exactly zero counts as negative).
    """
    lemmatizer = WordNetLemmatizer()
    sentiment_score = 0

    # Tokenize and POS tag the review
    for word, tag in pos_tag(word_tokenize(review)):
        wn_tag = get_wordnet_pos(tag)
        if wn_tag is None:
            # Tag has no WordNet counterpart
            continue

        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if not lemma:
            continue

        synsets = wn.synsets(lemma, pos=wn_tag)
        if not synsets:
            continue

        # Take the first sense, the most common
        swn_synset = swn.senti_synset(synsets[0].name())
        if swn_synset is None:
            # senti_synset yields None for a synset without a SentiWordNet
            # entry; skip it instead of failing on `.pos_score()`.
            continue

        sentiment_score += swn_synset.pos_score() - swn_synset.neg_score()

    # Return 'positive' if the sentiment score is positive, else 'negative'
    return 'positive' if sentiment_score > 0 else 'negative'

# AFINN-based classifier: a single scorer instance is shared by every call
afinn = Afinn()
def afinn_score(review):
    """Label `review` 'positive' when its AFINN score is above zero, else 'negative'."""
    if afinn.score(review) > 0:
        return 'positive'
    return 'negative'


[nltk_data] Downloading package sentiwordnet to C:\Users\Abdullah
[nltk_data]     Maqsood\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Abdullah
[nltk_data]     Maqsood\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Abdullah Maqsood\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to C:\Users\Abdullah
[nltk_data]     Maqsood\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Step 3: Preprocess Data for ML Models


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the ML models; max_features=1000 limits the vectorizer to 1000 features.
vectorizer = TfidfVectorizer(max_features=1000)
# The vectorizer is fitted on the training reviews only ...
X_train = vectorizer.fit_transform(reviews_train)
# ... and the test reviews are transformed with that already-fitted vectorizer.
X_test = vectorizer.transform(reviews_test)


Step 4: Train Machine Learning Models


In [4]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Initialize models (random_state is pinned to 42 for the two tree-based learners)
gb_model = GradientBoostingClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
knn_model = KNeighborsClassifier()

# Record the start time.
# perf_counter() is a monotonic clock intended for measuring elapsed intervals,
# so the measured duration cannot be distorted by system clock adjustments.
# Only the difference between the two readings is meaningful.
start_time = time.perf_counter()

# Fit models on the TF-IDF training features
gb_model.fit(X_train, sentiments_train)
dt_model.fit(X_train, sentiments_train)
knn_model.fit(X_train, sentiments_train)

# Record the end time
end_time = time.perf_counter()


Step 5: Combine Predictions and Vote


In [5]:
# Numeric encoding used for the vote: 'positive' -> 1, 'negative' -> 0
sentiment_mapping = {'positive': 1, 'negative': 0}

import numpy as np
from scipy.stats import mode


def _labels_to_numeric(labels):
    """Translate an iterable of sentiment labels into an array of their numeric codes."""
    return np.array([sentiment_mapping[label] for label in labels])


# Label predictions from the three trained classifiers
predictions_gb = gb_model.predict(X_test)
predictions_dt = dt_model.predict(X_test)
predictions_knn = knn_model.predict(X_test)

# Label predictions from the two lexicon-based scorers, one call per test review
predictions_swn = np.array(list(map(sentiwordnet_score, reviews_test)))
predictions_afinn = np.array(list(map(afinn_score, reviews_test)))

# Numeric versions of the lexicon-based predictions
predictions_swn_numeric = _labels_to_numeric(predictions_swn)
predictions_afinn_numeric = _labels_to_numeric(predictions_afinn)

# Numeric versions of the classifier predictions
predictions_gb_numeric = _labels_to_numeric(predictions_gb)
predictions_dt_numeric = _labels_to_numeric(predictions_dt)
predictions_knn_numeric = _labels_to_numeric(predictions_knn)

# Stack the voters: one row per method, one column per review
combined_predictions_numeric = np.vstack([
    predictions_gb_numeric,
    predictions_dt_numeric,
    predictions_knn_numeric,
    predictions_swn_numeric,
    predictions_afinn_numeric,
])

# Majority vote: the most frequent value in each column
final_predictions_numeric, _ = mode(combined_predictions_numeric, axis=0)

# Inverse lookup, numeric code -> sentiment label
reverse_sentiment_mapping = {code: label for label, code in sentiment_mapping.items()}

# Turn the winning codes back into sentiment labels
final_predictions = np.array([reverse_sentiment_mapping[vote] for vote in final_predictions_numeric.ravel()])



Step 6: Evaluate Accuracy


In [6]:
from sklearn.metrics import accuracy_score

# Elapsed time between the two readings taken around the model fitting
training_time = end_time - start_time
print(f'Training time: {training_time} seconds')

# Accuracy of the voted labels against the true test labels
accuracy = accuracy_score(y_true=sentiments_test, y_pred=final_predictions)
print(f'Hybrid model accuracy: {accuracy}')


Training time: 258.66820001602173 seconds
Hybrid model accuracy: 0.7876
