Step 1: Import Necessary Libraries


In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import sentiwordnet as swn
from afinn import Afinn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import nltk
import time

Step 2: Read the Dataset


In [2]:
train_df = pd.read_csv('Data/train.csv')
test_df = pd.read_csv('Data/test.csv')

reviews_train = train_df['review'].values
sentiments_train = train_df['sentiment'].values
reviews_test = test_df['review'].values
sentiments_test = test_df['sentiment'].values


Step 3: Define Lexicon-based Functions


In [3]:
nltk.download('sentiwordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')


def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('V'):
        return wn.VERB
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    else:
        return None


def sentiwordnet_score(review):
    lemmatizer = WordNetLemmatizer()
    sentiment_score = 0

    tokens = word_tokenize(review)
    tagged = pos_tag(tokens)

    for word, tag in tagged:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag is None:
            continue

        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if not lemma:
            continue

        synsets = wn.synsets(lemma, pos=wn_tag)
        if not synsets:
            continue

        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())

        sentiment_score += swn_synset.pos_score() - swn_synset.neg_score()

    return 'positive' if sentiment_score > 0 else 'negative'


afinn = Afinn()

def afinn_score(review):
    score = afinn.score(review)
    return 'positive' if score > 0 else 'negative'


[nltk_data] Downloading package sentiwordnet to C:\Users\Abdullah
[nltk_data]     Maqsood\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Abdullah
[nltk_data]     Maqsood\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Abdullah Maqsood\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to C:\Users\Abdullah
[nltk_data]     Maqsood\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Step 4: Preprocess Data for ML Models


In [4]:
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(reviews_train)
X_test_tfidf = vectorizer.transform(reviews_test)


Step 5: Train Base Classifiers and Prepare Stacking Features


In [5]:
# Initialize base classifiers
gb_model = GradientBoostingClassifier(random_state=42)
knn_model = KNeighborsClassifier()

# Start time for base classifiers
start_time_base = time.time()

# Fit base classifiers
gb_model.fit(X_train_tfidf, sentiments_train)
knn_model.fit(X_train_tfidf, sentiments_train)

# End time for base classifiers
end_time_base = time.time()

# Generate predictions for stacking features
gb_predictions_train = gb_model.predict_proba(X_train_tfidf)[:, 1]
knn_predictions_train = knn_model.predict_proba(X_train_tfidf)[:, 1]
swn_predictions_train = np.array([sentiwordnet_score(review) for review in reviews_train])  
afinn_predictions_train = np.array([afinn_score(review) for review in reviews_train]) 

# Stack predictions to create new feature set for the meta-classifier
stacked_features_train = np.column_stack((gb_predictions_train, knn_predictions_train, swn_predictions_train, afinn_predictions_train))

# Define a mapping between sentiment labels and numeric values
sentiment_mapping = {'positive': 1, 'negative': 0}

# Convert lexicon-based method predictions to numeric for the training set
swn_predictions_train_numeric = np.array([sentiment_mapping[pred] for pred in swn_predictions_train])
afinn_predictions_train_numeric = np.array([sentiment_mapping[pred] for pred in afinn_predictions_train])

# Stack all numeric predictions to create new feature set for the meta-classifier
stacked_features_train = np.column_stack((
    gb_predictions_train, 
    knn_predictions_train, 
    swn_predictions_train_numeric, 
    afinn_predictions_train_numeric
))


Step 6: Train the Meta-classifier


In [6]:
# Initialize and fit the meta-classifier
meta_classifier = LogisticRegression()

# Start time for meta classifier
start_time_meta = time.time()

meta_classifier.fit(stacked_features_train, sentiments_train)

# End time for meta classifier
end_time_meta = time.time()

Step 7: Prepare Test Features and Evaluate the Model


In [7]:
# Generate predictions for the test set
gb_predictions_test = gb_model.predict_proba(X_test_tfidf)[:, 1]
knn_predictions_test = knn_model.predict_proba(X_test_tfidf)[:, 1]
swn_predictions_test = np.array([sentiwordnet_score(review) for review in reviews_test])  
afinn_predictions_test = np.array([afinn_score(review) for review in reviews_test])  

# Convert lexicon-based method predictions to numeric for the test set
swn_predictions_test_numeric = np.array([sentiment_mapping[pred] for pred in swn_predictions_test])
afinn_predictions_test_numeric = np.array([sentiment_mapping[pred] for pred in afinn_predictions_test])

# Stack all numeric predictions to create new feature set for the meta-classifier
stacked_features_test = np.column_stack((
    gb_predictions_test, 
    knn_predictions_test, 
    swn_predictions_test_numeric, 
    afinn_predictions_test_numeric
))

# Predict using the meta-classifier
final_predictions = meta_classifier.predict(stacked_features_test)

# Evaluate accuracy
accuracy = accuracy_score(sentiments_test, final_predictions)
print(f'Stacking model accuracy: {accuracy}')

print(f'Training time for base classifiers: {end_time_base - start_time_base} seconds')
print(f'Training time for meta classifier: {end_time_meta - start_time_meta} seconds')


Stacking model accuracy: 0.8139
Training time for base classifiers: 225.01136016845703 seconds
Training time for meta classifier: 0.13895344734191895 seconds
