Step 1: Loading and Preparing the Data

In [1]:
import pandas as pd
import time

# Paths to the CSV files
train_csv_path = 'Data/train.csv'  
test_csv_path = 'Data/test.csv'    

# Read the datasets
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

reviews_train = train_df['review'].values
labels_train = train_df['sentiment'].values
reviews_test = test_df['review'].values
labels_test = test_df['sentiment'].values


Step 2: Text Preprocessing and Sentiment Analysis

In [2]:
from nltk.corpus import sentiwordnet as swn, wordnet as wn
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

def get_wordnet_pos(treebank_tag):
    """Converts treebank POS tags to WordNet POS tags."""
    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return None

def calculate_sentiwordnet_score(review):
    lemmatizer = WordNetLemmatizer()
    sentiment_score = 0
    tokens = word_tokenize(review)
    tagged = pos_tag(tokens)
    
    for word, tag in tagged:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
            continue
        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if not lemma:
            continue
        synsets = wn.synsets(lemma, pos=wn_tag)
        if not synsets:
            continue
        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())
        sentiment_score += swn_synset.pos_score() - swn_synset.neg_score()
    return sentiment_score


Step 3: Feature Engineering and Data Augmentation

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(reviews_train)

# Calculate sentiment scores
sentiment_scores = np.array([calculate_sentiwordnet_score(review) for review in reviews_train]).reshape(-1, 1)

# Augment TF-IDF features with sentiment scores
X_augmented = np.hstack((X_tfidf.toarray(), sentiment_scores))
X_augmented = normalize(X_augmented)  # Normalize features


Step 4: Model Training and Evaluation

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_augmented, labels_train, test_size=0.2, random_state=42)

# Train Logistic Regression model
lr_model = LogisticRegression()

# Record the start time
start_time = time.time()

lr_model.fit(X_train, y_train)

# Record the end time
end_time = time.time()

# Calculate and print the training time
training_time = end_time - start_time
print(f'Training time: {training_time} seconds')

# Predict and evaluate
predictions = lr_model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, predictions)}')


Training time: 3.0431196689605713 seconds
Accuracy: 0.8128333333333333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
