In [1]:
# %% Import Libraries
import os
import nltk
import random
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')




[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# %% File paths
TRAIN_FILE = './corpus/train.tsv'
LIWC_FILE = './SentimentLexicons/liwcdic2007.dic'

# Load LIWC Words
def read_words(file_path):
    poslist, neglist = [], []
    with open(file_path, encoding='latin1') as flexicon:
        wordlines = [line.strip() for line in flexicon]
        for line in wordlines:
            if not line == '':
                items = line.split()
                word, classes = items[0], items[1:]
                for c in classes:
                    if c == '126':  # Positive emotion
                        poslist.append(word)
                    elif c == '127':  # Negative emotion
                        neglist.append(word)
    return poslist, neglist
liwc_pos, liwc_neg = read_words(LIWC_FILE)

# Load Data
def load_data(file_path, limit=None):
    phrases, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as file:
        next(file)  # Skip header
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) == 4:
                phrases.append(parts[2])  # Phrase text
                labels.append(int(parts[3]))  # Sentiment label
                if limit and len(phrases) >= limit:
                    break
    return phrases, labels

# Preprocess Text
def preprocess_text(text):
    tokenizer = TreebankWordTokenizer()
    stop_words = set(stopwords.words('english'))
    tokens = tokenizer.tokenize(text.lower())
    return [word for word in tokens if word.isalpha() and word not in stop_words]

# Bag-of-Words Features
def bag_of_words_features(document, word_features):
    document_words = set(document)
    return {'contains({})'.format(word): (word in document_words) for word in word_features}

# LIWC Features
def liwc_features(document):
    pos_count = sum(1 for word in document if word in liwc_pos)
    neg_count = sum(1 for word in document if word in liwc_neg)
    return {'LIWC_positive': pos_count, 'LIWC_negative': neg_count}

# Combined Features
def combined_features(document, word_features):
    features = bag_of_words_features(document, word_features)
    features.update(liwc_features(document))
    return features

# Model Evaluation
def evaluate_model(y_true, y_pred):
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))


In [3]:
# %% Load and preprocess data
train_phrases, train_labels = load_data(TRAIN_FILE, limit=10000)
train_phrases = [preprocess_text(phrase) for phrase in train_phrases]

# Extract Bag-of-Words Features
all_words = [word for phrase in train_phrases for word in phrase]
word_features = list(nltk.FreqDist(all_words))[:1500]

# Create Combined Features
combined_feature_sets = [
    (combined_features(phrase, word_features), label)
    for phrase, label in zip(train_phrases, train_labels)
]

# Convert Features to Scikit-Learn Format
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform([features for features, label in combined_feature_sets])
y = [label for _, label in combined_feature_sets]

# Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [None]:
#do not run
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

In [5]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'class_weight': ['balanced']
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1_macro', cv=3)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)


{'class_weight': 'balanced', 'max_depth': 30, 'n_estimators': 300}


In [6]:
# %% Logistic Regression
clf_logreg = LogisticRegression(class_weight='balanced', max_iter=500, solver='lbfgs', multi_class='multinomial')
clf_logreg.fit(X_train, y_train)
predictions_logreg = clf_logreg.predict(X_test)
print("Logistic Regression Results:")
evaluate_model(y_test, predictions_logreg)

# %% Random Forest
clf_rf = RandomForestClassifier(class_weight='balanced', n_estimators=100)
clf_rf.fit(X_train, y_train)
predictions_rf = clf_rf.predict(X_test)
print("\nRandom Forest Results:")
evaluate_model(y_test, predictions_rf)

# %% Naive Bayes
random.shuffle(combined_feature_sets)
train_size = int(0.8 * len(combined_feature_sets))
train_set, test_set = combined_feature_sets[:train_size], combined_feature_sets[train_size:]
classifier_nb = nltk.NaiveBayesClassifier.train(train_set)
print("\nNaive Bayes Accuracy:", nltk.classify.util.accuracy(classifier_nb, test_set))




Logistic Regression Results:
Confusion Matrix:
[[864  87 224   0   5]
 [134 591 280  32  18]
 [ 55 180 676 140  81]
 [ 27  37 292 648 153]
 [  5   6 202 103 798]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.73      0.76      1180
           1       0.66      0.56      0.60      1055
           2       0.40      0.60      0.48      1132
           3       0.70      0.56      0.62      1157
           4       0.76      0.72      0.74      1114

    accuracy                           0.63      5638
   macro avg       0.66      0.63      0.64      5638
weighted avg       0.66      0.63      0.64      5638


Random Forest Results:
Confusion Matrix:
[[960  70 142   3   5]
 [ 76 693 239  33  14]
 [ 16 116 870 116  14]
 [  6  33 259 754 105]
 [  9   7  99  75 924]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.81      0.85      1180
           1       0.75      0.66