In [23]:
import os
import nltk
import random
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix
from nltk import NaiveBayesClassifier
from nltk.classify.util import accuracy
from sentiment_read_LIWC_pos_neg_words import read_words

# Download necessary NLTK resources
nltk.download('stopwords')

# File paths
TRAIN_FILE = './corpus/train.tsv'
LIWC_FILE = './SentimentLexicons/liwcdic2007.dic'

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
# Load data function
def load_data(file_path, limit=None):
    phrases = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as file:
        next(file)  # Skip header
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) == 4:
                phrases.append(parts[2])
                labels.append(int(parts[3]))
                if limit and len(phrases) >= limit:
                    break
    return phrases, labels

# Text preprocessing function
def preprocess_text(text):
    tokenizer = TreebankWordTokenizer()
    stop_words = set(stopwords.words('english'))
    tokens = tokenizer.tokenize(text.lower())
    cleaned_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return cleaned_tokens


In [27]:
def read_words(file_path):
    poslist = []
    neglist = []

    with open(file_path, encoding='latin1') as flexicon:
        # Read all LIWC words from file
        wordlines = [line.strip() for line in flexicon]
        for line in wordlines:
            if not line == '':
                items = line.split()
                word = items[0]
                classes = items[1:]
                for c in classes:
                    if c == '126':  # Positive emotion
                        poslist.append(word)
                    if c == '127':  # Negative emotion
                        neglist.append(word)
    return poslist, neglist

In [28]:
# Feature engineering: Bag-of-Words
def bag_of_words_features(document, word_features):
    document_words = set(document)
    features = {'contains({})'.format(word): (word in document_words) for word in word_features}
    return features

# Feature engineering: LIWC features
liwc_pos, liwc_neg = read_words(LIWC_FILE)
def liwc_features(document):
    pos_count = sum(1 for word in document if word in liwc_pos)
    neg_count = sum(1 for word in document if word in liwc_neg)
    return {'LIWC_positive': pos_count, 'LIWC_negative': neg_count}

# Combined features
def combined_features(document, word_features):
    features = bag_of_words_features(document, word_features)
    features.update(liwc_features(document))
    return features

# Evaluation function
def evaluate_model(classifier, test_set):
    gold_labels = [label for _, label in test_set]
    predicted_labels = [classifier.classify(features) for features, _ in test_set]
    print("\nConfusion Matrix:")
    print(confusion_matrix(gold_labels, predicted_labels))
    print("\nClassification Report:")
    print(classification_report(gold_labels, predicted_labels))


In [30]:
# Example of class weighting (manually adjust feature set duplication)
balanced_feature_sets = combined_feature_sets[:]
for features, label in combined_feature_sets:
    if label == 0 or label == 4:  # Oversample classes 0 and 4
        balanced_feature_sets.append((features, label))


In [32]:
# Calculate class weights
from collections import Counter

label_counts = Counter([label for _, label in combined_feature_sets])
total_count = sum(label_counts.values())
class_weights = {label: total_count / count for label, count in label_counts.items()}

# Adjust LIWC features with class weights
def weighted_liwc_features(document, label, weights):
    pos_count = sum(1 for word in document if word in liwc_pos)
    neg_count = sum(1 for word in document if word in liwc_neg)
    return {
        'LIWC_positive': pos_count * weights[label],
        'LIWC_negative': neg_count * weights[label]
    }


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(documents):
    vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')
    X = vectorizer.fit_transform([' '.join(doc) for doc in documents])
    return X, vectorizer.get_feature_names_out()

# Example usage
tfidf_X, tfidf_feature_names = tfidf_features(train_phrases)


In [36]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# Convert feature sets to scikit-learn compatible format
vectorizer = DictVectorizer(sparse=False)

# Prepare the feature sets
X = vectorizer.fit_transform([features for features, label in combined_feature_sets])
y = [label for _, label in combined_feature_sets]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Train Logistic Regression
clf = LogisticRegression(class_weight='balanced', max_iter=500, solver='lbfgs', multi_class='multinomial')
clf.fit(X_train, y_train)

# Make predictions
predictions = clf.predict(X_test)




In [37]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("\nClassification Report:")
print(classification_report(y_test, predictions))


Confusion Matrix:
[[ 32  18  11   0   1]
 [ 78 123  95  20   9]
 [ 45 157 756 129  41]
 [  4  24 121 162  77]
 [  0   2  11  21  63]]

Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.52      0.29        62
           1       0.38      0.38      0.38       325
           2       0.76      0.67      0.71      1128
           3       0.49      0.42      0.45       388
           4       0.33      0.65      0.44        97

    accuracy                           0.57      2000
   macro avg       0.43      0.53      0.45      2000
weighted avg       0.61      0.57      0.58      2000



In [38]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(class_weight='balanced')
clf.fit(X_train, y_train)


In [31]:
if __name__ == "__main__":
    # Load data
    train_phrases, train_labels = load_data(TRAIN_FILE, limit=10000)

    # Preprocess data
    train_phrases = [preprocess_text(phrase) for phrase in train_phrases]

    # Get word features
    all_words = [word for phrase in train_phrases for word in phrase]
    word_features = list(nltk.FreqDist(all_words))[:1500]

    # Create feature sets
    bow_feature_sets = [(bag_of_words_features(phrase, word_features), label) 
                        for phrase, label in zip(train_phrases, train_labels)]
    combined_feature_sets = [(combined_features(phrase, word_features), label) 
                             for phrase, label in zip(train_phrases, train_labels)]

    # Train-test split
    train_size = int(0.8 * len(combined_feature_sets))
    random.shuffle(bow_feature_sets)
    random.shuffle(combined_feature_sets)
    train_set, test_set = combined_feature_sets[:train_size], combined_feature_sets[train_size:]

    # Train Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(train_set)
    print("Classifier accuracy on test set:", accuracy(classifier, test_set))

    # Evaluate the model
    evaluate_model(classifier, test_set)

    # Show most informative features
    print("\nMost Informative Features:")
    classifier.show_most_informative_features(10)

    # Baseline experiment
    print("\nBaseline (Bag-of-Words only):")
    train_set, test_set = bow_feature_sets[:train_size], bow_feature_sets[train_size:]
    baseline_classifier = NaiveBayesClassifier.train(train_set)
    print("Baseline accuracy on test set:", accuracy(baseline_classifier, test_set))
    evaluate_model(baseline_classifier, test_set)


Classifier accuracy on test set: 0.6165

Confusion Matrix:
[[ 21  38  24   0   0]
 [ 25 102 183  15   3]
 [ 15  87 943  65  16]
 [  0  18 189 133  25]
 [  0   1  28  35  34]]

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.25      0.29        83
           1       0.41      0.31      0.36       328
           2       0.69      0.84      0.76      1126
           3       0.54      0.36      0.43       365
           4       0.44      0.35      0.39        98

    accuracy                           0.62      2000
   macro avg       0.48      0.42      0.44      2000
weighted avg       0.59      0.62      0.59      2000


Most Informative Features:
Most Informative Features
      contains(engaging) = True                4 : 2      =    150.0 : 1.0
     contains(wonderful) = True                4 : 2      =     96.2 : 1.0
      contains(visually) = True                0 : 2      =     85.5 : 1.0
   contains(incompetent) = True  