In [1]:
import csv
import stanza
import pandas as pd
import numpy as np
import sklearn.model_selection as skm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, confusion_matrix, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('yelp_labelled.txt', header = None, sep = '\t')

In [3]:
x = data[0]
y = data[1]
x_train_t, x_temp, y_train, y_temp = skm.train_test_split(x, y, test_size=0.3, stratify=y, random_state=0)
x_val_t, x_test_t, y_val, y_test = skm.train_test_split(x_temp, y_temp, test_size=2/3, stratify=y_temp, random_state=0)

In [4]:
min_dfs = [1, 2, 3]
stop_word_options = [None, "english"]
binary_options = [True, False]
lowercase_options = [True, False]
alphas = [0.01, 0.1, 1.0]

best_acc = 0
best_config = {}

for min_df in min_dfs:
    for stop_words in stop_word_options:
        for binary in binary_options:
            for lowercase in lowercase_options:
                vectorizer = CountVectorizer(min_df=min_df, stop_words=stop_words, binary=binary, lowercase=lowercase)
                x_train = vectorizer.fit_transform(x_train_t)
                x_val = vectorizer.transform(x_val_t)
                for alpha in alphas:
                    clf = MultinomialNB(alpha=alpha)
                    clf.fit(x_train, y_train)
                    val_pred = clf.predict(x_val)
                    acc = accuracy_score(y_val, val_pred)
                    
                    if acc > best_acc:
                        best_acc = acc
                        best_config = { 'min_df': min_df,
                                        'stop_words': stop_words,
                                        'binary': binary,
                                        'lowercase': lowercase,
                                        'alpha': alpha }

vectorizer = CountVectorizer(min_df=best_config['min_df'], stop_words=best_config['stop_words'], binary=best_config['binary'], lowercase=best_config['lowercase'])
x_train = vectorizer.fit_transform(x_train_t)
x_test = vectorizer.transform(x_test_t)

clf = MultinomialNB(alpha=best_config['alpha'])
clf.fit(x_train, y_train)
test_pred = clf.predict(x_test)
test_acc = accuracy_score(y_test, test_pred)

print(f'best validation set accuracy: {best_acc}')
print(f'best configuration: {best_config}')
print(f'test set accuracy on best configuration: {test_acc}')

best validation set accuracy: 0.85
best configuration: {'min_df': 2, 'stop_words': None, 'binary': False, 'lowercase': True, 'alpha': 1.0}
test set accuracy on best configuration: 0.82


In [5]:
sentiwords = {}
with open('SentiWords_1.1.txt', 'r') as file:
    for line in file:
        if line.startswith('#'):
            continue
        parts = line.strip().split('\t')
        key, value = parts
        sentiwords[key] = float(value)

In [6]:
nlp = stanza.Pipeline(lang='en', processors='tokenize, pos, lemma', tokenize_no_ssplit = True)

2024-02-05 18:40:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json: 370kB [00:00, 10.0MB/s]
2024-02-05 18:40:59 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-02-05 18:40:59 INFO: Using device: cpu
2024-02-05 18:40:59 INFO: Loading: tokenize
2024-02-05 18:40:59 INFO: Loading: mwt
2024-02-05 18:40:59 INFO: Loading: pos
2024-02-05 18:40:59 INFO: Loading: lemma
2024-02-05 18:40:59 INFO: Done loading processors!


In [33]:
pos_mapping = { 'NOUN': 'n',
                'VERB': 'v',
                'ADJ': 'a',
                'ADV': 'r' }

def classify_sentiment(sentence, threshold):
    doc = nlp(sentence)
    sentiment_score = 0
    words_count = 0

    pred = []
    for sent in doc.sentences:
        sent_scores = []
        for word in sent.words:
            pos = pos_mapping.get(word.upos)
            if pos:
                key = f"{word.lemma}#{pos}"
                if key in sentiwords:
                    sentiment_score += sentiwords[key]
                    words_count += 1

        if words_count > 0:
            average_sentiment = sentiment_score / words_count
            if average_sentiment > 0: pred.append(1)
            else: pred.append(0)
        else:
            pred.append(0)
    return pred


In [34]:
best_threshold = 0.5
best_acc = 0.0

for threshold in np.arange(0, 1, 0.1):
    val_pred = classify_sentiment(x_val_t.tolist(), threshold)
    acc = accuracy_score(y_val.tolist(), val_pred)
    if acc > best_acc:
        best_acc = acc
        best_threshold = threshold
print(best_acc)

0.5


In [35]:
test_pred = classify_sentiment(x_test_t.tolist(), best_threshold)
acc = accuracy_score(y_test, test_pred)
print(acc)

0.505


In [36]:
print(sentence)

I wanna die from happiness
