In [16]:
import csv
import stanza
import pandas as pd
import numpy as np
import sklearn.model_selection as skm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay)
import matplotlib.pyplot as plt
from sklearn.svm import SVC

In [17]:
data = pd.read_csv('yelp_labelled.txt', header = None, sep = '\t')
print(len(data))

1000


In [18]:
x = data[0]
y = data[1]
x_train_t, x_temp, y_train, y_temp = skm.train_test_split(x, y, test_size=0.3, stratify=y, random_state=0)
x_val_t, x_test_t, y_val, y_test = skm.train_test_split(x_temp, y_temp, test_size=2/3, stratify=y_temp, random_state=0)
print(len(x_train_t), len(x_val_t), len(x_test_t))

700 100 200


In [19]:
clf = SVC(kernel='sigmoid', random_state=0)
vectorizer = CountVectorizer(min_df=2, stop_words=None, binary=False, lowercase=True)
x_train = vectorizer.fit_transform(x_train_t)
x_test = vectorizer.transform(x_test_t)
x_val = vectorizer.transform(x_val_t)
clf.fit(x_train, y_train)
y_pred_svc = clf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc))

Accuracy: 0.795
              precision    recall  f1-score   support

           0       0.80      0.79      0.79       100
           1       0.79      0.80      0.80       100

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.79       200
weighted avg       0.80      0.80      0.79       200



In [20]:
min_dfs = [1, 2, 3]
stop_word_options = [None, "english"]
binary_options = [True, False]
lowercase_options = [True, False]
alphas = [0.01, 0.1, 1.0]

best_acc = 0
best_config = {}

for min_df in min_dfs:
    for stop_words in stop_word_options:
        for binary in binary_options:
            for lowercase in lowercase_options:
                vectorizer = CountVectorizer(min_df=min_df, stop_words=stop_words, binary=binary, lowercase=lowercase)
                x_train = vectorizer.fit_transform(x_train_t)
                x_val = vectorizer.transform(x_val_t)
                for alpha in alphas:
                    clf = MultinomialNB(alpha=alpha)
                    clf.fit(x_train, y_train)
                    val_pred = clf.predict(x_val)
                    acc = accuracy_score(y_val, val_pred)
                    
                    if acc > best_acc:
                        best_acc = acc
                        best_config = { 'min_df': min_df,
                                        'stop_words': stop_words,
                                        'binary': binary,
                                        'lowercase': lowercase,
                                        'alpha': alpha }

vectorizer = CountVectorizer(min_df=best_config['min_df'], stop_words=best_config['stop_words'], binary=best_config['binary'], lowercase=best_config['lowercase'])
x_train = vectorizer.fit_transform(x_train_t)
x_test = vectorizer.transform(x_test_t)

clf = MultinomialNB(alpha=best_config['alpha'])
clf.fit(x_train, y_train)
test_pred_nb = clf.predict(x_test)
test_acc = accuracy_score(y_test, test_pred_nb)

print(f'best validation set accuracy: {best_acc}')
print(f'best configuration: {best_config}')
print(f'test set accuracy on best configuration: {test_acc}')

best validation set accuracy: 0.85
best configuration: {'min_df': 2, 'stop_words': None, 'binary': False, 'lowercase': True, 'alpha': 1.0}
test set accuracy on best configuration: 0.82


In [21]:
sentiwords = {}
with open('SentiWords_1.1.txt', 'r') as file:
    for line in file:
        if line.startswith('#'):
            continue
        parts = line.strip().split('\t')
        key, value = parts
        sentiwords[key] = float(value)

In [22]:
nlp = stanza.Pipeline(lang='en', processors='tokenize, pos, lemma', tokenize_no_ssplit = True)

2024-02-08 15:52:00 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json: 370kB [00:00, 3.57MB/s]                    
2024-02-08 15:52:01 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-02-08 15:52:01 INFO: Using device: cpu
2024-02-08 15:52:01 INFO: Loading: tokenize
2024-02-08 15:52:01 INFO: Loading: mwt
2024-02-08 15:52:01 INFO: Loading: pos
2024-02-08 15:52:01 INFO: Loading: lemma
2024-02-08 15:52:01 INFO: Done loading processors!


In [23]:
pos_mapping = { 'NOUN': 'n',
                'VERB': 'v',
                'ADJ': 'a',
                'ADV': 'r' }

negations = { "not", "never", "no" }
intensifiers = { "very", "extremely", "highly" }
diminishers = { "somewhat", "slightly", "barely" }

def classify_sentiment(sentence, threshold):
    not_in_sentiwords = 0
    not_in_stanza = 0
    doc = nlp(sentence)
    pred = []
    for sent in doc.sentences:
        sentiment_score = 0
        words_count = 0
        negation_flag = False
        for word in sent.words:
            pos = pos_mapping.get(word.upos)
            if pos:
                key = f'{word.lemma}#{pos}'
                if key in sentiwords:
                    score_modifier = 1
                    if word.text in negations:
                        negation_flag = not negation_flag
                    elif word.text in intensifiers:
                        score_modifier = 1.5
                    elif word.text in diminishers:
                        score_modifier = 0.5
                    adjusted_score = sentiwords[key] * score_modifier
                    if negation_flag:
                        adjusted_score *= -1
                    sentiment_score += adjusted_score
                    words_count += 1
                    negation_flag = False
                else: not_in_sentiwords += 1
            else: not_in_stanza += 1
        if words_count > 0:
            average_sentiment = sentiment_score / words_count
            pred.append(1 if average_sentiment >= threshold else 0)
        else:
            pred.append(0)
    print(not_in_sentiwords, not_in_stanza)
    return pred


In [24]:
best_threshold = 0.0
best_acc = 0.0

for threshold in np.arange(-1, 1, 0.1):
    val_pred_mine = classify_sentiment(x_val_t.tolist(), threshold)
    acc_mine = accuracy_score(y_val.tolist(), val_pred_mine)
    if acc_mine > best_acc:
        best_acc = acc_mine
        best_threshold = threshold
print(best_acc)

28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
28 686
0.77


In [25]:
test_pred_class = classify_sentiment(x_test_t.tolist(), best_threshold)
acc_class = accuracy_score(y_test.tolist(), test_pred_class)
print(classification_report(y_test, test_pred))
print(acc)

48 1365
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       100
           1       0.83      0.80      0.82       100

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.82      0.82      0.82       200

0.78


In [26]:
print(best_threshold)

0.09999999999999964


In [27]:
for i, x in enumerate(y_test):
    print(x, test_pred_nb[i], y_pred_svc[i], test_pred_class[i], x_test_t.iloc[i])

0 1 1 0 The Heart Attack Grill in downtown Vegas is an absolutely flat-lined excuse for a restaurant.
1 1 1 1 As always the evening was wonderful and the food delicious!
1 1 1 1 Wow very spicy but delicious.
1 0 1 0 Definitely worth venturing off the strip for the pork belly, will return next time I'm in Vegas.
1 1 1 0 Their menu is diverse, and reasonably priced.
1 0 1 1 All in all, I can assure you I'll be back.
1 0 1 1 The goat taco didn't skimp on the meat and wow what FLAVOR!
1 0 0 1 That said, our mouths and bellies were still quite pleased.
0 0 0 0 It lacked flavor, seemed undercooked, and dry.
0 0 0 0 If you want to wait for mediocre food and downright terrible service, then this is the place for you.
1 1 1 1 The ambience is wonderful and there is music playing.
0 0 0 0 Will not be back.
1 0 0 1 I didn't know pulled pork could be soooo delicious.
1 0 0 0 I promise they won't disappoint.
0 0 0 1 I've had better, not only from dedicated boba tea spots, but even from Jenni Pho.
0 