In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

In [72]:
import stanza
import fasttext
import pandas as pd
import sklearn.model_selection as skm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
import numpy as np
from sentence_transformers import SentenceTransformer

In [11]:
data = pd.read_csv('yelp_labelled.txt', header = None, sep = '\t')

In [20]:
x = data[0]
y = data[1]
x_train_t, x_temp, y_train, y_temp = skm.train_test_split(x, y, test_size=0.3, stratify=y, random_state=0)
x_val_t, x_test_t, y_val, y_test = skm.train_test_split(x_temp, y_temp, test_size=2/3, stratify=y_temp, random_state=0)
print(len(x_train_t), len(x_val_t), len(x_test_t))

700 100 200


In [25]:
ft = fasttext.load_model('cc.en.300.bin')



In [58]:
train_ft = [ft.get_sentence_vector(text) for text in x_train_t]
val_ft = [ft.get_sentence_vector(text) for text in x_val_t]
test_ft = [ft.get_sentence_vector(text) for text in x_test_t]
print(len(train_ft))

700


In [62]:
tfidf_vectorizer = TfidfVectorizer()
train_tfidf = tfidf_vectorizer.fit_transform(x_train_t).toarray()
val_tfidf = tfidf_vectorizer.transform(x_val_t).toarray()
test_tfidf = tfidf_vectorizer.transform(x_test_t).toarray()

In [69]:
param_grid = [
    {
        'penalty': ['l1', 'l2'],
        'C': np.logspace(-4, 4, 5),
        'solver': ['liblinear', 'saga'],
        'max_iter': [10000],
        'fit_intercept': [True, False],
        'class_weight': [None, 'balanced'],
        'l1_ratio': np.linspace(0, 1, 5)  # Only applicable with 'elasticnet' penalty
    },
    {
        'penalty': ['l2', 'none'],
        'C': np.logspace(-4, 4, 5),
        'solver': ['newton-cg', 'lbfgs', 'sag'],
        'max_iter': [100, 1000, 5000],
        'fit_intercept': [True, False],
        'class_weight': [None, 'balanced']
    }
]

In [70]:
best_score = 0
best_params = None

for group in param_grid:
    for penalty in group['penalty']:
        for C in group['C']:
            for solver in group['solver']:
                for max_iter in group['max_iter']:
                    for fit_intercept in group['fit_intercept']:
                        for class_weight in group['class_weight']:
                            if penalty == 'l1' and solver not in ['liblinear', 'saga']:
                                continue
                            if penalty == 'elasticnet' and solver != 'saga':
                                continue
                            if penalty == 'none' and solver in ['liblinear']:
                                continue
                            try:
                                logreg_ft = LogisticRegression(
                                    penalty=penalty,
                                    C=C,
                                    solver=solver,
                                    max_iter=max_iter,
                                    fit_intercept=fit_intercept,
                                    class_weight=class_weight
                                )
                                if 'l1_ratio' in group:
                                    for l1_ratio in group['l1_ratio']:
                                        logreg_ft.set_params(l1_ratio=l1_ratio)
                                        logreg_ft.fit(train_ft, y_train)
                                        score = accuracy_score(y_val, logreg_ft.predict(val_ft))
                                        if score > best_score:
                                            best_score = score
                                            best_params = logreg_ft.get_params()
                                else:
                                    logreg_ft.fit(train_ft, y_train)
                                    score = accuracy_score(y_val, logreg_ft.predict(val_ft))
                                    if score > best_score:
                                        best_score = score
                                        best_params = logreg_ft.get_params()
                            except ValueError:
                                continue

print("Best Score:", best_score)
print("Best Parameters:", best_params)



Best Score: 0.8
Best Parameters: {'C': 10000.0, 'class_weight': None, 'dual': False, 'fit_intercept': False, 'intercept_scaling': 1, 'l1_ratio': 0.0, 'max_iter': 10000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [65]:
logreg_final = LogisticRegression(C=10000.0, class_weight=None, dual=False, fit_intercept=False, intercept_scaling=1, l1_ratio=0.0, max_iter=100, multi_class='auto', n_jobs=None, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
logreg_final.fit(train_ft, y_train)

test_predictions = logreg_final.predict(test_ft)

final_test_accuracy = accuracy_score(y_test, test_predictions)

print("Final Test Accuracy:", final_test_accuracy)

Final Test Accuracy: 0.765




In [68]:
param_grid = {
    'C': np.logspace(-4, 4, 5),
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['saga'],
    'max_iter': [10000],
    'class_weight': [None, 'balanced'],
}

best_score = 0
best_params = {}

for C in param_grid['C']:
    for penalty in param_grid['penalty']:
        for solver in param_grid['solver']:
            for max_iter in param_grid['max_iter']:
                for class_weight in param_grid['class_weight']:
                    try:
                        model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=max_iter, class_weight=class_weight)
                        model.fit(train_tfidf, y_train)
                        val_predictions = model.predict(val_tfidf)
                        score = accuracy_score(y_val, val_predictions)

                        if score > best_score:
                            best_score = score
                            best_params = {'C': C, 'penalty': penalty, 'solver': solver, 'max_iter': max_iter, 'class_weight': class_weight}
                    except ValueError as e:
                        continue

print("Best Score:", best_score)
print("Best Parameters:", best_params)

Best Score: 0.82
Best Parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 10000, 'class_weight': None}


In [71]:
logreg_final = LogisticRegression(C=1.0, penalty='l2', solver='saga', max_iter=10000, class_weight=None)
logreg_final.fit(train_tfidf, y_train)

test_predictions = logreg_final.predict(test_tfidf)

final_test_accuracy = accuracy_score(y_test, test_predictions)

print("Final Test Accuracy:", final_test_accuracy)

Final Test Accuracy: 0.815


In [73]:
minilm = SentenceTransformer('all-MiniLM-L6-v2')
mpnet = SentenceTransformer('all-mpnet-base-v2')

modules.json: 100%|█████████████████████████████| 349/349 [00:00<00:00, 527kB/s]
config_sentence_transformers.json: 100%|████████| 116/116 [00:00<00:00, 425kB/s]
README.md: 100%|███████████████████████████| 10.6k/10.6k [00:00<00:00, 8.06MB/s]
sentence_bert_config.json: 100%|██████████████| 53.0/53.0 [00:00<00:00, 120kB/s]
config.json: 100%|█████████████████████████████| 571/571 [00:00<00:00, 1.10MB/s]
pytorch_model.bin: 100%|█████████████████████| 438M/438M [02:02<00:00, 3.56MB/s]
tokenizer_config.json: 100%|████████████████████| 363/363 [00:00<00:00, 654kB/s]
vocab.txt: 100%|██████████████████████████████| 232k/232k [00:00<00:00, 683kB/s]
tokenizer.json: 100%|████████████████████████| 466k/466k [00:00<00:00, 1.03MB/s]
special_tokens_map.json: 100%|██████████████████| 239/239 [00:00<00:00, 440kB/s]
1_Pooling/config.json: 100%|████████████████████| 190/190 [00:00<00:00, 468kB/s]


In [78]:
x_train_minilm = minilm.encode(x_train_t.tolist())
x_val_minilm = minilm.encode(x_val_t.tolist())
x_test_minilm = minilm.encode(x_test_t.tolist())
x_train_mpnet = mpnet.encode(x_train_t.tolist())
x_val_mpnet = mpnet.encode(x_val_t.tolist())
x_test_mpnet = mpnet.encode(x_test_t.tolist())

In [80]:
C_values = [0.1, 1, 10]
penalties = ['l2', 'l1']

best_scores = {'minilm': 0, 'mpnet': 0}
best_configs = {'minilm': None, 'mpnet': None}
best_val_scores = {'minilm': 0, 'mpnet': 0}

def evaluate_models(x_train, y_train, x_val, y_val, x_test, y_test, model_name):
    global best_scores, best_configs, best_val_scores
    
    for C in C_values:
        for penalty in penalties:
            solver = 'liblinear' if penalty == 'l1' else 'saga'
            model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=5000)
            model.fit(x_train, y_train)
            score = accuracy_score(y_val, model.predict(x_val))
            if score > best_scores[model_name]:
                best_scores[model_name] = score
                best_val_scores[model_name] = score
                best_configs[model_name] = {'C': C, 'penalty': penalty, 'solver': solver}
    
    best_model = LogisticRegression(**best_configs[model_name], max_iter=5000)
    best_model.fit(np.vstack([x_train, x_val]), np.concatenate([y_train, y_val]))
    test_score = accuracy_score(y_test, best_model.predict(x_test))
    print(f"{model_name} - Best Validation Score: {best_val_scores[model_name]}, Best Test Score: {test_score}, Best Config: {best_configs[model_name]}")
evaluate_models(x_train_minilm, y_train, x_val_minilm, y_val, x_test_minilm, y_test, 'minilm')
evaluate_models(x_train_mpnet, y_train, x_val_mpnet, y_val, x_test_mpnet, y_test, 'mpnet')

minilm - Best Validation Score: 0.92, Best Test Score: 0.9, Best Config: {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}
mpnet - Best Validation Score: 0.96, Best Test Score: 0.95, Best Config: {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}


In [81]:
stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='sentiment,tokenize')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 2.52MB/s]
2024-03-03 23:51:28 INFO: Downloaded file to /Users/bekzat.ongdassynovicloud.com/stanza_resources/resources.json
2024-03-03 23:51:28 INFO: Downloading default packages for language: en (English) ...
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/d
2024-03-03 23:53:01 INFO: Downloaded file to /Users/bekzat.ongdassynovicloud.com/stanza_resources/en/default.zip
2024-03-03 23:53:02 INFO: Finished downloading models and saved to /Users/bekzat.ongdassynovicloud.com/stanza_resources
2024-03-03 23:53:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 3.96MB/s]
2024-03-03 23:53:03 I


Accuracy with class 0 as neutral: 0.97


Accuracy with class 1 as neutral: 0.98



In [85]:
docs = [nlp(sent) for sent in x_val_t.tolist()]
classes=[0,1]
for c in classes:
    y_pred = [doc.sentences[0].sentiment for doc in docs]
    y_pred = [c if sentiment == 1 else sentiment for sentiment in y_pred]
    y_pred = [1 if sentiment == 2 else sentiment for sentiment in y_pred]
    accuracy = accuracy_score(y_val, y_pred)
    print(accuracy)

0.97
0.98


In [87]:
docs = [nlp(sent) for sent in x_test_t.tolist()]

y_pred = [1 if sentiment > 0 else 0 for sentiment in [doc.sentences[0].sentiment for doc in docs]]
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on test set: {accuracy}\n')

Accuracy on test set: 0.92

