In [2]:
import pandas as pd
import sklearn.model_selection as skm

In [9]:
data = pd.read_csv('yelp_labelled.txt', header=None, sep='\t')
X_text = data[0]
labels = data[1]
X_text_t0, X_text_test, y_train_t0, y_test = skm.train_test_split(X_text, labels, test_size=1/5, stratify=labels, random_state=1234)
X_text_train, X_text_val, y_train, y_val = skm.train_test_split(X_text_t0, y_train_t0, test_size=1/8, stratify=y_train_t0, random_state=1234)

Fasttext

In [4]:
import fasttext
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')




In [5]:
# func to vecorize text using fasttext
def vectorize_texts(texts):
    return [ft.get_sentence_vector(text) for text in texts]

X_train = vectorize_texts(X_text_train)
X_val = vectorize_texts(X_text_val)
X_test = vectorize_texts(X_text_test)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

configuraions = [
    {'C': 0.1, 'penalty': 'l2'},
    {'C': 1, 'penalty': 'l2'},
    {'C': 10, 'penalty': 'l2'},
    {'C': 0.1, 'penalty': 'l1'},
    {'C': 1, 'penalty': 'l1'},
    {'C': 10, 'penalty': 'l1'}
]

best_config = None

for config in configuraions:
    lr = LogisticRegression(C=config['C'], penalty=config['penalty'], max_iter=10000, solver='liblinear')
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f'Accuracy for config {config}: {accuracy}')
    
    if best_config is None or accuracy > best_config['accuracy']:
        best_config = {'config': config, 'accuracy': accuracy}


print(f'\nBest config: {best_config}\n')
# testing on test set

lr = LogisticRegression(C=best_config['config']['C'], penalty=best_config['config']['penalty'], max_iter=10000, solver='liblinear')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on test set: {accuracy}')

Accuracy for config {'C': 0.1, 'penalty': 'l2'}: 0.64
Accuracy for config {'C': 1, 'penalty': 'l2'}: 0.72
Accuracy for config {'C': 10, 'penalty': 'l2'}: 0.8
Accuracy for config {'C': 0.1, 'penalty': 'l1'}: 0.5
Accuracy for config {'C': 1, 'penalty': 'l1'}: 0.67
Accuracy for config {'C': 10, 'penalty': 'l1'}: 0.8

Best config: {'config': {'C': 10, 'penalty': 'l2'}, 'accuracy': 0.8}

Accuracy on test set: 0.805


Using TfidfVectorizer

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_text_train)
X_val = vectorizer.transform(X_text_val)
X_test = vectorizer.transform(X_text_test)

for config in configuraions:
    lr = LogisticRegression(C=config['C'], penalty=config['penalty'], max_iter=10000, solver='liblinear')
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f'Accuracy for config {config}: {accuracy}')
    
    if best_config is None or accuracy > best_config['accuracy']:
        best_config = {'config': config, 'accuracy': accuracy}
        
        
print(f'\nBest config: {best_config}\n')
# testing on test set

lr = LogisticRegression(C=best_config['config']['C'], penalty=best_config['config']['penalty'], max_iter=1000, solver='liblinear')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on test set: {accuracy}')


Accuracy for config {'C': 0.1, 'penalty': 'l2'}: 0.72
Accuracy for config {'C': 1, 'penalty': 'l2'}: 0.77
Accuracy for config {'C': 10, 'penalty': 'l2'}: 0.77
Accuracy for config {'C': 0.1, 'penalty': 'l1'}: 0.5
Accuracy for config {'C': 1, 'penalty': 'l1'}: 0.57
Accuracy for config {'C': 10, 'penalty': 'l1'}: 0.74

Best config: {'config': {'C': 10, 'penalty': 'l2'}, 'accuracy': 0.8}

Accuracy on test set: 0.805




In [11]:
data = pd.read_csv('yelp_labelled.txt', header=None, sep='\t')
X_text = data[0]
y = data[1]
X_text_t0, X_text_test, y_train_t0, y_test = skm.train_test_split(X_text, y, test_size=0.2, stratify=y, random_state=1234)
X_text_train, X_text_val, y_train, y_val = skm.train_test_split(X_text_t0, y_train_t0, test_size=1/8, stratify=y_train_t0, random_state=1234)

In [25]:
def tunning_lr(x_train, y_train, x_val, y_val, x_test, y_test, configuraions):
    best_config = None
    for config in configuraions:
        lr = LogisticRegression(C=config['C'], penalty=config['penalty'], max_iter=10000, solver='liblinear')
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_val)
        accuracy = accuracy_score(y_val, y_pred)
        print(f'Accuracy for config {config}: {accuracy}')
        if best_config is None or accuracy > best_config['accuracy']:
            best_config = {'config': config, 'accuracy': accuracy}
    print(f'\nBest config: {best_config}\n')
    lr = LogisticRegression(C=best_config['config']['C'], penalty=best_config['config']['penalty'], max_iter=10000, solver='liblinear')
    lr.fit(x_train, y_train)
    y_pred = lr.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy on test set: {accuracy}\n')

In [26]:
from sentence_transformers import SentenceTransformer

model1 = SentenceTransformer('all-MiniLM-L6-v2')
model2 = SentenceTransformer('all-mpnet-base-v2')

X_train_s = X_text_train.tolist()
X_val_s = X_text_val.tolist()

X_train1 = model1.encode(X_train_s)
X_val1 = model1.encode(X_val_s)

X_train2 = model2.encode(X_train_s)
X_val2 = model2.encode(X_val_s)

tunning_lr(X_train1, y_train, X_val1, y_val, X_val1, y_val, configuraions)
tunning_lr(X_train2, y_train, X_val2, y_val, X_val2, y_val, configuraions)


Accuracy for config {'C': 0.1, 'penalty': 'l2'}: 0.89
Accuracy for config {'C': 1, 'penalty': 'l2'}: 0.91
Accuracy for config {'C': 10, 'penalty': 'l2'}: 0.88
Accuracy for config {'C': 0.1, 'penalty': 'l1'}: 0.5
Accuracy for config {'C': 1, 'penalty': 'l1'}: 0.92
Accuracy for config {'C': 10, 'penalty': 'l1'}: 0.85

Best config: {'config': {'C': 1, 'penalty': 'l1'}, 'accuracy': 0.92}

Accuracy on test set: 0.92

Accuracy for config {'C': 0.1, 'penalty': 'l2'}: 0.93
Accuracy for config {'C': 1, 'penalty': 'l2'}: 0.92
Accuracy for config {'C': 10, 'penalty': 'l2'}: 0.92
Accuracy for config {'C': 0.1, 'penalty': 'l1'}: 0.5
Accuracy for config {'C': 1, 'penalty': 'l1'}: 0.92
Accuracy for config {'C': 10, 'penalty': 'l1'}: 0.93

Best config: {'config': {'C': 0.1, 'penalty': 'l2'}, 'accuracy': 0.93}

Accuracy on test set: 0.93



In [31]:
import stanza

stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')

docs = [nlp(sentence) for sentence in X_val_s]

# Using the validation set, decide which class to assign to the texts classified as neutral
classes = [0, 1] # 0: negative, 1: positive

# stanza gives 0 for negative, 1 for neutral and 2 for positive, so we have to change the neutral class and change postive class to 1
for class_ in classes:
    y_pred = [doc.sentences[0].sentiment for doc in docs] # 0: negative, 1: neutral, 2: positive
    #changing neutral classes to class_
    y_pred = [class_ if sentiment == 1 else sentiment for sentiment in y_pred]
    #changing positive classes to 1
    y_pred = [1 if sentiment == 2 else sentiment for sentiment in y_pred]
    accuracy = accuracy_score(y_val, y_pred)
    print(f'\nAccuracy with class {class_} as neutral: {accuracy}\n')
    



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-03-03 21:52:06 INFO: Downloading default packages for language: en (English) ...
2024-03-03 21:52:07 INFO: File exists: C:\Users\Dastan\stanza_resources\en\default.zip
2024-03-03 21:52:25 INFO: Finished downloading models and saved to C:\Users\Dastan\stanza_resources.
2024-03-03 21:52:25 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-03-03 21:52:35 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |
| sentiment | sstplus  |

2024-03-03 21:52:35 INFO: Using device: cpu
2024-03-03 21:52:35 INFO: Loading: tokenize
2024-03-03 21:52:35 INFO: Loading: mwt
2024-03-03 21:52:35 INFO: Loading: sentiment
2024-03-03 21:52:36 INFO: Done loading processors!


Accuracy with class 0 as neutral: 0.98
Accuracy with class 1 as neutral: 0.95


Setting neutral texts to negative resulted in higher accuracy. Now we apply to the test set

In [32]:
docs = [nlp(sentence) for sentence in X_text_test.tolist()]

y_pred = [doc.sentences[0].sentiment for doc in docs]
y_pred = [0 if sentiment == 0 else 1 for sentiment in y_pred]
y_pred = [1 if sentiment == 2 else sentiment for sentiment in y_pred]

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on test set: {accuracy}\n')

Accuracy on test set: 0.96

