<!-- @format -->

# Subtask 1 - PAPUGA


<!-- @format -->


In [None]:
import random


def get_reviews(file_name):

    with open(file_name, "r", encoding="utf-8") as file:
        return [line.rstrip() for line in file.readlines()]


def shuffle_data(data):
    random.shuffle(data)
    return data


def split_data(data):

    return [(s.split(" ")[0], " ".join(s.split(" ")[1:])) for s in data]



data = get_reviews("reviews.txt")
shuffled_data = shuffle_data(data)

lines = split_data(shuffled_data)

N = len(lines)
test_size = N // 4
train_size = N - test_size

train_lines = lines[:train_size]
test_lines = lines[test_size:]

print(test_lines[:2])
print(train_lines[:2])

[('BAD', 'Hotel nie powinien mieć ich 5.'), ('GOOD', 'Polecam i mam nadzieję, że taki poziom już pozostanie.')]
[('BAD', 'Mało cierpliwy i nie ma moim zdaniem dobrego podejścia do pacjentek.'), ('BAD', 'Bede musiał skorzystać z porady innego lekarza, ta wizyta nic nie wniosła, zwraca czasu i pieniędzy.')]


In [50]:
from sentence_probability import determitive_prob


def return_predictions(data):
    res = []
    for mark, rev in data:
        prob = determitive_prob(rev)
        y = 1 if mark == "GOOD" else 0
        res.append((prob, y))
    return res


predictions_train = return_predictions(train_lines)


predictions_test = return_predictions(test_lines)

[(0.12572303, 0), (0.5, 0), (0.5, 1), (0.75667584, 1), (0.28219464, 0)]

In [5]:
correct = 0

for y_pred, y in predictions:
    y_pred_res = 0
    if y_pred >= 0.5:
        y_pred_res = 1
    else:
        y_pred_res = 0
    if y_pred_res == y:
        correct += 1

correct / len(predictions)

0.61

In [62]:
from sklearn.metrics import accuracy_score

# predictions_test[0]
y_true = [y for x, y in predictions_test]
y_pred = [1 if x >= 0.5 else 0 for x, y in predictions_test]

y_true_test = [y for x, y in predictions_train]
y_pred_test = [1 if x >= 0.5 else 0 for x, y in predictions_train]

accuracy_p = accuracy_score(y_true, y_pred)
accuracy_p_test = accuracy_score(y_true_test, y_pred_test)

print(f"Accuracy Train: {accuracy_p:.2}")
print(f"Accuracy Test {accuracy_p_test:.2}")

Accuracy Train: 0.62
Accuracy Test 0.62


<!-- @format -->

# Subtask 2 - BERT


In [15]:
import torch
from transformers import AutoTokenizer, AutoModel

model_name = "allegro/herbert-base-cased"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer_B = AutoTokenizer.from_pretrained(model_name)
model_B = AutoModel.from_pretrained(model_name).to(device)

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def representation(L):
    txt = " ".join(L)
    input_ids = tokenizer_B(txt, return_tensors="pt")["input_ids"].to(device)
    output = model_B(input_ids=input_ids)
    return output.last_hidden_state.detach().cpu().numpy()[0, 0, :]



def get_reviews(file_name):


    with open(file_name, "r", encoding="utf-8") as file:
        return [line.rstrip() for line in file.readlines()]


lines = get_reviews("reviews.txt")



def BERT_predict(t_lines):
    X_t, y_t = [], []
    for mark, rev in t_lines:
        y = 0 if mark == "BAD" else 1
        x = representation(rev)
        X_t.append(x)
        y_t.append(y)
    return X_t, y_t


X_train_BERT, y_train_BERT = BERT_predict(train_lines)
X_test_BERT, y_test_BERT = BERT_predict(test_lines)

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000, solver="lbfgs").fit(X_train_BERT, y_train_BERT)

print("Train accuracy:", clf.score(X_train_BERT, y_train_BERT))
print("Test accuracy:", clf.score(X_test_BERT, y_test_BERT))

Train accuracy: 0.9266666666666666
Test accuracy: 0.8


<!-- @format -->

# Subtask 3 - Logistic Regression


In [20]:
PAPUGA_PROBS_train = return_predictions(train_lines)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np


X_train = np.array([emb.flatten() for emb in X_train_BERT])
y_train = np.array([1 if prob >= 0.5 else 0 for prob, y in PAPUGA_PROBS_train])


X_test, y_test = [], []
for mark, rev in test_lines:
    y = 1 if mark == "GOOD" else 0

    X_test.append(representation(rev))
    y_test.append(y)

X_test = np.array(X_test)
y_test = np.array(y_test)

# print(X_test[0], y_test[0])
# print(X_train[0], y_train[0])

log_reg = LogisticRegression(max_iter=1000, solver="lbfgs")
log_reg.fit(X_train, y_train)


y_pred = log_reg.predict(X_test)

In [49]:
accuracy = accuracy_score(y_test, y_pred)
precission = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2}")
print(f"Precssion: {precission:.2}")
print(f"Recall: {recall:.2}")

Accuracy: 0.55
Precssion: 0.53
Recall: 0.99
