In [12]:
import random
import torch
import itertools
import numpy as np

from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.nn import functional as F
from transformers import logging

logging.set_verbosity_error()

model_name_encoder = "allegro/herbert-base-cased"
device = 'cuda'

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_name_encoder)
model_encoder = AutoModel.from_pretrained(model_name_encoder).to(device)

text = 'Bardzo lubię lody malinowe z bitą śmietaną.'

token_ids = tokenizer(text, return_tensors='pt')['input_ids'][0]

print ([tokenizer.decode(idx) for idx in token_ids])

['<s>', 'Bardzo', 'lubię', 'lody', 'mali', 'nowe', 'z', 'bi', 'tą', 'śmietan', 'ą', '.', '</s>']


In [14]:
def representation(L):
    txt = ' '.join(L)
    input_ids = tokenizer(txt, return_tensors='pt')['input_ids'].to(device)
    output = model_encoder(input_ids=input_ids)
    return output.last_hidden_state.detach().cpu().numpy()[0,0,:]

def spoil(L):
    res = []
    for w in L:
        if random.random() < 0.85:
            res.append(w)
        else:
            res.append(w.upper())
    return res

In [15]:
representation(['Ala', 'ma', 'kota'])

array([ 7.75123611e-02, -1.26742795e-01,  1.28273174e-01,  6.88974857e-02,
        1.63994581e-02,  5.44659421e-02, -3.27508241e-01, -6.34695232e-01,
        9.20319483e-02,  8.05968642e-01,  1.59045756e-01,  2.29028508e-01,
        2.93846512e+00,  3.55168164e-01,  1.10074371e-01, -1.78765461e-01,
        1.20000631e-01,  7.92294275e-03, -1.32740542e-01,  2.68556684e-01,
        2.42710307e-01,  4.63390350e-02,  4.08300338e-03, -2.29847357e-01,
        1.29359066e-01,  2.12778866e-01,  2.44441584e-01,  8.24062228e-02,
        3.90837461e-01,  3.85150939e-01, -3.88434716e-02, -3.29763219e-02,
        6.87328950e-02, -4.60863337e-02,  1.64549291e-01, -5.30537553e-02,
       -2.92028993e-01, -7.47865140e-02, -3.22981179e-01,  2.10616335e-01,
       -4.49958332e-02, -2.33092010e-01, -1.98450804e-01,  4.04942811e-01,
        2.84321094e-03,  1.94603622e-01, -1.16448723e-01,  9.03226808e-02,
       -9.03956145e-02, -6.43575750e-03,  5.03770299e-02, -2.66933143e-01,
        1.24548480e-01, -

In [16]:
model_decoder_name = 'eryk-mazus/polka-1.1b'

tokenizer = AutoTokenizer.from_pretrained(model_decoder_name)
model_decoder = AutoModelForCausalLM.from_pretrained(model_decoder_name).to(device)

In [None]:
def log_probs_from_logits(logits, labels):
    logp = F.log_softmax(logits, dim=-1)
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label
            
def sentence_prob(sentence_txt):
    input_ids = tokenizer(sentence_txt, return_tensors='pt')['input_ids'].to(device)
    with torch.no_grad():
        output = model_decoder(input_ids=input_ids)
        log_probs = log_probs_from_logits(output.logits[:, :-1, :], input_ids[:, 1:])
        seq_log_probs = torch.sum(log_probs) / input_ids.size(1)
        # print(input_ids.size(0))
    return seq_log_probs.cpu().numpy()  

In [26]:
sentence_prob('Bardzo lubię lody malinowe z bitą śmietaną.')

1


array(-1.7937369, dtype=float32)

In [27]:
lines = open('../p1/reviews_for_task3.txt').readlines()
random.shuffle(lines)

N = len(lines)
test_size = N // 4
train_size = N - test_size

train_lines = lines[:train_size]
test_lines = lines[train_size:]

X_train = []
y_train = []
X_test = []
y_test = []

for line in train_lines:
    L = line.split()
    y = 0 if L[0] == 'BAD' else 1
    
    x = representation(L[1:])

    # another_one = np.array([0 if sentence_prob(' '.join(L[1:]) + ". Polecam.") < sentence_prob(' '.join(L[1:]) + ". Nie polecam.") else 1])
    another_two = np.array([sentence_prob(' '.join(L[1:]) + ". Polecam."), sentence_prob(' '.join(L[1:]) + ". Nie polecam.")])

    x = np.concatenate((x, another_two))
    y_train.append(y)
    X_train.append(x)
    
    for i in range(3):
        x = representation(spoil(L[1:]))
        x = np.concatenate((x, another_two))
        y_train.append(y)
        X_train.append(x)
    
    if len(X_train) % 100 == 0:
        print (len(X_train))

for line in test_lines:
    L = line.split()
    y = 0 if L[0] == 'BAD' else 1
    
    x = representation(L[1:])
    # another_one = np.array([0 if sentence_prob(' '.join(L[1:]) + ". Polecam") < sentence_prob(' '.join(L[1:]) + ". Nie polecam") else 1])
    another_two = np.array([sentence_prob(' '.join(L[1:]) + ". Polecam."), sentence_prob(' '.join(L[1:]) + ". Nie polecam.")])
    x = np.concatenate((x, another_two))
    y_test.append(y)
    X_test.append(x)
        
    if len(X_test) % 100 == 0:
        print (len(X_test))

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
100
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
200
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
300
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
400
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
500
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
600
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
700
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
800
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
900
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [28]:
N = len(lines)
test_size = N // 4
train_size = N - test_size

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm 

clf = LogisticRegression(random_state=0).fit(X_train, y_train)

print ('Train accuracy:', clf.score(X_train, y_train))
print ('Test accuracy:', clf.score(X_test, y_test))

#Train accuracy: 0.9348939283101683
#Test accuracy: 0.8715697036223929

Train accuracy: 0.9641666666666666
Test accuracy: 0.66


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
from sklearn.ensemble import RandomForestClassifier

# Tworzenie i trenowanie modelu RandomForestClassifier
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Ocena modelu
print('Train accuracy:', clf.score(X_train, y_train))
print('Test accuracy:', clf.score(X_test, y_test))

Train accuracy: 1.0
Test accuracy: 0.57


In [21]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Tworzenie i trenowanie modelu SVC
clf = SVC(kernel='linear', random_state=0).fit(X_train, y_train)

# Przewidywanie na zbiorze testowym
y_pred = clf.predict(X_test)

# Ocena modelu
print('Train accuracy:', clf.score(X_train, y_train))
print('Test accuracy:', accuracy_score(y_test, y_pred))

Train accuracy: 0.9975
Test accuracy: 0.89


In [22]:
from sklearn.ensemble import GradientBoostingClassifier

# Tworzenie i trenowanie modelu Gradient Boosting
clf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

# Przewidywanie na zbiorze testowym
y_pred = clf.predict(X_test)

# Ocena modelu
print('Train accuracy:', clf.score(X_train, y_train))
print('Test accuracy:', accuracy_score(y_test, y_pred))

Train accuracy: 0.9983333333333333
Test accuracy: 0.61


In [25]:
from sklearn.naive_bayes import GaussianNB

# Tworzenie i trenowanie modelu Naive Bayes
clf = GaussianNB().fit(X_train, y_train)

# Przewidywanie na zbiorze testowym
y_pred = clf.predict(X_test)

# Ocena modelu
print('Train accuracy:', clf.score(X_train, y_train))
print('Test accuracy:', accuracy_score(y_test, y_pred))

Train accuracy: 0.59
Test accuracy: 0.57


In [26]:
from sklearn.neighbors import KNeighborsClassifier

# Tworzenie i trenowanie modelu KNN
clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Przewidywanie na zbiorze testowym
y_pred = clf.predict(X_test)

# Ocena modelu
print('Train accuracy:', clf.score(X_train, y_train))
print('Test accuracy:', accuracy_score(y_test, y_pred))

Train accuracy: 0.9866666666666667
Test accuracy: 0.78
