In [2]:
import random
import torch
import itertools
import numpy as np

from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.nn import functional as F
from transformers import logging
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

logging.set_verbosity_error()

model_name_encoder = "allegro/herbert-base-cased"
device = 'cuda'

In [3]:
tokenizer_encoder = AutoTokenizer.from_pretrained(model_name_encoder)
model_encoder = AutoModel.from_pretrained(model_name_encoder).to(device)

text = 'Bardzo lubię lody malinowe z bitą śmietaną.'

token_ids = tokenizer_encoder(text, return_tensors='pt')['input_ids'][0]

print ([tokenizer_encoder.decode(idx) for idx in token_ids])

tokenizer_config.json:   0%|          | 0.00/229 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/907k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/556k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

['<s>', 'Bardzo', 'lubię', 'lody', 'mali', 'nowe', 'z', 'bi', 'tą', 'śmietan', 'ą', '.', '</s>']


In [4]:
def representation(L):
    txt = ' '.join(L)
    input_ids = tokenizer_encoder(txt, return_tensors='pt')['input_ids'].to(device)
    output = model_encoder(input_ids=input_ids)
    return output.last_hidden_state.detach().cpu().numpy()[0,0,:]

In [5]:
representation(['Ala', 'ma', 'kota'])

array([ 7.75123611e-02, -1.26742795e-01,  1.28273174e-01,  6.88974857e-02,
        1.63994581e-02,  5.44659421e-02, -3.27508241e-01, -6.34695232e-01,
        9.20319483e-02,  8.05968642e-01,  1.59045756e-01,  2.29028508e-01,
        2.93846512e+00,  3.55168164e-01,  1.10074371e-01, -1.78765461e-01,
        1.20000631e-01,  7.92294275e-03, -1.32740542e-01,  2.68556684e-01,
        2.42710307e-01,  4.63390350e-02,  4.08300338e-03, -2.29847357e-01,
        1.29359066e-01,  2.12778866e-01,  2.44441584e-01,  8.24062228e-02,
        3.90837461e-01,  3.85150939e-01, -3.88434716e-02, -3.29763219e-02,
        6.87328950e-02, -4.60863337e-02,  1.64549291e-01, -5.30537553e-02,
       -2.92028993e-01, -7.47865140e-02, -3.22981179e-01,  2.10616335e-01,
       -4.49958332e-02, -2.33092010e-01, -1.98450804e-01,  4.04942811e-01,
        2.84321094e-03,  1.94603622e-01, -1.16448723e-01,  9.03226808e-02,
       -9.03956145e-02, -6.43575750e-03,  5.03770299e-02, -2.66933143e-01,
        1.24548480e-01, -

In [None]:
def spoil_maualy(L):
    replacements = {'ł': 'l', 'ą': 'a', 'ć': 'c', 'ę': 'e', 'ń': 'n', 'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z'}
    res = []
    for w in L:
        temp = random.random()
        if temp < 0.25:
            res.append(w.upper())
        elif temp < 0.50:
            letter_iondex = random.randrange(0, len(w))
            new_letter = random.choice('abcdefghijklmnopqrstuvwxyz')
            w = w[:letter_iondex] + new_letter + w[letter_iondex+1:]
            res.append(w)
        elif temp < 0.75:
            w = ''.join(replacements.get(c, c) for c in w)
            res.append(w)
        else:
            res.append(w)
    return res

In [43]:
spoil_maualy(['Ala', 'ma', 'kota'])

['ALA', 'oa', 'kota']

In [23]:
model_decoder_name = 'eryk-mazus/polka-1.1b'
tokenizer_decoder = AutoTokenizer.from_pretrained(model_decoder_name)
model_decoder = AutoModelForCausalLM.from_pretrained(model_decoder_name).to(device)

In [None]:
def spoil_with_decode(L):
    prompt = f"""Zamień podaną opinię na nową, używając innych słów niż w oryginalnej opinii oraz zachowując przy tym wydźwięk i sens. Podkreśl wydźwięk oryginalnej opinii w zmienionej opinii. Niech opinia będzie krótkie.

Opinia: Bardzo lubię lody malinowe z bitą śmietaną.
Zamieniona opinia: Uwielbiam jeść lody malinowe z pyszną bitą śmietaną.

Opinia: {' '.join(L)}
Zamieniona opinia: """
    input_ids = tokenizer_decoder(prompt, return_tensors='pt')['input_ids'].to(device)
    output = model_decoder.generate(input_ids, max_length=300, num_return_sequences=1, do_sample=True)
    output_text = tokenizer_decoder.decode(output[0], skip_special_tokens=True)
    return output_text.replace(prompt, '')

In [124]:
spoil_with_decode('Te ciasteczka z posypką są naprawdę dobre.'.split())

'ładne desery są smaczne.'

In [6]:
def create_train_test(spoil_function, K=3):
    lines = open('../p1/reviews_for_task3.txt').readlines()
    random.shuffle(lines)

    N = len(lines)
    test_size = N // 4
    train_size = N - test_size

    train_lines = lines[:train_size]
    test_lines = lines[train_size:]

    X_train = []
    y_train = []
    X_test = []
    y_test = []

    for line in tqdm(train_lines):
        L = line.split()
        y = 0 if L[0] == 'BAD' else 1
        x = representation(L[1:])

        y_train.append(y)
        X_train.append(x)
        
        for i in range(K):
            x = representation(spoil_function(L[1:]))
            y_train.append(y)
            X_train.append(x)
        
        # if len(X_train) % 100 == 0:
        #     print (len(X_train))

    for line in tqdm(test_lines):
        L = line.split()
        y = 0 if L[0] == 'BAD' else 1
        
        x = representation(L[1:])
        y_test.append(y)
        X_test.append(x)
            
        # if len(X_test) % 100 == 0:
        #     print (len(X_test))
    return X_train, y_train, X_test, y_test

In [7]:
N = len(open('../p1/reviews_for_task3.txt').readlines())
test_size = N // 4
train_size = N - test_size

without spoiling

In [8]:
X_train, y_train, X_test, y_test = create_train_test(None, K=0)

100%|██████████| 300/300 [00:04<00:00, 74.00it/s]
100%|██████████| 100/100 [00:01<00:00, 74.80it/s]


In [9]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

print ('Train accuracy:', clf.score(X_train, y_train))
print ('Test accuracy:', clf.score(X_test, y_test))

Train accuracy: 1.0
Test accuracy: 0.75


manual

In [None]:
X_train, y_train, X_test, y_test = create_train_test(spoil_maualy, K=3)

100%|██████████| 300/300 [00:15<00:00, 18.84it/s]
100%|██████████| 100/100 [00:01<00:00, 76.43it/s]


In [None]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

print ('Train accuracy:', clf.score(X_train, y_train))
print ('Test accuracy:', clf.score(X_test, y_test))

Train accuracy: 0.9725
Test accuracy: 0.83


decoder

In [125]:

X_train, y_train, X_test, y_test = create_train_test(spoil_with_decode, K=3)

100%|██████████| 300/300 [26:17<00:00,  5.26s/it]
100%|██████████| 100/100 [00:01<00:00, 71.49it/s]


In [127]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

print ('Train accuracy:', clf.score(X_train, y_train))
print ('Test accuracy:', clf.score(X_test, y_test))

Train accuracy: 0.85
Test accuracy: 0.76


word2vec