In [84]:
import numpy as np
import pandas as pd
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split
from datasets import Dataset
from tensorflow.keras.optimizers import Adam
from evaluate import load

In [53]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.losses import categorical_crossentropy

# Loading the data

In [54]:
val_y_path = "C:/Users/Mia/Desktop/FINKI/NLP/nlp/lab3/shakespeare/valid.original.nltktok"
val_x_path = "C:/Users/Mia/Desktop/FINKI/NLP/nlp/lab3/shakespeare/valid.modern.nltktok"
train_y_path = "C:/Users/Mia/Desktop/FINKI/NLP/nlp/lab3/shakespeare/train.original.nltktok"
train_x_path = "C:/Users/Mia/Desktop/FINKI/NLP/nlp/lab3/shakespeare/train.modern.nltktok"
test_y_path = "C:/Users/Mia/Desktop/FINKI/NLP/nlp/lab3/shakespeare/test.original.nltktok"
test_x_path = "C:/Users/Mia/Desktop/FINKI/NLP/nlp/lab3/shakespeare/test.modern.nltktok"

In [55]:
val_y = pd.read_table(val_y_path, header=None, names=["Sentences"])
val_x = pd.read_table(val_x_path, header=None, names=["Sentences"])
test_y = pd.read_table(test_y_path, header=None, names=["Sentences"])
test_x = pd.read_table(test_x_path, header=None, names=["Sentences"])
train_y = pd.read_table(train_y_path, header=None, names=["Sentences"])
train_x = pd.read_table(train_x_path, header=None, names=["Sentences"])

In [56]:
df_x = pd.concat([pd.concat([train_x, test_x]), val_x])

In [57]:
df_y = pd.concat([pd.concat([train_y, test_y]), val_y])

# Define Functions

In [58]:
def create_transformers_train_data(sentences, translations, tokenizer):
    inputs_en = tokenizer(sentences, max_length=10, truncation=True)

    with tokenizer.as_target_tokenizer():
        outputs_es = tokenizer(translations, max_length=10, truncation=True)

    data = Dataset.from_dict({'input_ids': inputs_en['input_ids'],
                              'attention_mask': inputs_en['attention_mask'],
                              'labels': outputs_es['input_ids']})
    return data

In [59]:
def decode_with_transformer(sentence, tokenizer, model):
    tokens = tokenizer([sentence], return_tensors='np')
    out = model.generate(**tokens, max_length=10)

    with tokenizer.as_target_tokenizer():
        pred_sentence = tokenizer.decode(out[0], skip_special_tokens=True)

    return pred_sentence

# Create Vocabulary

In [60]:
prefix = 'translate from English to Shakespearean: '
model_name = 't5-small'

In [61]:
sentences_x = [prefix + sentence for sentence in df_x['Sentences'].values.tolist()]
sentences_y = df_y['Sentences'].values.tolist()

In [62]:
train_x, test_x, train_y, test_y = train_test_split(sentences_x, sentences_y,
                                                            test_size=0.1, random_state=0)

# Create Model

In [63]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [64]:
train_set = create_transformers_train_data(train_x, train_y, tokenizer)



In [65]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [66]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                           model=model_name,
                                           return_tensors='tf')

In [67]:
train_set = model.prepare_tf_dataset(train_set, collate_fn=data_collator)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [70]:
model.compile(Adam(learning_rate=0.01))

In [71]:
model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1601a977520>

In [79]:
model.save("C:/Users/Mia/Desktop/FINKI/NLP/nlp/lab3/models")



INFO:tensorflow:Assets written to: C:/Users/Mia/Desktop/FINKI/NLP/nlp/lab3/models\assets


INFO:tensorflow:Assets written to: C:/Users/Mia/Desktop/FINKI/NLP/nlp/lab3/models\assets


In [81]:
output_y = []
for sentence in test_x:
    pred = decode_with_transformer(sentence, tokenizer, model)
    output_y.append(pred)

UnimplementedError: Exception encountered when calling layer "SelfAttention" "                 f"(type TFT5Attention).

{{function_node __wrapped__XlaDynamicSlice_device_/job:localhost/replica:0/task:0/device:CPU:0}} Could not find compiler for platform Host: NOT_FOUND: could not find registered compiler for platform Host -- was support for that platform linked in? [Op:XlaDynamicSlice]

Call arguments received by layer "SelfAttention" "                 f"(type TFT5Attention):
  • hidden_states=tf.Tensor(shape=(1, 1, 512), dtype=float32)
  • mask=tf.Tensor(shape=(1, 1, 1, 2), dtype=float32)
  • key_value_states=None
  • position_bias=None
  • past_key_value=('tf.Tensor(shape=(1, 8, 1, 64), dtype=float32)', 'tf.Tensor(shape=(1, 8, 1, 64), dtype=float32)')
  • layer_head_mask=None
  • query_length=None
  • use_cache=True
  • training=False
  • output_attentions=False

In [82]:
input_x = test_x
output_y_gt = test_y
output_y_pred = output_y

In [83]:
for in_en, gt_es, pred_es in zip(input_x, output_y_gt, output_y_pred):
        print(f'Input sentence: {in_en}')
        print(f'GT translation: {gt_es}')
        print(f'Pred translation: {pred_es}')

In [85]:
metric = load('bleu')
results = metric.compute(predictions=output_y_pred, references=output_y_gt)
score = results['bleu']
print(f'BLEU score: {score}')

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

IndexError: list index out of range

In [86]:
from nltk.translate import meteor
from nltk import word_tokenize

In [87]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [88]:
m_score = 0.0
for hyp, ref in zip(output_y_gt[:20], output_y_pred):
        m_score += round(meteor([word_tokenize(hyp)], word_tokenize(ref)), 4)

In [89]:
print(m_score)

0.0
