In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/22/97/7db72a0beef1825f82188a4b923e62a146271ac2ced7928baa4d47ef2467/transformers-2.9.1-py3-none-any.whl (641kB)
[K     |████████████████████████████████| 645kB 3.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 15.7MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 21.3MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/3b/88/49e772d686088e1278766ad68a463513642a2a877487decbd691dec02955/sentencepiece-0.1.90-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |██████████

In [2]:
# Mount Google Drive at /gdrive; the review file read by load_lt_grammar_dataset lives
# under "/gdrive/My Drive/". The call prompts for an authorization code.
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [38]:
import os

import tensorflow as tf
import json
from typing import List, Optional, Union
from transformers import (
    XLMRobertaConfig,
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizer,
    TFXLMRobertaForSequenceClassification,
    InputExample,
    InputFeatures,
    PreTrainedTokenizer
)


# Script parameters
# BATCH_SIZE is the number of examples per training batch; evaluation uses the same size.
BATCH_SIZE = 256
EVAL_BATCH_SIZE = BATCH_SIZE
# USE_XLA / USE_AMP are forwarded to the tf.config.optimizer calls below.
USE_XLA = False
USE_AMP = False
EPOCHS = 3

# NOTE(review): TASK / TFDS_TASK look like leftovers from a GLUE example script; in the
# cells visible here they are only mentioned inside commented-out code — confirm before removing.
TASK = "mrpc"
TFDS_TASK = TASK

# Number of output classes; equals the length of the default label_list (1..5) of
# convert_examples_to_features.
num_labels = 5
print(num_labels)

# Apply the XLA-JIT and automatic-mixed-precision switches chosen above.
tf.config.optimizer.set_jit(USE_XLA)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
# The config and tokenizer come from "xlm-roberta-base"; the TensorFlow weights are loaded
# from "jplu/tf-xlm-roberta-base" using that config.
config = XLMRobertaConfig.from_pretrained("xlm-roberta-base", num_labels=num_labels)
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
model = TFXLMRobertaForSequenceClassification.from_pretrained("jplu/tf-xlm-roberta-base", config=config)

def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = 5,
    label_list=(1,2,3,4,5),
    output_mode="classification",
):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    Args:
        examples: Examples to encode. Each one's ``(text_a, text_b)`` pair is
            handed to the tokenizer and its ``label`` becomes the target.
        tokenizer: Tokenizer providing ``batch_encode_plus``.
        max_length: Forwarded to ``batch_encode_plus`` together with
            ``pad_to_max_length=True``.
        label_list: Allowed raw labels. In classification mode a label's
            position in this sequence is the class index that is emitted.
        output_mode: ``"classification"`` or ``"regression"``.

    Returns:
        An unbatched dataset yielding
        ``({"input_ids": ..., "attention_mask": ...}, label)`` per example.
        The label is declared as ``tf.int64`` in classification mode and as
        ``tf.float32`` in regression mode.

    Raises:
        KeyError: While converting an example, if ``output_mode`` is not one
            of the two supported values, or (classification mode) the
            example's label is not in ``label_list``.
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float]:
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True,
    )

    # The encoding maps each field name to one entry per example; regroup it
    # into one InputFeatures object per example.
    features = [
        InputFeatures(**{k: batch_encoding[k][i] for k in batch_encoding}, label=label)
        for i, label in enumerate(labels)
    ]

    # Regression targets are Python floats, so declaring the label component
    # as int64 would not preserve them; pick the dtype from the output mode.
    label_dtype = tf.float32 if output_mode == "regression" else tf.int64

    def gen():
        for ex in features:
            yield (
                {
                    "input_ids": ex.input_ids,
                    "attention_mask": ex.attention_mask
                },
                ex.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32}, label_dtype),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None])
            },
            tf.TensorShape([]),
        ))


class WordEntry:
    """Simple record pairing a ``words`` value with a ``result`` value."""

    def __init__(self, words, result):
        self.words, self.result = words, result

    def __str__(self):
        # Rendered exactly like the equivalent two-key dict.
        as_dict = dict(words=self.words, result=self.result)
        return str(as_dict)

    def __repr__(self):
        # The repr is deliberately the same text as str().
        return str(self)


def load_lt_grammar_dataset(path="/gdrive/My Drive/reviews4_large.txt"):
    """Parse a UTF-8 JSON file and return the decoded object.

    Args:
        path: File to read. Defaults to the review file on the mounted
            Google Drive, so existing zero-argument calls are unaffected.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    # The context manager closes the handle even if parsing fails.
    with open(path, "r", encoding="utf-8") as data_file:
        return json.load(data_file)

def load_input_examples_from_data(data : list):
    """Turn raw records into a list of ``InputExample`` objects.

    Every record must provide a ``'description'`` (joined with single spaces)
    and a ``'rating'`` (converted with ``int`` and used as the label). The
    joined text is supplied as both the first and the second positional
    argument of ``InputExample``.

    NOTE(review): if ``'description'`` is a plain string rather than a list of
    tokens, the join will put a space between individual characters — confirm
    the schema of the input file.
    """
    # First pass: extract both fields and coerce every rating before any example is built.
    parsed = [(record['description'], int(record['rating'])) for record in data]
    # Second pass: one example per record, in input order.
    return [
        InputExample(" ".join(words), " ".join(words), label=rating)
        for words, rating in parsed
    ]



5


In [0]:
# Read the raw review records; the down-sampling cell that follows iterates over `dataset`.
dataset = load_lt_grammar_dataset()

In [0]:
import random

# Down-sample the '5' ratings: such a review is kept only when a draw from 1..9 is
# greater than 4 (5 of the 9 outcomes); reviews with any other rating are always kept.
# A dedicated, seeded generator makes the selection identical on every run and leaves
# the global `random` state untouched.
sampling_rng = random.Random(42)

filtered_reviews = [
    review
    for review in dataset
    if review['rating'] != '5' or sampling_rng.randrange(1, 10) > 4
]

In [30]:
# Number of reviews remaining after the '5' ratings were down-sampled.
len(filtered_reviews)

2568

In [40]:
# Build InputExamples from the review file and split them 80/20 in file order.
# NOTE(review): this reloads the full file, so the down-sampled `filtered_reviews`
# list is not what gets trained on — confirm that is intended.
data = load_input_examples_from_data(load_lt_grammar_dataset())
split_at = int(len(data)*0.8)
train_data = data[:split_at][:BATCH_SIZE*150]  # at most 150 training batches
valid_data = data[split_at:][:BATCH_SIZE*3]    # at most 3 evaluation batches
train_examples = len(train_data)
valid_examples = len(valid_data)

# Tokenize each split into an unbatched tf.data.Dataset.
# NOTE(review): max_length=5 is forwarded to the tokenizer, so every review is
# encoded to at most five token ids — confirm this is not a debugging leftover.
train_dataset = convert_examples_to_features(train_data, tokenizer, max_length=5)
valid_dataset = convert_examples_to_features(valid_data, tokenizer, max_length=5)

# Shuffle across the whole training split (a 5-element buffer barely changes the
# order), then batch and repeat so fit() can draw steps_per_epoch batches per epoch.
# max(..., 1) keeps the buffer size valid when the split is empty.
train_dataset = train_dataset.shuffle(max(train_examples, 1)).batch(BATCH_SIZE).repeat(-1)
# Evaluation order does not affect the aggregated metrics, so no shuffle is needed.
valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)

# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
if USE_AMP:
    # loss scaling is currently required when using mixed precision
    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")


# A single output unit means regression; otherwise train on integer class ids.
if num_labels == 1:
    loss = tf.keras.losses.MeanSquaredError()
else:
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=opt, loss=loss, metrics=[metric])

# Train and evaluate using tf.keras.Model.fit()
# Step counts are the number of full batches in each split.
train_steps = train_examples // BATCH_SIZE
valid_steps = valid_examples // EVAL_BATCH_SIZE
print(train_steps)
print(valid_steps)


history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=train_steps,
    # The held-out split was prepared but never evaluated; pass it so every epoch
    # reports validation loss/accuracy. It is finite, so Keras consumes all of it.
    validation_data=valid_dataset if valid_examples else None,
)

# Save TF2 model
os.makedirs("./save/", exist_ok=True)
model.save_pretrained("./save/")

# if TASK == "mrpc":
#     # Load the TensorFlow model in PyTorch for inspection
#     # This is to demo the interoperability between the two frameworks, you don't have to
#     # do this in real life (you can run the inference on the TF model).
#     pytorch_model = BertForSequenceClassification.from_pretrained("./save/", from_tf=True)
#
#     # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
#     sentence_0 = "This research was consistent with his findings."
#     sentence_1 = "His findings were compatible with this research."
#     sentence_2 = "His findings were not compatible with this research."
#     inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors="pt")
#     inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors="pt")
#
#     pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
#     pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
#     print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
#     print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")


12
3
Epoch 1/3
Epoch 2/3
Epoch 3/3
