In [None]:
from transformers import BioGptForCausalLM, BioGptTokenizer, pipeline, set_seed
import torch

# Device index handed to the pipeline: 0 when CUDA is usable, otherwise -1.
device = -1
if torch.cuda.is_available():
    device = 0

# Model weights and tokenizer are both loaded from the same BioGPT checkpoint.
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")

# Wrap the model/tokenizer pair in a text-generation pipeline.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Fix the random seed before any sampling is performed with `generator`.
set_seed(42)

In [None]:
# Sample five continuations of the prompt (do_sample=True, num_return_sequences=5),
# each limited by max_length=20; the cell displays the returned value.
generator("COVID is", max_length=20, num_return_sequences=5, do_sample=True)

In [None]:
from transformers import OPTForCausalLM, AutoTokenizer
from transformers import pipeline
import torch

# Device index handed to the pipeline: 0 when CUDA is usable, otherwise -1.
device = -1
if torch.cuda.is_available():
    device = 0

# Rebinds `generator` to a pipeline built directly from the OPT-125m checkpoint name.
generator = pipeline(
    "text-generation",
    model="facebook/opt-125m",
    device=device,
)

# Generate from the prompt with the pipeline's default settings and display the result.
generator("COVID-19 is")

In [1]:
import transformers
from datasets import load_dataset
# Load the raw WikiText-2 configuration; later cells index the result by split
# name ("train", "validation").
# NOTE(review): this variable shares its name with the `datasets` package, so a
# later `import datasets` in this kernel would overwrite it.
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset wikitext (/kuacc/users/oince22/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
100%|██████████| 3/3 [00:00<00:00, 280.02it/s]


In [2]:
# Peek at one raw training record (index 10); each record is a dict with a 'text' field.
datasets["train"][10]

{'text': ' The game \'s battle system , the BliTZ system , is carried over directly from Valkyira Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills unique to each character . They are divided into " Personal Potential " , which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede

In [3]:
# Name of the pretrained checkpoint; reused below to load both the tokenizer and the model.
model_checkpoint = "facebook/opt-125m"

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to `model_checkpoint`, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [5]:
def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` entry of ``examples``.

    Args:
        examples: Mapping that contains a ``"text"`` key (a batch of raw texts
            when used with a batched ``map``).

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts)

In [6]:
# Tokenize every split in batches using 4 worker processes, and drop the raw
# "text" column so only the tokenizer outputs remain.
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

Loading cached processed dataset at /kuacc/users/oince22/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-da8557135aeb74b1_*_of_00004.arrow
Loading cached processed dataset at /kuacc/users/oince22/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-f8bdc15c4a9bd38e_*_of_00004.arrow
Loading cached processed dataset at /kuacc/users/oince22/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-8b286c84fddf8650_*_of_00004.arrow


In [7]:
# Inspect one tokenized training row (index 1): token ids plus the attention mask.
tokenized_datasets["train"][1]

{'input_ids': [2, 5457, 468, 44068, 6374, 41674, 6395, 5457, 1437, 50118],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
# Length (in tokens) of each block that group_texts cuts the concatenated token stream into.
block_size = 100

In [9]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized sequences and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example sequences
            (for instance ``input_ids`` and ``attention_mask``). It must
            contain an ``"input_ids"`` key.
        chunk_size: Length of every output block. When omitted, the
            notebook-level ``block_size`` is used, so existing
            ``map(group_texts, ...)`` calls keep working unchanged.

    Returns:
        A dict with the same keys as ``examples`` whose values are lists of
        ``chunk_size``-long lists, plus a ``"labels"`` key holding a shallow
        copy of the ``"input_ids"`` list of blocks.

    Raises:
        ValueError: If the effective block length is not a positive integer.
    """
    from itertools import chain

    size = block_size if chunk_size is None else chunk_size
    if size <= 0:
        raise ValueError(f"block size must be positive, got {size}")

    # Flatten each column in linear time; summing lists with ``sum(..., [])``
    # copies the growing accumulator on every step and is quadratic.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder that does not fill a whole block. Padding could
    # be used instead if the model supports it; customize this to your needs.
    total_length = (total_length // size) * size
    # Split every column into consecutive blocks of ``size`` items.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are the input ids themselves (copy of the outer list only).
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
# Re-chunk the tokenized splits into fixed-length blocks with group_texts.
# group_texts receives batches of 1000 rows at a time, so the leftover tokens
# that do not fill a whole block are dropped once per batch; 4 worker
# processes are used.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Loading cached processed dataset at /kuacc/users/oince22/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-678af6f1f88d58c8_*_of_00004.arrow
Loading cached processed dataset at /kuacc/users/oince22/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-9c78de5cc63e43ff_*_of_00004.arrow
Loading cached processed dataset at /kuacc/users/oince22/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-84367dd2ad526a0e_*_of_00004.arrow


In [11]:
# Sanity check: decode one grouped block back to text. Because texts were
# concatenated before chunking, a block may span more than one original text.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

' in the Valkyria series. Employing the same fusion of tactical and real @-@ time gameplay as its predecessors, the story runs parallel to the first game and follows the " Nameless ", a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven ". \n</s> The game began development in 2010, carrying over a large portion of the work done on Valkyria'

In [12]:
from transformers import AutoModelForCausalLM
# Load the causal language model from the same checkpoint as the tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [13]:
from transformers import Trainer, TrainingArguments

In [14]:
# Use the part of the checkpoint name after the last "/" to name the output directory.
model_name = model_checkpoint.split("/")[-1]
# Training configuration: evaluate once per epoch, with the learning rate and
# weight decay set explicitly; every other option keeps its library default.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    f"{model_name}-finetuned-wikitext2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [15]:
# Build the Trainer from the model, the training arguments, and the grouped
# train/validation splits (which already carry a "labels" column).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [16]:
import math
# Baseline before fine-tuning: evaluate on the validation split and report
# perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 94.76


In [16]:
# Fine-tune the model with the configured Trainer; the cell displays the returned training summary.
trainer.train()



Epoch,Training Loss,Validation Loss
1,3.3722,3.4192
2,3.1212,3.407074
3,2.9809,3.411147


TrainOutput(global_step=9102, training_loss=3.182782315023913, metrics={'train_runtime': 2223.3762, 'train_samples_per_second': 32.742, 'train_steps_per_second': 4.094, 'total_flos': 3715144012800000.0, 'train_loss': 3.182782315023913, 'epoch': 3.0})

In [17]:
import math
# After fine-tuning: re-evaluate on the validation split and report perplexity
# as exp(eval_loss), for comparison with the pre-training baseline.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 30.30
