In [1]:
import os
USE_HPC=False
if USE_HPC:
    mycache_dir="/data/cmpe249-fa23/Huggingfacecache"
    os.environ['TRANSFORMERS_CACHE'] = mycache_dir
    os.environ['HF_HOME'] = mycache_dir
    os.environ['HF_DATASETS_CACHE'] = mycache_dir
    os.environ['http_proxy'] = "http://172.16.1.2:3128"
    os.environ['HTTP_PROXY'] = "http://172.16.1.2:3128"
    os.environ['https_proxy'] = "https://172.16.1.2:3128"
    os.environ['HTTPS_PROXY'] = "https://172.16.1.2:3128"
    trainoutput="/data/cmpe249-fa23/trainoutput/huggingface"
    taskname="eli5asksciencemodeling"
else:
    trainoutput="./output"
    taskname="eli5asksciencemodeling"

# Masked language modeling
Masked language modeling predicts a masked token in a sequence, and the model can attend to tokens bidirectionally. This means the model has full access to the tokens on the left and right. Masked language modeling is great for tasks that require a good contextual understanding of an entire sequence. BERT is an example of a masked language model.
https://huggingface.co/docs/transformers/tasks/masked_language_modeling

Finetune DistilRoBERTa on the r/askscience subset of the ELI5 dataset: https://huggingface.co/datasets/eli5

In [2]:
from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:5000]")

Downloading builder script:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

Downloading and preparing dataset eli5/LFQA_reddit to C:/Users/lkk68/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa...


Downloading:   0%|          | 0.00/3.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/576M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/286M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/36.2M [00:00<?, ?B/s]

Dataset eli5 downloaded and prepared to C:/Users/lkk68/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa. Subsequent calls will reuse this data.


In [3]:
eli5 = eli5.train_test_split(test_size=0.2)

In [4]:
eli5["train"][0]

{'q_id': 'iukwf',
 'title': 'Statics question',
 'selftext': "I'm taking statics and mechanics of materials and had a conceptual question that my professor didn't answer confidently. (new prof.) Anyway if you had say a cube in space and pushed it perpendicular to one side by the corner, would it experience translational motion and a moment or just translation motion?",
 'document': '',
 'subreddit': 'askscience',
 'answers': {'a_id': ['c26ra0j'],
  'text': ['A static force applied eccentric to the center of mass (c.m.) of an initially stationary cube in space would result in rotation and translation. In terms of statics, think about the "penalty" of "moving" the force *from* the corner over *to* the c.m., where this "penalty" is the moment.\nAs a terrestrial analogy, try pushing a ballon at an eccentric location.'],
  'score': [8]},
 'title_urls': {'url': []},
 'selftext_urls': {'url': []},
 'answers_urls': {'url': []}}

You’re only really interested in the text field (nested inside answers). What’s cool about language modeling tasks is you don’t need labels (also known as an unsupervised task) because the next word is the label.

In [5]:
#e xtract the text subfield from its nested structure with the flatten method:
eli5 = eli5.flatten()
eli5

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'title_urls.url', 'selftext_urls.url', 'answers_urls.url'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'title_urls.url', 'selftext_urls.url', 'answers_urls.url'],
        num_rows: 1000
    })
})

Each subfield is now a separate column as indicated by the answers prefix, and the text field is a list now

In [6]:
eli5["train"][0]

{'q_id': 'iukwf',
 'title': 'Statics question',
 'selftext': "I'm taking statics and mechanics of materials and had a conceptual question that my professor didn't answer confidently. (new prof.) Anyway if you had say a cube in space and pushed it perpendicular to one side by the corner, would it experience translational motion and a moment or just translation motion?",
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['c26ra0j'],
 'answers.text': ['A static force applied eccentric to the center of mass (c.m.) of an initially stationary cube in space would result in rotation and translation. In terms of statics, think about the "penalty" of "moving" the force *from* the corner over *to* the c.m., where this "penalty" is the moment.\nAs a terrestrial analogy, try pushing a ballon at an eccentric location.'],
 'answers.score': [8],
 'title_urls.url': [],
 'selftext_urls.url': [],
 'answers_urls.url': []}

In [7]:
from transformers import AutoTokenizer
modelname="distilroberta-base"
if USE_HPC:
    localpath=os.path.join(mycache_dir, modelname)
    tokenizer = AutoTokenizer.from_pretrained(localpath)
else:
    tokenizer = AutoTokenizer.from_pretrained(modelname)#, cache_dir=mycache_dir)

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [9]:
tokenizer(eli5["train"][0]['answers.text'])

{'input_ids': [[0, 347, 26593, 9866, 6, 8, 9, 768, 70, 8980, 3611, 32, 10, 5043, 9, 1164, 4, 1437, 616, 8825, 2368, 52, 206, 9, 9281, 7823, 25, 45059, 5224, 4748, 6, 42, 16, 444, 31, 1528, 4, 1437, 152, 16, 596, 2480, 40, 20147, 23, 10, 723, 5181, 23, 723, 9985, 36, 5412, 16, 7735, 8, 42, 16, 23, 18198, 3569, 6, 53, 1364, 25, 10, 205, 1246, 322, 50118, 50118, 11773, 18, 41, 1246, 9, 10, 48498, 14, 3374, 55, 11942, 2088, 223, 239, 9985, 35, 18134, 42703, 1215, 288, 47426, 50118, 50118, 15791, 1043, 9314, 33, 10, 30862, 45172, 14, 16, 10, 5043, 9, 5, 10875, 9, 80, 4204, 12957, 6, 98, 26640, 74, 7280, 14, 10875, 6, 2992, 24, 18, 30862, 45172, 4, 50118, 50118, 15248, 21491, 994, 32, 67, 10, 5043, 9, 49, 3989, 6, 98, 26640, 115, 483, 7, 10, 464, 89, 4, 50118, 50118, 100, 206, 5, 144, 505, 233, 74, 28, 14, 5, 3650, 9, 6214, 43951, 1022, 223, 1164, 6, 8, 14, 74, 22646, 17948, 5, 8917, 183, 4, 50118, 1215, 42703, 1215, 134, 1215, 2], [0, 133, 1164, 64, 464, 5, 3184, 9, 3183, 4, 9387, 59, 11720

In [8]:
examples=eli5["train"]

In [9]:
len(examples)

4000

In [10]:
listexamples = [" ".join(x) for x in examples["answers.text"]]

In [11]:
len(listexamples)

4000

In [12]:
token_train=tokenizer(listexamples)

Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors


In [13]:
token_train=tokenizer(listexamples, padding='max_length', truncation=True)

In [14]:
class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        return self.tokenizer(
            [" ".join(x) for x in examples["answers.text"]],
            padding="max_length",
            truncation=True,
        )

In [15]:
tokenizer_wrapper = TokenizerWrapper(tokenizer)

In [16]:
tokenized_dataset = eli5.map(tokenizer_wrapper.tokenize_function, batched=True, num_proc=3, remove_columns=eli5["train"].column_names)

Map (num_proc=3):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [17]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

concatenate all the sequences
split the concatenated sequences into shorter chunks defined by block_size, which should be both shorter than the maximum input length and short enough for your GPU RAM.

In [18]:
def group_texts(examples):
    block_size = 128
    # Concatenate all texts.
    #print(examples.keys())
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    #print('total_length:', total_length)
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [19]:
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [20]:
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4000
    })
})

Use the end-of-sequence token as the padding token and specify mlm_probability to randomly mask tokens each time you iterate over the data:

In [21]:
from transformers import DataCollatorForLanguageModeling

#tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [22]:
from transformers import AutoModelForMaskedLM
if USE_HPC:
    localpath=os.path.join(mycache_dir, modelname) #modelname="distilroberta-base"
    model = AutoModelForMaskedLM.from_pretrained(localpath)
else:
    model = AutoModelForMaskedLM.from_pretrained(modelname)#"distilroberta-base")

Downloading model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

In [23]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=os.path.join(trainoutput, modelname, taskname), #"./output/my_awesome_eli5_mlm_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=True,
)

In [24]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

Cloning https://huggingface.co/lkk688/eli5asksciencemodeling into local empty directory.


In [26]:
trainer.train()



  0%|          | 0/20000 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.331, 'learning_rate': 1.95e-05, 'epoch': 0.25}
{'loss': 2.231, 'learning_rate': 1.9e-05, 'epoch': 0.5}
{'loss': 2.2261, 'learning_rate': 1.8500000000000002e-05, 'epoch': 0.75}
{'loss': 2.2115, 'learning_rate': 1.8e-05, 'epoch': 1.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.992519497871399, 'eval_runtime': 7.6104, 'eval_samples_per_second': 525.598, 'eval_steps_per_second': 65.7, 'epoch': 1.0}
{'loss': 2.1583, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.25}
{'loss': 2.1447, 'learning_rate': 1.7e-05, 'epoch': 1.5}
{'loss': 2.1256, 'learning_rate': 1.65e-05, 'epoch': 1.75}
{'loss': 2.0949, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 2.0107181072235107, 'eval_runtime': 7.3769, 'eval_samples_per_second': 542.236, 'eval_steps_per_second': 67.78, 'epoch': 2.0}
{'loss': 2.1094, 'learning_rate': 1.55e-05, 'epoch': 2.25}
{'loss': 2.1017, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.5}
{'loss': 2.0655, 'learning_rate': 1.45e-05, 'epoch': 2.75}
{'loss': 2.105, 'learning_rate': 1.4e-05, 'epoch': 3.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.9571186304092407, 'eval_runtime': 7.1642, 'eval_samples_per_second': 558.33, 'eval_steps_per_second': 69.791, 'epoch': 3.0}
{'loss': 2.0274, 'learning_rate': 1.3500000000000001e-05, 'epoch': 3.25}
{'loss': 2.0416, 'learning_rate': 1.3000000000000001e-05, 'epoch': 3.5}
{'loss': 2.0081, 'learning_rate': 1.25e-05, 'epoch': 3.75}
{'loss': 2.0432, 'learning_rate': 1.2e-05, 'epoch': 4.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.9548934698104858, 'eval_runtime': 6.9702, 'eval_samples_per_second': 573.87, 'eval_steps_per_second': 71.734, 'epoch': 4.0}
{'loss': 1.9858, 'learning_rate': 1.15e-05, 'epoch': 4.25}
{'loss': 1.9906, 'learning_rate': 1.1000000000000001e-05, 'epoch': 4.5}
{'loss': 2.0183, 'learning_rate': 1.0500000000000001e-05, 'epoch': 4.75}
{'loss': 1.9822, 'learning_rate': 1e-05, 'epoch': 5.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.915711760520935, 'eval_runtime': 6.7874, 'eval_samples_per_second': 589.324, 'eval_steps_per_second': 73.665, 'epoch': 5.0}
{'loss': 1.9621, 'learning_rate': 9.5e-06, 'epoch': 5.25}
{'loss': 1.9472, 'learning_rate': 9e-06, 'epoch': 5.5}
{'loss': 1.945, 'learning_rate': 8.5e-06, 'epoch': 5.75}
{'loss': 1.9467, 'learning_rate': 8.000000000000001e-06, 'epoch': 6.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.9151719808578491, 'eval_runtime': 6.7893, 'eval_samples_per_second': 589.164, 'eval_steps_per_second': 73.645, 'epoch': 6.0}
{'loss': 1.9108, 'learning_rate': 7.500000000000001e-06, 'epoch': 6.25}
{'loss': 1.9699, 'learning_rate': 7e-06, 'epoch': 6.5}
{'loss': 1.934, 'learning_rate': 6.5000000000000004e-06, 'epoch': 6.75}
{'loss': 1.9089, 'learning_rate': 6e-06, 'epoch': 7.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.9423062801361084, 'eval_runtime': 6.8958, 'eval_samples_per_second': 580.064, 'eval_steps_per_second': 72.508, 'epoch': 7.0}
{'loss': 1.8791, 'learning_rate': 5.500000000000001e-06, 'epoch': 7.25}
{'loss': 1.9094, 'learning_rate': 5e-06, 'epoch': 7.5}
{'loss': 1.8813, 'learning_rate': 4.5e-06, 'epoch': 7.75}
{'loss': 1.9251, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.8912537097930908, 'eval_runtime': 6.9772, 'eval_samples_per_second': 573.297, 'eval_steps_per_second': 71.662, 'epoch': 8.0}
{'loss': 1.9213, 'learning_rate': 3.5e-06, 'epoch': 8.25}
{'loss': 1.8877, 'learning_rate': 3e-06, 'epoch': 8.5}
{'loss': 1.874, 'learning_rate': 2.5e-06, 'epoch': 8.75}
{'loss': 1.9104, 'learning_rate': 2.0000000000000003e-06, 'epoch': 9.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.8430209159851074, 'eval_runtime': 7.049, 'eval_samples_per_second': 567.457, 'eval_steps_per_second': 70.932, 'epoch': 9.0}
{'loss': 1.8848, 'learning_rate': 1.5e-06, 'epoch': 9.25}
{'loss': 1.8816, 'learning_rate': 1.0000000000000002e-06, 'epoch': 9.5}
{'loss': 1.8729, 'learning_rate': 5.000000000000001e-07, 'epoch': 9.75}
{'loss': 1.8856, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.8816810846328735, 'eval_runtime': 6.8146, 'eval_samples_per_second': 586.976, 'eval_steps_per_second': 73.372, 'epoch': 10.0}
{'train_runtime': 1486.8466, 'train_samples_per_second': 107.61, 'train_steps_per_second': 13.451, 'train_loss': 2.0059987731933595, 'epoch': 10.0}


TrainOutput(global_step=20000, training_loss=2.0059987731933595, metrics={'train_runtime': 1486.8466, 'train_samples_per_second': 107.61, 'train_steps_per_second': 13.451, 'train_loss': 2.0059987731933595, 'epoch': 10.0})

In [27]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/500 [00:00<?, ?it/s]

Perplexity: 6.64


In [28]:
text = "The Milky Way is a <mask> galaxy."

In [29]:
inputs = tokenizer(text, return_tensors="pt")

In [30]:
import torch
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_index

tensor([6])

In [31]:
model.device

device(type='cuda', index=0)

In [32]:
inputs=inputs.to('cuda')

In [33]:
logits = model(**inputs).logits

In [34]:
logits

tensor([[[ 3.5256, -4.3728,  5.6675,  ..., -2.0497, -1.7589,  5.0884],
         [ 4.5299, -3.5006, 13.1255,  ..., -0.5527,  0.3152,  6.1676],
         [-1.2306, -3.3158,  2.4637,  ..., -2.3856, -2.2163,  1.2203],
         ...,
         [-3.2669, -4.2269,  2.3295,  ..., -3.7995, -2.9306,  1.1487],
         [-4.2636, -4.8338,  7.9587,  ..., -2.0692, -1.4231, -1.2669],
         [ 3.5487, -4.9553, 20.0545,  ..., -0.8984, -4.0654,  5.8322]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [35]:
logits.shape

torch.Size([1, 10, 50265])

In [36]:
mask_token_logits = logits[0, mask_token_index, :]
mask_token_logits

tensor([[-3.6496, -3.9652,  3.4165,  ..., -2.4433, -4.3573,  0.9159]],
       device='cuda:0', grad_fn=<IndexBackward0>)

In [37]:
mask_token_logits.shape

torch.Size([1, 50265])

Then return the three masked tokens with the highest probability and print them out:

In [38]:
top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()

In [39]:
top_3_tokens

[21300, 2232, 30794]

In [40]:
for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))

The Milky Way is a  spiral galaxy.
The Milky Way is a  massive galaxy.
The Milky Way is a  dwarf galaxy.


# Causal Language modeling
Causal language models are frequently used for text generation. Causal language modeling predicts the next token in a sequence of tokens, and the model can only attend to tokens on the left. This means the model cannot see future tokens. GPT-2 is an example of a causal language model.
https://huggingface.co/docs/transformers/tasks/language_modeling

Finetune DistilGPT2 on the r/askscience subset of the ELI5 dataset: https://huggingface.co/datasets/eli5

In [41]:
from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:5000]")
eli5 = eli5.train_test_split(test_size=0.2)

Found cached dataset eli5 (C:/Users/lkk68/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [42]:
eli5 = eli5.flatten()

In [43]:
eli5["train"][0]

{'q_id': '14p632',
 'title': 'Can someone explain why this is a valid method of measuring the obesity rate?',
 'selftext': '',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['c7f5e1w'],
 'answers.text': ["The article is using information directly from the CDC: [Prevalence of Obesity Among Children and Adolescents: United States, Trends 1963-1965 Through 2007-2008](_URL_1_). As far as I can tell, the cutoff for obesity was fixed to the 2000 charts, so it makes sense to compare obesity rates across different years. Those charts are based on data gathered from 1963-1994. (You can find the charts and methods [here](_URL_0_).) Notice that it hits around the exact 5% obesity (which you'd expect using the 95th percentile) in the '70s."],
 'answers.score': [3],
 'title_urls.url': [],
 'selftext_urls.url': [],
 'answers_urls.url': ['http://www.cdc.gov/growthcharts/',
  'http://www.cdc.gov/nchs/data/hestat/obesity_child_07_08/obesity_child_07_08.htm']}

In [44]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

In [45]:
tokenizer.eos_token

'<|endoftext|>'

In [46]:
tokenizer.pad_token

Using pad_token, but it is not set yet.


In [52]:
tokenizer.pad_token = tokenizer.eos_token

In [48]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilgpt2") #different

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [49]:
class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        return self.tokenizer(
            [" ".join(x) for x in examples["answers.text"]],
            padding="max_length",
            truncation=True,
        )

In [50]:
tokenizer_wrapper = TokenizerWrapper(tokenizer)

In [53]:
tokenized_dataset = eli5.map(tokenizer_wrapper.tokenize_function, batched=True, num_proc=3, remove_columns=eli5["train"].column_names)

Map (num_proc=3):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [54]:
def group_texts(examples):
    block_size = 128
    # Concatenate all texts.
    #print(examples.keys())
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    #print('total_length:', total_length)
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [55]:
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [56]:
#use the same processed dataset used in MASKED LM
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 8000
    })
})

In [57]:
def addlabels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

In [58]:
lm_datasetlabels = lm_dataset.map(addlabels)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [59]:
lm_datasetlabels

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
})

Use the end-of-sequence token as the padding token and set mlm=False. This will use the inputs as labels shifted to the right by one element:

In [60]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [61]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model_gpt2 = AutoModelForCausalLM.from_pretrained("distilgpt2")

Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [62]:
training_args = TrainingArguments(
    output_dir="./output/my_awesome_eli5_clm-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    num_train_epochs=3
)

trainer = Trainer(
    model=model_gpt2,
    args=training_args,
    train_dataset=lm_datasetlabels["train"],
    eval_dataset=lm_datasetlabels["test"],
    data_collator=data_collator,
)

In [63]:
trainer.train()



  0%|          | 0/12000 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 3.9358, 'learning_rate': 1.916666666666667e-05, 'epoch': 0.12}
{'loss': 3.9018, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.25}
{'loss': 3.8573, 'learning_rate': 1.7500000000000002e-05, 'epoch': 0.38}
{'loss': 3.8621, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.5}
{'loss': 3.847, 'learning_rate': 1.5833333333333333e-05, 'epoch': 0.62}
{'loss': 3.8301, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.75}
{'loss': 3.8177, 'learning_rate': 1.416666666666667e-05, 'epoch': 0.88}
{'loss': 3.7892, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


  0%|          | 0/1000 [00:00<?, ?it/s]

{'eval_loss': 3.701878309249878, 'eval_runtime': 16.7099, 'eval_samples_per_second': 478.758, 'eval_steps_per_second': 59.845, 'epoch': 1.0}
{'loss': 3.6895, 'learning_rate': 1.25e-05, 'epoch': 1.12}
{'loss': 3.6914, 'learning_rate': 1.1666666666666668e-05, 'epoch': 1.25}
{'loss': 3.699, 'learning_rate': 1.0833333333333334e-05, 'epoch': 1.38}
{'loss': 3.6817, 'learning_rate': 1e-05, 'epoch': 1.5}
{'loss': 3.6866, 'learning_rate': 9.166666666666666e-06, 'epoch': 1.62}
{'loss': 3.6908, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.75}
{'loss': 3.685, 'learning_rate': 7.500000000000001e-06, 'epoch': 1.88}
{'loss': 3.6747, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


  0%|          | 0/1000 [00:00<?, ?it/s]

{'eval_loss': 3.686300277709961, 'eval_runtime': 16.6824, 'eval_samples_per_second': 479.548, 'eval_steps_per_second': 59.944, 'epoch': 2.0}
{'loss': 3.6389, 'learning_rate': 5.833333333333334e-06, 'epoch': 2.12}
{'loss': 3.6055, 'learning_rate': 5e-06, 'epoch': 2.25}
{'loss': 3.6351, 'learning_rate': 4.166666666666667e-06, 'epoch': 2.38}
{'loss': 3.6183, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.5}
{'loss': 3.6152, 'learning_rate': 2.5e-06, 'epoch': 2.62}
{'loss': 3.6212, 'learning_rate': 1.6666666666666667e-06, 'epoch': 2.75}
{'loss': 3.6252, 'learning_rate': 8.333333333333333e-07, 'epoch': 2.88}
{'loss': 3.6395, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/1000 [00:00<?, ?it/s]

{'eval_loss': 3.6846420764923096, 'eval_runtime': 16.7049, 'eval_samples_per_second': 478.9, 'eval_steps_per_second': 59.863, 'epoch': 3.0}
{'train_runtime': 780.2897, 'train_samples_per_second': 123.031, 'train_steps_per_second': 15.379, 'train_loss': 3.7224405619303385, 'epoch': 3.0}


TrainOutput(global_step=12000, training_loss=3.7224405619303385, metrics={'train_runtime': 780.2897, 'train_samples_per_second': 123.031, 'train_steps_per_second': 15.379, 'train_loss': 3.7224405619303385, 'epoch': 3.0})

In [64]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/1000 [00:00<?, ?it/s]

Perplexity: 39.83


trainer.push_to_hub()

In [65]:
prompt = "Somatic hypermutation allows the immune system to"

In [66]:
from transformers import pipeline

generator = pipeline("text-generation", model="distilgpt2")
generator(prompt)

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Somatic hypermutation allows the immune system to distinguish different pathogens such as leukaemia from leukaemia in humans. However, we did not investigate any of the possible pathogenic diseases in the immune system because of these issues.\n\n\n'}]

In [67]:
inputs = tokenizer(prompt, return_tensors="pt").input_ids

In [68]:
inputs=inputs.to('cuda')

In [69]:
inputs.device

device(type='cuda', index=0)

In [70]:
model_gpt2.device

device(type='cuda', index=0)

In [74]:
outputs = model_gpt2.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95, pad_token_id=generator.tokenizer.eos_token_id)

In [75]:
outputs

tensor([[   50, 13730,  8718,    76,  7094,  3578,   262, 10900,  1080,   284,
           787,   517,  4050, 13820,   621,   262,   517, 16378,  3513,   714,
           307,   329,   262,  5827,    13,   220,   770,   318,   257,  1180,
          3164,   422,   366,  3866,  1151,   425,  5010,   460,   470,   670,
             1,   357,  4480,   281,  4050,  2563,   852,   262,   717,    11,
           290,   262,  5827,   743,   307,   287,   262,  1218,   737,   220,
           770,  5983,   284,   517,  3871,  4203,  6563,   546,  2263,   257,
          2563,    13,   220,   770,    11,   287,   584,  2456,    11,  5983,
           284,   517,  3871,  4203,  1365,   546,  2263,  5010,    11,   543,
          2482,   287,   281,  2620,   287,   262,  1271,   286,  1744, 13820,
            13,   220,   770,  5983,   284,  3649,   511,  6628,   287,   428]],
       device='cuda:0')

In [76]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Somatic hypermutation allows the immune system to make more effective treatments than the more costly treatment could be for the patient.  This is a different approach from "preventive drugs can\'t work" (with an effective drug being the first, and the patient may be in the second).  This leads to more patients feeling confident about taking a drug.  This, in other words, leads to more patients feeling better about taking drugs, which results in an increase in the number of possible treatments.  This leads to increasing their confidence in this']