In [26]:
import os
USE_HPC=True
if USE_HPC:
    mycache_dir="/data/cmpe249-fa23/Huggingfacecache"
    os.environ['TRANSFORMERS_CACHE'] = mycache_dir
    os.environ['HF_HOME'] = mycache_dir
    os.environ['HF_DATASETS_CACHE'] = mycache_dir
    os.environ['http_proxy'] = "http://172.16.1.2:3128"
    os.environ['HTTP_PROXY'] = "http://172.16.1.2:3128"
    os.environ['https_proxy'] = "https://172.16.1.2:3128"
    os.environ['HTTPS_PROXY'] = "https://172.16.1.2:3128"
    trainoutput="/data/cmpe249-fa23/trainoutput/huggingface"
    taskname="eli5asksciencemodeling"
else:
    trainoutput="./output"
    taskname="eli5asksciencemodeling"

# Masked language modeling
Masked language modeling predicts a masked token in a sequence, and the model can attend to tokens bidirectionally. This means the model has full access to the tokens on the left and right. Masked language modeling is great for tasks that require a good contextual understanding of an entire sequence. BERT is an example of a masked language model.
https://huggingface.co/docs/transformers/tasks/masked_language_modeling

Finetune DistilRoBERTa on the r/askscience subset of the ELI5 dataset: https://huggingface.co/datasets/eli5

In [4]:
from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:5000]")

Using the latest cached version of the module from /data/cmpe249-fa23/Huggingfacecache/modules/datasets_modules/datasets/eli5/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa (last modified on Sun Nov 12 14:31:59 2023) since it couldn't be found locally at eli5., or remotely on the Hugging Face Hub.


In [5]:
eli5 = eli5.train_test_split(test_size=0.2)

In [6]:
eli5["train"][0]

{'q_id': '215ylp',
 'title': 'Why do electronic instruments fail under high pressure, such as descending into jupiter?',
 'selftext': 'What is it in electronics that would make something like a resistor or a IC chip fail in the high pressure of a Jovian descent? They don\'t have an "internal pressure" that is overcome by atmospheric pressure, so what makes them fail?\n\nI can see why a descent into Venus would melt your circuit boards and your flux capacitors into silicon slag due to high *temperature*, but how does *pressure* wreak havoc on electronics?\n',
 'document': '',
 'subreddit': 'askscience',
 'answers': {'a_id': ['cga5vte', 'cga5mqj'],
  'text': ["Conductivity, and of course all electrical properties are a function of pressure.  While conventionally we think of solids as incompressible, this is far from true.  This is why ice will melt at a higher temperature at higher pressures (water is weird and this is atypical, but works as a good example).\n\nHere's an example of a res

You’re only really interested in the text field (nested inside answers). What’s cool about language modeling tasks is you don’t need labels (also known as an unsupervised task) because the next word is the label.

In [7]:
#e xtract the text subfield from its nested structure with the flatten method:
eli5 = eli5.flatten()
eli5

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'title_urls.url', 'selftext_urls.url', 'answers_urls.url'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'title_urls.url', 'selftext_urls.url', 'answers_urls.url'],
        num_rows: 1000
    })
})

Each subfield is now a separate column as indicated by the answers prefix, and the text field is a list now

In [8]:
eli5["train"][0]

{'q_id': '215ylp',
 'title': 'Why do electronic instruments fail under high pressure, such as descending into jupiter?',
 'selftext': 'What is it in electronics that would make something like a resistor or a IC chip fail in the high pressure of a Jovian descent? They don\'t have an "internal pressure" that is overcome by atmospheric pressure, so what makes them fail?\n\nI can see why a descent into Venus would melt your circuit boards and your flux capacitors into silicon slag due to high *temperature*, but how does *pressure* wreak havoc on electronics?\n',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['cga5vte', 'cga5mqj'],
 'answers.text': ["Conductivity, and of course all electrical properties are a function of pressure.  While conventionally we think of solids as incompressible, this is far from true.  This is why ice will melt at a higher temperature at higher pressures (water is weird and this is atypical, but works as a good example).\n\nHere's an example of a 

In [3]:
from transformers import AutoTokenizer
modelname="distilroberta-base"
if USE_HPC:
    localpath=os.path.join(mycache_dir, modelname)
    tokenizer = AutoTokenizer.from_pretrained(localpath)
else:
    tokenizer = AutoTokenizer.from_pretrained(modelname)#, cache_dir=mycache_dir)

In [9]:
tokenizer(eli5["train"][0]['answers.text'])

{'input_ids': [[0, 347, 26593, 9866, 6, 8, 9, 768, 70, 8980, 3611, 32, 10, 5043, 9, 1164, 4, 1437, 616, 8825, 2368, 52, 206, 9, 9281, 7823, 25, 45059, 5224, 4748, 6, 42, 16, 444, 31, 1528, 4, 1437, 152, 16, 596, 2480, 40, 20147, 23, 10, 723, 5181, 23, 723, 9985, 36, 5412, 16, 7735, 8, 42, 16, 23, 18198, 3569, 6, 53, 1364, 25, 10, 205, 1246, 322, 50118, 50118, 11773, 18, 41, 1246, 9, 10, 48498, 14, 3374, 55, 11942, 2088, 223, 239, 9985, 35, 18134, 42703, 1215, 288, 47426, 50118, 50118, 15791, 1043, 9314, 33, 10, 30862, 45172, 14, 16, 10, 5043, 9, 5, 10875, 9, 80, 4204, 12957, 6, 98, 26640, 74, 7280, 14, 10875, 6, 2992, 24, 18, 30862, 45172, 4, 50118, 50118, 15248, 21491, 994, 32, 67, 10, 5043, 9, 49, 3989, 6, 98, 26640, 115, 483, 7, 10, 464, 89, 4, 50118, 50118, 100, 206, 5, 144, 505, 233, 74, 28, 14, 5, 3650, 9, 6214, 43951, 1022, 223, 1164, 6, 8, 14, 74, 22646, 17948, 5, 8917, 183, 4, 50118, 1215, 42703, 1215, 134, 1215, 2], [0, 133, 1164, 64, 464, 5, 3184, 9, 3183, 4, 9387, 59, 11720

In [10]:
examples=eli5["train"]

In [11]:
len(examples)

4000

In [12]:
listexamples = [" ".join(x) for x in examples["answers.text"]]

In [13]:
len(listexamples)

4000

In [14]:
token_train=tokenizer(listexamples)

Token indices sequence length is longer than the specified maximum sequence length for this model (951 > 512). Running this sequence through the model will result in indexing errors


In [15]:
token_train=tokenizer(listexamples, padding='max_length', truncation=True)

In [16]:
class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        return self.tokenizer(
            [" ".join(x) for x in examples["answers.text"]],
            padding="max_length",
            truncation=True,
        )

In [17]:
tokenizer_wrapper = TokenizerWrapper(tokenizer)

In [18]:
tokenized_dataset = eli5.map(tokenizer_wrapper.tokenize_function, batched=True, num_proc=3, remove_columns=eli5["train"].column_names)

Map (num_proc=3): 100%|██████████| 4000/4000 [00:01<00:00, 2627.16 examples/s]
Map (num_proc=3): 100%|██████████| 1000/1000 [00:00<00:00, 1669.12 examples/s]


In [19]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

concatenate all the sequences
split the concatenated sequences into shorter chunks defined by block_size, which should be both shorter than the maximum input length and short enough for your GPU RAM.

In [20]:
def group_texts(examples):
    block_size = 128
    # Concatenate all texts.
    #print(examples.keys())
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    #print('total_length:', total_length)
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [21]:
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4): 100%|██████████| 4000/4000 [00:02<00:00, 1501.58 examples/s]
Map (num_proc=4): 100%|██████████| 1000/1000 [00:00<00:00, 2362.36 examples/s]


In [22]:
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4000
    })
})

Use the end-of-sequence token as the padding token and specify mlm_probability to randomly mask tokens each time you iterate over the data:

In [23]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [24]:
from transformers import AutoModelForMaskedLM
if USE_HPC:
    localpath=os.path.join(mycache_dir, modelname) #modelname="distilroberta-base"
    model = AutoModelForMaskedLM.from_pretrained(localpath)
else:
    model = AutoModelForMaskedLM.from_pretrained(modelname)#"distilroberta-base")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at /data/cmpe249-fa23/Huggingfacecache/distilroberta-base and are newly initialized: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=os.path.join(trainoutput, modelname, taskname), #"./output/my_awesome_eli5_mlm_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=True,
)

In [34]:
os.environ['http_proxy'] = "http://172.16.1.2:3128"
os.environ['HTTP_PROXY'] = "http://172.16.1.2:3128"
os.environ['https_proxy'] = ""
os.environ['HTTPS_PROXY'] = ""

In [35]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [36]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


ValueError: Token is required (write-access action) but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.

In [66]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.992,0.954211
2,0.9878,0.944883
3,0.9639,0.919972


TrainOutput(global_step=6000, training_loss=1.0057990061442057, metrics={'train_runtime': 516.3088, 'train_samples_per_second': 92.968, 'train_steps_per_second': 11.621, 'total_flos': 1591461679104000.0, 'train_loss': 1.0057990061442057, 'epoch': 3.0})

In [67]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 2.52


In [68]:
text = "The Milky Way is a <mask> galaxy."

In [69]:
inputs = tokenizer(text, return_tensors="pt")

In [71]:
import torch
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_index

tensor([6])

In [72]:
model.device

device(type='cuda', index=0)

In [74]:
inputs=inputs.to('cuda')

In [75]:
logits = model(**inputs).logits

In [76]:
logits

tensor([[[ 3.5724,  4.5370,  4.9301,  ..., -2.8385, -1.0650,  5.4275],
         [ 2.4924,  1.4412, 11.5969,  ..., -1.1933,  0.2912,  6.0798],
         [-2.7016,  1.9855,  1.9075,  ..., -3.1717, -2.3135,  1.6989],
         ...,
         [-4.3938,  1.0695,  1.9510,  ..., -4.3306, -2.3445,  1.6941],
         [-4.5013,  1.1263,  8.4404,  ..., -3.8394, -2.0046, -0.3957],
         [ 1.5742, 12.4126,  8.0504,  ...,  0.6924,  0.8871,  6.6534]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [77]:
logits.shape

torch.Size([1, 10, 50265])

In [78]:
mask_token_logits = logits[0, mask_token_index, :]
mask_token_logits

tensor([[-3.2886, -1.1666,  3.3591,  ..., -2.2179, -3.5258,  1.5552]],
       device='cuda:0', grad_fn=<IndexBackward0>)

In [79]:
mask_token_logits.shape

torch.Size([1, 50265])

Then return the three masked tokens with the highest probability and print them out:

In [80]:
top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()

In [81]:
top_3_tokens

[21300, 2232, 30794]

In [82]:
for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))

The Milky Way is a  spiral galaxy.
The Milky Way is a  massive galaxy.
The Milky Way is a  dwarf galaxy.


# Causal Language modeling
Causal language models are frequently used for text generation. Causal language modeling predicts the next token in a sequence of tokens, and the model can only attend to tokens on the left. This means the model cannot see future tokens. GPT-2 is an example of a causal language model.
https://huggingface.co/docs/transformers/tasks/language_modeling

Finetune DistilGPT2 on the r/askscience subset of the ELI5 dataset: https://huggingface.co/datasets/eli5

In [6]:
from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:5000]")
eli5 = eli5.train_test_split(test_size=0.2)

Found cached dataset eli5 (C:/Users/lkk68/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [7]:
eli5 = eli5.flatten()

In [8]:
eli5["train"][0]

{'q_id': '7i6w9r',
 'title': "What's the purpose of delayed-release Naproxen?",
 'selftext': '',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['dqydn2w', 'dqy120t'],
 'answers.text': ['The above answer is not truly correct, and more so described extended release tablets. Delayed release tablets are similar to instant release tablets, except they have an enteric coating which delays drug release in the stomach. The enteric coating has several uses. The primary use is to avoid toxicity to the stomach lining. Naproxen is an NSAID that can lead to ulcers (damage to stomach lining).. The chemistry of the enteric coating resists the stomach acid, but once it arrives in the small intestine the drug releases and is absorbed. \n\nIf a tablet is ingested without this enteric coating, the release is not delayed and drug dissolution begins in the stomach. This is not an issue for most people, but those with ulcers, gastrointestinal issues, or those take a lot of NSAIDs will probab

In [21]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

In [25]:
tokenizer.pad_token = tokenizer.eos_token

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilgpt2") #different

In [22]:
class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        return self.tokenizer(
            [" ".join(x) for x in examples["answers.text"]],
            padding="max_length",
            truncation=True,
        )

In [23]:
tokenizer_wrapper = TokenizerWrapper(tokenizer)

In [26]:
tokenized_dataset = eli5.map(tokenizer_wrapper.tokenize_function, batched=True, num_proc=3, remove_columns=eli5["train"].column_names)

Map (num_proc=3):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [27]:
def group_texts(examples):
    block_size = 128
    # Concatenate all texts.
    #print(examples.keys())
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    #print('total_length:', total_length)
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [28]:
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [29]:
#use the same processed dataset used in MASKED LM
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 8000
    })
})

In [30]:
def addlabels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

In [31]:
lm_datasetlabels = lm_dataset.map(addlabels)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [32]:
lm_datasetlabels

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
})

Use the end-of-sequence token as the padding token and set mlm=False. This will use the inputs as labels shifted to the right by one element:

In [33]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [42]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model_gpt2 = AutoModelForCausalLM.from_pretrained("distilgpt2")

In [43]:
training_args = TrainingArguments(
    output_dir="./output/my_awesome_eli5_clm-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    num_train_epochs=3
)

trainer = Trainer(
    model=model_gpt2,
    args=training_args,
    train_dataset=lm_datasetlabels["train"],
    eval_dataset=lm_datasetlabels["test"],
    data_collator=data_collator,
)

In [44]:

trainer.train()



Epoch,Training Loss,Validation Loss
1,3.8081,3.726445
2,3.6842,3.708932
3,3.6509,3.707046


TrainOutput(global_step=12000, training_loss=3.72333686319987, metrics={'train_runtime': 746.6018, 'train_samples_per_second': 128.583, 'train_steps_per_second': 16.073, 'total_flos': 3135561007104000.0, 'train_loss': 3.72333686319987, 'epoch': 3.0})

In [45]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 40.73


trainer.push_to_hub()

In [46]:
prompt = "Somatic hypermutation allows the immune system to"

In [53]:
from transformers import pipeline

generator = pipeline("text-generation", model="distilgpt2")
generator(prompt)

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Somatic hypermutation allows the immune system to distinguish different pathogens such as leukaemia from leukaemia in humans. However, we did not investigate any of the possible pathogenic diseases in the immune system because of these issues.\n\n\n'}]

In [47]:
inputs = tokenizer(prompt, return_tensors="pt").input_ids

In [48]:
inputs=inputs.to('cuda')

In [49]:
inputs.device

device(type='cuda', index=0)

In [50]:
model_gpt2.device

device(type='cuda', index=0)

In [51]:
outputs = model_gpt2.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [52]:
outputs

tensor([[   50, 13730,  8718,    76,  7094,  3578,   262, 10900,  1080,   284,
           779,   606,   284, 41229,   572, 38366,   422,  4369,    13,   632,
           635,  1724,   326,   611,   345,   804,   379,   262, 35757,  2974,
           287,   262,  3632,    11,   345,   460,   766,   326,   530,   318,
          1682,  1016,   284,   307,  4047, 18290,   284,   477,  6982,   286,
         38366,    13,   198,   198,  6943, 20547,  4327,   284,   307,   366,
         38345,     1,   290,  4143,   691,  7580,  1728,  3354,   286,   262,
          1692,  1767,    13,   632,   338,   407,   281,  3489,  3572,    11,
           780,   981,   345,   460,   787,   257,  1256,   286, 30869,   416,
          2045,   329,   606,    11,   340,  2331,  5340,   284,   466,  2279,
           290,   484,  1183,  4143,   307,   517, 17769,   621,   262, 20547]],
       device='cuda:0')

In [53]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Somatic hypermutation allows the immune system to use them to fend off pathogens from disease. It also means that if you look at the antibody levels in the brain, you can see that one is actually going to be highly resistant to all kinds of pathogens.\n\nMost viruses tend to be "immune" and generally only infect certain parts of the human body. It\'s not an obvious choice, because while you can make a lot of antibodies by looking for them, it seems impossible to do everything and they\'ll generally be more lethal than the viruses']