In [12]:
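# Hugging Face Hub login (left disabled). Uncomment and run these lines if this
# environment is not already authenticated with the Hub: the training arguments
# further down set push_to_hub=True and the notebook later calls trainer.push_to_hub().
# from huggingface_hub import notebook_login

# notebook_login()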

In [2]:
from datasets import load_dataset

# Identifier of the dataset to fetch.
dataset_id = 'ag_news'

# Download (or reuse a cached copy of) the dataset; the result is displayed
# as the cell output so the available splits and columns can be inspected.
raw_datasets = load_dataset(dataset_id)
raw_datasets

Downloading readme: 100%|██████████| 8.07k/8.07k [00:00<00:00, 22.9MB/s]
Downloading data: 100%|██████████| 18.6M/18.6M [00:00<00:00, 74.5MB/s]
Downloading data: 100%|██████████| 1.23M/1.23M [00:00<00:00, 24.3MB/s]
Generating train split: 100%|██████████| 120000/120000 [00:00<00:00, 447115.81 examples/s]
Generating test split: 100%|██████████| 7600/7600 [00:00<00:00, 540925.00 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [3]:
# Class id to keep.
# NOTE(review): presumably the "Business" class of ag_news — confirm against the dataset card.
target_label = 2

# Keep only rows of the target class, then drop the now-constant 'label'
# column so that only the raw text remains.
filtered_datasets = (
    raw_datasets
    .filter(lambda row: row['label'] == target_label)
    .remove_columns('label')
)

Filter: 100%|██████████| 120000/120000 [00:00<00:00, 296531.28 examples/s]
Filter: 100%|██████████| 7600/7600 [00:00<00:00, 276561.11 examples/s]


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Base checkpoint that is fine-tuned in this notebook.
model_id = 'gpt2'

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the tokenizer and reuse its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load the causal language model and place it on the selected device.
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to(device)

In [5]:
def tokenize_function(batch):
    """Tokenize the ``'text'`` entry of ``batch`` with the notebook-level ``tokenizer``.

    Truncation is enabled and no padding argument is passed, so padding is
    left to a later step.

    Args:
        batch: Mapping that provides the texts to encode under the key ``'text'``.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize every split in batches; the raw 'text' column is dropped so that
# only the tokenizer's outputs remain as features.
tokenized_datasets = filtered_datasets.map(tokenize_function, batched=True, remove_columns=['text'])

# Display the resulting splits and their features.
tokenized_datasets

Map: 100%|██████████| 30000/30000 [00:03<00:00, 7955.06 examples/s] 
Map: 100%|██████████| 1900/1900 [00:00<00:00, 9559.11 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1900
    })
})

In [6]:
from transformers import DataCollatorForLanguageModeling

# Collator that turns a list of tokenized examples into one batch.
# mlm=False turns off the masked-language-modeling mode of the collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [7]:
# Take the first three tokenized training examples and report how many
# token ids each one contains.
train_split = tokenized_datasets['train']
samples = [train_split[idx] for idx in range(3)]

for sample in samples:
    n_tokens = len(sample['input_ids'])
    print(f"input_ids shape: {n_tokens}")

input_ids shape: 37
input_ids shape: 55
input_ids shape: 51


In [8]:
# Collate the sample examples into a single batch and print the shape of
# every entry the collator produced.
out = data_collator(samples)
for key, value in out.items():
    print(f"{key} shape: {value.shape}")

input_ids shape: torch.Size([3, 55])
attention_mask shape: torch.Size([3, 55])
labels shape: torch.Size([3, 55])


In [9]:
from transformers import TrainingArguments, Trainer

# Mixed-precision (fp16) training needs a CUDA device. The notebook explicitly
# falls back to the CPU when no GPU is available, so only request fp16 when
# CUDA is actually present; otherwise TrainingArguments rejects the config.
# `torch` is imported in the model-loading cell above.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="sft_cml4",           # local checkpoint directory
    push_to_hub=True,                # upload to the Hub (requires Hub authentication)
    per_device_train_batch_size=3,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    num_train_epochs=2,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=200,                  # evaluate every 200 optimizer steps
    logging_steps=200,               # log the training loss at the same cadence
    fp16=use_fp16,
)

In [13]:
# Train on the first 5000 tokenized training examples and evaluate on the
# full tokenized test split.
train_subset = tokenized_datasets["train"].select(range(5000))
eval_split = tokenized_datasets["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_subset,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
)

In [14]:
import gc

# Run the garbage collector BEFORE emptying the CUDA cache: memory that is still
# held by unreachable-but-uncollected Python objects counts as occupied, so
# emptying the cache first would leave those blocks cached.
unreachable_objects = gc.collect()
torch.cuda.empty_cache()

# Show how many unreachable objects the collector found as the cell output.
unreachable_objects

2777

In [15]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# is displayed as the cell result.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpwauyo[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
200,4.1822,3.98803
400,3.9009,3.92478
600,3.7846,3.932813
800,3.7095,3.839306
1000,3.5043,3.813025
1200,3.4826,3.760766
1400,3.3511,3.699686
1600,3.3243,3.634866
1800,2.5235,3.782559
2000,2.0758,3.762716


TrainOutput(global_step=3334, training_loss=2.821503454436066, metrics={'train_runtime': 2213.6817, 'train_samples_per_second': 4.517, 'train_steps_per_second': 1.506, 'total_flos': 366881060736000.0, 'train_loss': 2.821503454436066, 'epoch': 2.0})

In [16]:
# Upload the trained model to the Hugging Face Hub; the returned CommitInfo
# is displayed as the cell result.
trainer.push_to_hub()

model.safetensors: 100%|██████████| 498M/498M [00:09<00:00, 53.2MB/s] 


CommitInfo(commit_url='https://huggingface.co/pbwauyo/sft_cml4/commit/1c770e93233ecd057902367244166d087228d79b', commit_message='End of training', commit_description='', oid='1c770e93233ecd057902367244166d087228d79b', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
from transformers import pipeline

# Load the fine-tuned checkpoint from the Hub into a text-generation pipeline
# on the device selected earlier in the notebook.
pipe = pipeline('text-generation', model='pbwauyo/sft_cml4', device=device)

# Reuse the end-of-sequence token for padding, mirroring the tokenizer setup
# used for training. The id is read from the pipeline's own tokenizer instead
# of hard-coding 50256 or depending on the training cell's `tokenizer` global.
eos_token_id = pipe.tokenizer.eos_token_id
pipe.tokenizer.pad_token_id = eos_token_id

# Short prompts used to spot-check what the fine-tuned model generates.
prompts = ["Q1", "Wall", "Google"]

for prompt in prompts:
    generated = pipe(prompt, pad_token_id=eos_token_id)[0]["generated_text"]
    print(generated, '\n')

Q1 Profit Narrows, Outlook Trails Estimates  CHICAGO (Reuters) - Q2 profit grew more than expected, driven by  stronger earnings in technology and a  narrower-than-expected reading in gasoline prices, the University of 

Wall Street sees rise in third quarter Profit London lifted by a rise in US gasoline prices, and Wall Street sees a rebound.   NEW YORK (Reuters) - Investors will watch for news from  Russia, Iraq and Wall Street on Thursday, after 

Google IPO begins public Google surged today on the eve of the most anticipated IPO in tech history. The stock #39;s deal with Google Inc. will raise \$1.67 billion, or nearly \$1.67 billion, over its 

