In [1]:
%pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.10.10-cp310-cp310-win_amd64.whl.metadata (7.8 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Using cached aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Colle


[notice] A new release of pip is available: 24.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling, AutoTokenizer
from datasets import load_dataset

checkpoint = "gpt2"

model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
datacollator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [19]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [34]:
raw_dataset = load_dataset("json", data_files='./stories.json')

Generating train split: 1000 examples [00:00, 8207.08 examples/s]


In [35]:
dataset = raw_dataset['train'].train_test_split(test_size=0.2, train_size=0.8, seed=42)

dataset['train'][0]['instruction']

'Please create a Cyberpunk story for me'

In [36]:
dataset['train'][0]

{'instruction': 'Please create a Cyberpunk story for me',
 'output': "Sure! I will create a Cyberpunk story for you.\nTitle: Neon Eclipse\nIn the sprawling megacity of Neo-Tokyo, the sun dipped behind the skyscrapers, casting long shadows on the neon-lit streets. A lone figure, clad in a tattered trench coat, stepped out of the shadows. He was known as the Shadow Hacker, a legend in the dark corners of the city's digital underworld.\n\n    His mission was to infiltrate the heavily guarded mainframe of the ruling Corporate Conglomerate. The stakes were high; millions of lives hung in the balance.\n\n    He bypassed the security protocols with ease, his fingers dancing across the keyboard. The system beeped, alerting the authorities.\n\n    The Shadow Hacker smiled, knowing he had bought the city's resistance a few more hours. As the police swarmed the building, he vanished into the night, disappearing into the labyrinthine city.\n\n    The neon lights flickered overhead, casting eerie p

In [38]:
def tokenizeData(data):
    return tokenizer(data['instruction'], data['output'], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenizeData, batched=True)

Map: 100%|██████████| 800/800 [00:00<00:00, 948.49 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1079.51 examples/s]


In [39]:
tokenized_dataset = tokenized_dataset.remove_columns(['instruction', 'output'])

In [40]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [41]:
from transformers import Trainer, TrainingArguments

trainer_args = TrainingArguments('story_model', eval_strategy='steps', auto_find_batch_size=True)

trainer = Trainer(
    model=model,
    args=trainer_args,
    data_collator=datacollator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
)

In [43]:
trainer.train()

  0%|          | 0/300 [3:28:32<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 16.91 GB, other allocations: 1012.58 MB, max allowed: 18.13 GB). Tried to allocate 384.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [23]:
model = AutoModelForCausalLM.from_pretrained('./story_model/checkpoint-300')

In [27]:
from transformers import pipeline, set_seed

generator = pipeline('text-generation', model=model, tokenizer='./story_model/checkpoint-300', device='mps')
set_seed(42)
generator('Create a fantasy story', max_length=100, num_return_sequences=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'Create a fantasy story""""" Lighting"""" Ethan\'sotype""" Observer""" psyche"""twitch" 60"""""""""""" Industrial"""" Industrial"""" Triangle"""" Bob\'s concept""" nightmare""" Commission"""ote annex"" 560 293"""""""""""" 142essorsAnim""" orderly"""'}]