In [2]:
# ✨✨ Imports & Setup ✨✨
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
import sentencepiece as spm
import os



# 🌸💫 Data Generation for Magical Math Stories 💫🌸
import random

def generate_math_stories(num_samples=100000, seed=None):
    """Generate templated arithmetic word problems as plain strings.

    Each story is built from one of three fixed templates (two subtraction
    problems, one addition problem), a random character name and two random
    integers in the range 1..20.

    Args:
        num_samples: Number of stories to generate.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the corpus is reproducible and the global ``random`` state
            is left untouched. When ``None`` the module-level generator is used.

    Returns:
        A list of ``num_samples`` story strings.
    """
    # Each entry is (template, is_subtraction). Subtraction stories need
    # number1 >= number2, otherwise the implied answer is negative
    # (e.g. "there were 3 apples ... ate 15 apples").
    templates = [
        ("Once upon a time, there were {number1} apples. {character} ate {number2} apples. How many are left?", True),
        ("In a forest, {number1} bunnies met with {number2} squirrels. How many animals are there in total?", False),
        ("{character} had {number1} candies. After sharing with friends, they had {number2} candies left. How many did they give away?", True),
    ]
    characters = ["Alice", "Bob", "Charlie", "Daisy"]

    # The module itself exposes choice/randint, so it can stand in for a Random instance.
    rng = random.Random(seed) if seed is not None else random

    stories = []
    for _ in range(num_samples):
        template, is_subtraction = rng.choice(templates)
        char = rng.choice(characters)
        num1, num2 = rng.randint(1, 20), rng.randint(1, 20)
        if is_subtraction and num2 > num1:
            # Keep the larger value as the starting quantity.
            num1, num2 = num2, num1
        stories.append(template.format(character=char, number1=num1, number2=num2))

    return stories

stories = generate_math_stories()
# 💖 Train SentencePiece Tokenizer 💖
# Dump the corpus to a scratch text file, one story per line, for the SentencePiece trainer to read.
with open("temp_stories.txt", "w") as corpus_file:
    corpus_file.writelines(f"{line}\n" for line in stories)



  from .autonotebook import tqdm as notebook_tqdm
2023-08-14 18:25:46.277511: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:

# Train SentencePiece model
# The flags are joined into the single space-separated command string passed to the trainer.
trainer_flags = {
    "input": "temp_stories.txt",
    "model_prefix": "math_stories",
    "vocab_size": 86,
}
spm.SentencePieceTrainer.Train(
    " ".join(f"--{flag}={value}" for flag, value in trainer_flags.items())
)


# Load trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.Load("math_stories.model")



sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=temp_stories.txt --model_prefix=math_stories --vocab_size=86
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: temp_stories.txt
  input_format: 
  model_prefix: math_stories
  model_type: UNIGRAM
  vocab_size: 86
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piec

True

In [4]:
# 🎀🌼 Dataset Class for our Math Tales 🌼🎀
class MathStoriesDataset(Dataset):
    """Map-style dataset that tokenizes stories into fixed-length id sequences.

    Args:
        stories: Sequence of story strings.
        sp: Tokenizer object exposing ``EncodeAsIds(text) -> list[int]``.
        max_length: Length every returned sequence is truncated/padded to.
        pad_id: Id used to fill the padded tail. Defaults to 0, which matches
            the previous behaviour.

    Each item is a dict with two int64 tensors of shape ``(max_length,)``:
    ``input_ids`` and ``attention_mask`` (1 for real tokens, 0 for padding).
    """

    def __init__(self, stories, sp, max_length=100, pad_id=0):  # max_length is a hyperparameter, adjust as needed
        self.stories = stories
        self.sp = sp
        self.max_length = max_length
        self.pad_id = pad_id

    def __len__(self):
        return len(self.stories)

    def __getitem__(self, index):
        # Use the tokenizer handed to this instance, not whatever global `sp` happens to exist.
        ids = list(self.sp.EncodeAsIds(self.stories[index]))
        # Truncate so every item has exactly max_length entries; ragged items break batch collation.
        ids = ids[: self.max_length]
        num_real = len(ids)
        num_pad = self.max_length - num_real
        input_ids = ids + [self.pad_id] * num_pad
        # The mask comes from the sequence length rather than from `id != 0`:
        # id 0 may be a real token (it is <unk> for a tokenizer trained without a pad piece).
        attention_mask = [1] * num_real + [0] * num_pad
        return {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask)}

# 🍰 Creating the dataset & dataloaders 🍰
dataset = MathStoriesDataset(stories, sp)
train_size = int(0.9 * len(dataset))  # 90/10 train/validation split
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 🌟 Configuring our Shiny GPT Model 🌟
# Only the gpt2-medium *configuration* is loaded here; the model is built from
# that config, not from pretrained weights.
configuration = GPT2Config.from_pretrained('gpt2-medium', output_hidden_states=False)
model = GPT2LMHeadModel(configuration)
# Resolve the device once and reuse it instead of repeating the expression.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# 🍵 Optimizer & Scheduler Setup 🍵
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are pinned to the old class's defaults (1e-6 and 0.0) because
# torch's own defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# NOTE(review): the schedule is sized for 3 epochs, while the training loop
# further down sets EPOCHS = 1, so the learning rate only decays to about 2/3
# of its initial value by the end of training. Confirm which is intended.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)  # 3 epochs

# 🌼 Training & Validation Functions 🌼
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Callable as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes the loss at index 0.
        data_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The average of the per-batch losses, or ``nan`` if the loader yielded
        no batches.
    """
    model = model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Padded positions must not contribute to the language-modelling loss;
        # -100 is the ignore index of the cross-entropy loss used for `labels`.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        # Clear gradients before the forward pass so leftovers from an
        # interrupted earlier step cannot leak into this update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1
    # Count batches explicitly so an empty loader does not raise ZeroDivisionError.
    return total_loss / num_batches if num_batches else float('nan')

def validate(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without gradients; return the mean batch loss.

    Args:
        model: Callable as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes the loss at index 0.
        data_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        The average of the per-batch losses, or ``nan`` if the loader yielded
        no batches.
    """
    model = model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Exclude padded positions from the loss; -100 is the ignore index
            # of the cross-entropy loss used for `labels`.
            labels = input_ids.masked_fill(attention_mask == 0, -100)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            total_loss += loss.item()
            num_batches += 1
    # Count batches explicitly so an empty loader does not raise ZeroDivisionError.
    return total_loss / num_batches if num_batches else float('nan')

# 🌸 Main Training Loop 🌸
EPOCHS = 1
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

for epoch in range(EPOCHS):
    # Epoch banner followed by a ten-dash rule.
    print(f'Epoch {epoch + 1}/{EPOCHS}', '-' * 10, sep='\n')

    # Report the training loss before validation starts.
    train_loss = train(model, train_loader, optimizer, scheduler, device)
    print(f'Training Loss: {train_loss:.4f}')

    val_loss = validate(model, val_loader, device)
    print(f'Validation Loss: {val_loss:.4f}')

# 🎀 Saving our trained model 🎀
# Only the weights (state dict) are written to disk.
torch.save(model.state_dict(), 'mini_gpt_math_stories.pth')

Downloading (…)lve/main/config.json: 100%|██████████| 718/718 [00:00<00:00, 4.48MB/s]


Epoch 1/1
----------


KeyboardInterrupt: 