In [None]:
!pip install accelerate --upgrade



In [None]:
!pip install transformers[torch]



In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import numpy as np
import torch


In [4]:
# Output directory for fine-tuning. The path lives under /content/drive, so
# Google Drive must already be mounted there before anything writes to it.
output_dir = '/content/drive/MyDrive/FoodBuddy'

def calculate_perplexity(logits):
    """Return the exponential of the mean of ``logits``.

    The function only computes ``exp(mean(values))``. The result is a
    perplexity when the values passed in are negative log-likelihoods
    (cross-entropy losses); for any other input it is just the exponentiated
    mean.

    Args:
        logits: Object exposing a ``.mean()`` method, e.g. a NumPy array.

    Returns:
        ``np.exp`` applied to ``logits.mean()``.
    """
    mean_value = logits.mean()
    exponentiated = np.exp(mean_value)
    return exponentiated

def calculate_burstiness(text):
    """Return the ratio of distinct words to total words in ``text``.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    comparison is case-sensitive and punctuation is not stripped. The value
    is the type-token ratio: 1.0 means every word is unique, values near 0
    mean heavy repetition.

    Args:
        text: The string to analyse.

    Returns:
        ``unique_words / total_words`` as a float, or ``0.0`` when ``text``
        contains no words (empty or whitespace-only), instead of raising
        ``ZeroDivisionError``.
    """
    words = text.split()
    if not words:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return len(set(words)) / len(words)

def fine_tune_gpt2(model_name, train_file, output_dir, eval_file=None):
    """Fine-tune a causal language model on a text file, report metrics, save it.

    Steps, in order:
      1. Load the tokenizer and model identified by ``model_name``.
      2. Train for 5 epochs on ``train_file`` split into 128-token blocks.
      3. Print the perplexity of the trained model on the evaluation data.
      4. Sample one sequence from the model and print its distinct-word
         ratio (via ``calculate_burstiness``).
      5. Save the model and tokenizer to ``output_dir``.

    Args:
        model_name: Name or path accepted by ``from_pretrained``.
        train_file: Path to the plain-text training corpus.
        output_dir: Directory for Trainer checkpoints and for the final
            model and tokenizer. Existing Trainer output there is overwritten.
        eval_file: Optional path to a plain-text evaluation corpus. When
            omitted, ``train_file`` is reused, so the printed perplexity
            measures fit to the training data rather than generalisation.

    Returns:
        None. Metrics are printed; the model is written to ``output_dir``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=128)

    # mlm=False selects the causal-LM objective (labels are built from the inputs).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=5,
        save_steps=10000)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Evaluate perplexity
    if eval_file is None or eval_file == train_file:
        # Same file: reuse the already-tokenised blocks instead of rebuilding them.
        eval_dataset = train_dataset
    else:
        eval_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=eval_file,
            block_size=128)

    eval_dataloader = trainer.get_eval_dataloader(eval_dataset)

    model.eval()
    # After training the model may sit on an accelerator while batches are
    # still on the CPU (this depends on the library version), so move each
    # batch explicitly; the move is a no-op when devices already match.
    device = next(model.parameters()).device
    batch_losses = []
    for batch in eval_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        # Perplexity is exp(mean negative log-likelihood). ``outputs.loss`` is
        # that mean cross-entropy for the batch; the raw logits are not.
        batch_losses.append(outputs.loss.item())

    if batch_losses:
        # exp of the mean per-batch loss. A smaller final batch gets the same
        # weight as a full one, so this is a close approximation of the
        # token-weighted perplexity rather than the exact value.
        avg_perplexity = calculate_perplexity(np.array(batch_losses))
    else:
        # The evaluation file produced no complete block; nothing to average.
        avg_perplexity = float("nan")
    print(f"Average Perplexity: {avg_perplexity}")

    # Generate text for burstiness calculation
    # No prompt is supplied, so generation relies on the model's configured
    # start token -- NOTE(review): confirm the checkpoint defines one.
    generated_text = model.generate(
        max_length=1000,  # Adjust length as needed
        temperature=0.7,  # Adjust temperature for diversity
        top_k=50,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1
    )[0]

    generated_text = tokenizer.decode(generated_text, skip_special_tokens=True)
    burstiness = calculate_burstiness(generated_text)
    print(f"Burstiness: {burstiness}")

    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)



In [5]:
# Plain-text training corpus stored on the mounted Drive.
text_file = '/content/drive/MyDrive/FoodBuddy/unsupervised_train.txt'

# Fine-tune the pretrained checkpoint on that corpus, writing results to output_dir.
fine_tune_gpt2(
    model_name="mbien/recipenlg",
    train_file=text_file,
    output_dir=output_dir,
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/665M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (168167 > 1024). Running this sequence through the model will result in indexing errors


ImportError: ignored

In [2]:
from pathlib import Path
import sys

from google.colab import drive
drive.mount('/content/drive')

base_folder = Path('/content/drive/MyDrive/')
data_folder = Path('/content')

!pip install pytorch-lightning==2.0.9 -qq
!pip install fastai -U -qq
!pip install wandb -U -qq

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m727.7/727.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.6/248.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h