<a href="https://colab.research.google.com/github/mafsi/slm/blob/master/train_notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Cleaning initial folders
---



In [1]:
# Remove the pre-existing sample_data folder. -f makes the cell safe to re-run:
# without it, rm prints an error once the folder has already been deleted.
!rm -rf /content/sample_data


Add data folder

In [2]:
# Create the folder that will hold the training files; -p means an already
# existing folder is not an error, so the cell can be re-run freely.
!mkdir -p /content/data/

Move the uploaded Markdown files into the data folder

In [3]:
# Move every Markdown file sitting directly in /content into /content/data.
# Done in Python instead of `!mv` so that:
#   * re-running the cell after the files were already moved is a silent no-op
#     (the shell glob would fail with "cannot stat '/content/*.md'"), and
#   * a missing upload is reported here, not later inside `load_dataset`.
import shutil
from pathlib import Path

content_dir = Path("/content")
data_dir = content_dir / "data"
data_dir.mkdir(parents=True, exist_ok=True)

for md_file in sorted(content_dir.glob("*.md")):
    # A shell `*.md` glob does not match dot-files; keep that behaviour.
    if md_file.name.startswith("."):
        continue
    # Pass the full target path so a same-named file is replaced, as `mv` would do.
    shutil.move(str(md_file), str(data_dir / md_file.name))

if not any(data_dir.glob("*.md")):
    raise FileNotFoundError(
        "No .md files found in /content or /content/data - "
        "upload the Markdown files before running this cell."
    )

Install requirements

In [None]:
# Install everything in one pip invocation so the dependency resolver sees all
# requirements together. %pip (rather than !pip) installs into the environment
# of the running kernel. The "torch" extra of transformers also installs
# `accelerate`, which the Trainer API requires with PyTorch.
%pip install -q "transformers[torch]" datasets torch gradio

Add the script that fine-tunes the model on the data

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# Base checkpoint: the weights to fine-tune and the matching tokenizer
model_name = "distilgpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Pad with the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Read the Markdown files in the "data" folder with the "text" loader
# and expose them as a single "train" split
dataset = load_dataset(path="text", data_files={"train": "data/*.md"})


def tokenize_function(example):
    """Tokenize a batch of text rows and attach causal-LM labels.

    Args:
        example: A batch passed by ``dataset.map(..., batched=True)``;
            ``example["text"]`` holds the raw strings.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        ``labels`` entry: a copy of ``input_ids`` in which every padding
        position is replaced by -100.
    """
    # Plain Python lists are enough here; `datasets` stores the result in Arrow
    # anyway, so asking the tokenizer for PyTorch tensors was a wasted round-trip.
    input_tokens = tokenizer(example["text"], padding="max_length",
                             truncation=True, max_length=128)
    # The pad token is the EOS token, so copying input_ids verbatim would train
    # the model to emit long runs of EOS. -100 is the label value the loss
    # ignores. The attention mask (not the token id) decides what is padding,
    # so a genuine EOS inside the text is still learned.
    input_tokens["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(input_tokens["input_ids"],
                             input_tokens["attention_mask"])
    ]
    return input_tokens


tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Rows with fewer than two real tokens (blank lines, single-token lines) have
# no next-token target once padding is masked; a batch made only of such rows
# would produce a NaN loss, so drop them.
tokenized_dataset = tokenized_dataset.filter(
    lambda row: sum(row["attention_mask"]) > 1
)
train_dataset = tokenized_dataset["train"]

# Local imports so this cell stays runnable on its own
import os

from transformers.trainer_utils import get_last_checkpoint

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=1_000,       # write a checkpoint every 1000 optimizer steps
    save_total_limit=2,     # keep only the two most recent checkpoints
    logging_steps=1000,
)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Resume from the newest "checkpoint-<step>" folder in the output directory,
# if there is one. Setting `resume_from_checkpoint` on TrainingArguments is not
# acted upon by Trainer; the path has to be handed to `train()` itself.
# To force a fresh run, delete ./output first.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

trainer.train(resume_from_checkpoint=last_checkpoint)

# Save the fine-tuned model
trainer.save_model("./fine_tuned_model")


Add the script for interacting with the fine-tuned model

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import gradio as gr

def load_model(model_path: str, model_name: str = "distilgpt2"):
    """Load the tokenizer and the fine-tuned weights.

    Args:
        model_path: Directory containing the saved fine-tuned model.
        model_name: Name of the checkpoint whose tokenizer is used.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        GPT2Tokenizer.from_pretrained(model_name),
        GPT2LMHeadModel.from_pretrained(model_path),
    )

def generate_text(input_text: str, max_length: int = 200):
    """Generate a sampled continuation of ``input_text``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        input_text: Prompt to continue. An empty prompt is replaced by the
            tokenizer's beginning-of-sequence token so generation still works.
        max_length: Maximum total length (prompt + generated) in tokens.
            Floats and ``None`` (as a UI number field may deliver) are accepted.

    Returns:
        The decoded text with special tokens stripped.
    """
    # A cleared number field arrives as None and a filled one may arrive as a
    # float; generate() needs an int.
    max_length = 200 if max_length is None else int(max_length)

    # Encoding "" yields an empty tensor that generate() cannot start from.
    prompt = input_text or tokenizer.bos_token
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        # Keep the 50-token floor, but never let it exceed the ceiling
        min_length=min(50, max_length),
        num_return_sequences=1,
        do_sample=True,  # Enable sampling to generate more diverse text
        temperature=0.8,  # Higher values = more random
        # Set explicitly to avoid the "Setting pad_token_id" warning
        # that generate() emits on every call otherwise
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Load the fine-tuned model
model_path = "./fine_tuned_model"
tokenizer, model = load_model(model_path)

# Create a Gradio interface.
# Components are taken from the top-level `gr` namespace: the older
# `gr.inputs.*` classes and their `default=` keyword are gone from current
# Gradio releases, which is what the unpinned install above pulls in.
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=5, label="Input Text"),
        # precision=0 makes the component hand an int to generate_text
        gr.Number(value=200, precision=0, label="Maximum Length"),
    ],
    outputs="text",
    title="GPT-2 Text Generation",
    description="Enter a prompt and let GPT-2 generate text for you!",
)

# Start the Gradio interface
iface.launch()
