In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer

# Configuration
# -- Model checkpoint, dataset and output location ---------------------------
MODEL_NAME = "gpt2"  # checkpoint handed to from_pretrained() before training
# MODEL_NAME = './decBERTa_instruction_model'  # alternative checkpoint path (disabled)
DATASET_NAME = "iamtarun/python_code_instructions_18k_alpaca"
OUTPUT_DIR = "./gpt2_instruction_model"  # training output; also read back for generation

# -- Tokenization --------------------------------------------------------------
MAX_LEN = 512  # token limit used for truncation/padding and as the generation default
# MAX_LEN = 148  # alternative limit (disabled)

# -- Optimization hyperparameters ----------------------------------------------
EPOCHS = 10
# NOTE(review): 256 sequences of up to MAX_LEN tokens per device may not fit in
# memory on common GPUs -- TODO confirm against the target hardware.
BATCH_SIZE = 256  # used for both the per-device train and eval batch size
# NOTE(review): 5e-3 is far above the 5e-5 default of TrainingArguments; confirm
# this value is intentional for fine-tuning a pretrained checkpoint.
LEARNING_RATE = 5e-3

# Load Dataset
def load_data(test_size=0.1, seed=42):
    """Load the instruction dataset and split its "train" split into train/test.

    The split is seeded so that restarting the kernel (or resuming training
    from a checkpoint saved by an earlier run) evaluates on the same held-out
    rows instead of a freshly shuffled partition.

    Args:
        test_size: Size of the held-out part, forwarded to ``train_test_split``
            (0.1 keeps the original 90/10 split).
        seed: Seed forwarded to ``train_test_split``. Pass ``None`` to get the
            previous unseeded behavior.

    Returns:
        The result of ``train_test_split``, indexable by ``"train"`` and ``"test"``.
    """
    dataset = load_dataset(DATASET_NAME)
    return dataset["train"].train_test_split(test_size=test_size, seed=seed)

def preprocess_data(examples, tokenizer, max_len=None):
    """Turn a batch of instruction examples into causal-LM training features.

    A causal language model is trained on a single sequence in which the
    completion follows the prompt, with ``labels`` aligned position-by-position
    to ``input_ids``. Each row is therefore built as::

        input_ids = prompt tokens + output tokens + EOS   (cut to max_len)
        labels    = -100 for every prompt token, then the output tokens + EOS

    and right-padded to ``max_len``. Prompt and padding positions carry the
    label -100 so they are excluded from the loss; padding positions also get
    ``attention_mask == 0``.

    Args:
        examples: Batch mapping with the list-valued keys ``"instruction"``,
            ``"input"`` and ``"output"``.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``;
            must expose ``eos_token_id`` and ``pad_token_id`` attributes.
        max_len: Fixed length of every returned row. Defaults to ``MAX_LEN``.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each a
        list holding one ``max_len``-long list of ints per example.
    """
    if max_len is None:
        max_len = MAX_LEN

    ignore_index = -100  # label value skipped by the cross-entropy loss
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Padding positions are masked out, so any valid id works as filler.
        pad_id = eos_id if eos_id is not None else 0

    features = {"input_ids": [], "attention_mask": [], "labels": []}

    # Prompt = instruction, optional input, and a newline separating it from the answer.
    prompts = [
        instruction + "\n" + (inp or "") + "\n"
        for instruction, inp in zip(examples["instruction"], examples["input"])
    ]
    if not prompts:
        return features
    completions = list(examples["output"])

    # Tokenize the two parts separately so the prompt/completion boundary is known.
    prompt_id_rows = tokenizer(prompts, add_special_tokens=False)["input_ids"]
    completion_id_rows = tokenizer(completions, add_special_tokens=False)["input_ids"]

    for prompt_ids, completion_ids in zip(prompt_id_rows, completion_id_rows):
        prompt_ids = list(prompt_ids)
        completion_ids = list(completion_ids)
        if eos_id is not None:
            # Teach the model where the answer ends.
            completion_ids.append(eos_id)

        # Over-long rows are cut on the right, so the end of the answer goes first.
        input_ids = (prompt_ids + completion_ids)[:max_len]
        labels = ([ignore_index] * len(prompt_ids) + completion_ids)[:max_len]

        # Right-pad so real tokens keep positions 0..n-1.
        n_pad = max_len - len(input_ids)
        features["input_ids"].append(input_ids + [pad_id] * n_pad)
        features["attention_mask"].append([1] * len(input_ids) + [0] * n_pad)
        features["labels"].append(labels + [ignore_index] * n_pad)

    return features

In [None]:
def main():
    """Fine-tune MODEL_NAME on the instruction dataset and save the result.

    Loads the tokenizer and model, makes sure a padding token exists,
    tokenizes the train/test splits, runs the Trainer, and finally writes the
    model and tokenizer to OUTPUT_DIR.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    # Fall back to the EOS token for padding when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenize both splits in batches; the tokenizer reaches the map function
    # through fn_kwargs.
    splits = load_data()
    tokenized = splits.map(
        preprocess_data,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
    )

    # Mixed precision is switched on only when a CUDA device is present.
    use_fp16 = torch.cuda.is_available()
    arg_values = {
        "output_dir": OUTPUT_DIR,
        "overwrite_output_dir": True,
        "num_train_epochs": EPOCHS,
        "per_device_train_batch_size": BATCH_SIZE,
        "per_device_eval_batch_size": BATCH_SIZE,
        "save_steps": 1000,
        "save_total_limit": 2,
        "evaluation_strategy": "epoch",
        "learning_rate": LEARNING_RATE,
        "logging_dir": "./logs",
        "logging_steps": 500,
        "fp16": use_fp16,
        "push_to_hub": False,
    }
    training_args = TrainingArguments(**arg_values)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        tokenizer=tokenizer,
    )
    trainer.train()

    # Persist the fine-tuned weights together with the tokenizer files.
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)


# Training starts as soon as this cell (or script) is executed.
if __name__ == "__main__":
    main()


In [None]:
def generate_code_instruction(prompt, max_length=MAX_LEN):
    """Generate text for ``prompt`` with the model stored in OUTPUT_DIR.

    The tokenizer and the model are loaded from OUTPUT_DIR on every call.

    Args:
        prompt: Text to condition generation on; tokenized with truncation
            at ``max_length``.
        max_length: Used both as the tokenizer truncation limit and as the
            ``max_length`` argument of ``generate``.

    Returns:
        The first generated sequence, decoded to a string with special tokens
        skipped.
    """
    tok = AutoTokenizer.from_pretrained(OUTPUT_DIR, padding_side="left")
    tok.padding_side = "left"  # set explicitly in addition to the kwarg above
    lm = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR, is_decoder=True)

    encoded = tok(prompt, return_tensors="pt", max_length=max_length, truncation=True)

    # NOTE(review): the same max_length bounds the prompt and the generation, so
    # a prompt truncated to max_length tokens presumably leaves no room for new
    # tokens -- confirm and consider a separate generation budget.
    generation_options = {
        "max_length": max_length,
        "num_return_sequences": 1,
        "pad_token_id": tok.eos_token_id,
    }
    sequences = lm.generate(**encoded, **generation_options)

    # Only the first returned sequence is decoded.
    return tok.decode(sequences[0], skip_special_tokens=True)

# Example usage: runs when the cell is executed. generate_code_instruction()
# loads the tokenizer and model from OUTPUT_DIR, so a model must already have
# been saved there (main() does this at the end of training).
prompt = "Write a Python function to calculate one random number."
generated_instruction = generate_code_instruction(prompt)
# The printed value is whatever generate_code_instruction() decodes from the
# first generated sequence.
print(generated_instruction)