In [None]:
!pip install transformers datasets accelerate torch

In [None]:
!pip install wandb -q

In [None]:
import wandb

# Do not embed the API key in the notebook: anything typed here ends up in
# version control and in shared copies. With no explicit key, wandb.login()
# picks the key up from the WANDB_API_KEY environment variable or previously
# stored credentials, and otherwise prompts for it interactively.
wandb.login()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# Identifier of the base causal language model that will be fine-tuned.
model_name = "EleutherAI/gpt-neo-125M"

# The tokenizer and the model weights are both resolved from the same identifier.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Optional (Colab only): mount Google Drive and save the model there.
# from google.colab import drive
# drive.mount('/content/drive')
# model.save_pretrained("/content/drive/MyDrive/fine_tuned_model")

# Training corpus; downloaded files are cached under ./data.
dataset = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    cache_dir="./data",
)


In [None]:
# from google.colab import userdata
# s=userdata.get('HF_TOKEN')
# print(s)

In [None]:
# dataset = load_dataset("json", data_files={"train": "train.json", "test": "test.json"})


In [None]:
# Configure the tokenizer once, up front, instead of re-assigning these
# attributes inside the mapped function on every batch.
# padding="max_length" requires a pad token, so the EOS token is reused for it.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right for training. The model uses absolute position embeddings,
# so left-padding every sample up to the full context length would push the
# real tokens to the highest positions, unlike what the model sees when it
# later generates from an unpadded prompt. (Left padding is only needed for
# batched generation.)
tokenizer.padding_side = "right"


def tokenize_function(examples, max_length=None):
    """Tokenize a batch of raw text rows into fixed-length model inputs.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; only the
            ``"text"`` column is read.
        max_length: Optional length to truncate/pad every sequence to. The
            default ``None`` defers to the tokenizer's own maximum length,
            which is what the notebook used before this parameter existed.
            Pass a smaller value via ``dataset.map(..., fn_kwargs={...})`` to
            cut padding overhead.

    Returns:
        The tokenizer output for the batch (includes ``input_ids`` and
        ``attention_mask``).
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Expose only the tensors the model consumes, as torch tensors.
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask"])


In [None]:
import torch
from transformers import TrainingArguments

# bf16 mixed precision is only usable on accelerators that support it;
# requesting it unconditionally makes TrainingArguments raise on other
# hardware. Check CUDA availability first so the capability query is only
# made when a GPU is actually present.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./fine_tuned_model_local",  # checkpoints are written here
    eval_strategy="epoch",                  # evaluate at the end of every epoch
    bf16=use_bf16,                          # mixed precision when the hardware supports it
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,                     # keep only the two most recent checkpoints
    save_steps=500,
)


In [None]:
# wandb.watch(model, log="all", log_freq=100, log_graph=False)
import warnings

warnings.filterwarnings("ignore") 

In [None]:
from transformers import Trainer, DataCollatorForLanguageModeling

# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Gather every Trainer input in one mapping so the wiring is visible at a glance.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_datasets["train"],
    "eval_dataset": tokenized_datasets["test"],
    "tokenizer": tokenizer,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

# Left as the cell's last expression so the notebook shows the value returned by train().
trainer.train()


In [None]:
# Run an evaluation pass with the trainer built above (it was given the
# "test" split as eval_dataset) and print the returned metrics.
results = trainer.evaluate()
print(results)


In [None]:
from transformers import pipeline
from transformers.trainer_utils import get_last_checkpoint

# Resolve the newest checkpoint-<step> directory instead of hard-coding a step
# number: the final step depends on dataset size, batch size, epoch count and
# the number of devices, so a fixed path breaks as soon as any of those change.
checkpoint_dir = get_last_checkpoint("./fine_tuned_model_local")
if checkpoint_dir is None:
    raise FileNotFoundError(
        "No checkpoint-* directory found in ./fine_tuned_model_local; run training first."
    )

generator = pipeline("text-generation", model=checkpoint_dir)
output = generator("Spring makes building web applications fast and hassle-free. By removing much", max_length=500)
print(output)
