In [None]:
# !pip install datasets transformers accelerate

In [None]:
# Expose Google Drive inside the Colab filesystem; the dataset CSVs,
# checkpoints and logs used by later cells all live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

from datasets import load_dataset
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from transformers import Trainer, TrainingArguments

# Build the tokenizer from the GPT-Neo 1.3B checkpoint. The end-of-sequence
# token is reused as the padding token so that padded batches can be produced
# by the tokenization step.
pretrained_tokenizer_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_tokenizer_name)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples and attach causal-LM training labels.

    Args:
        examples: A batch mapping with a ``"text"`` entry holding the raw
            strings to tokenize (the function is applied with
            ``batched=True``).

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``, each
        padded/truncated to 256 tokens) with an added ``"labels"`` entry.
        Labels equal the input ids at real token positions and ``-100`` at
        padding positions, so padding does not contribute to the loss.
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
        return_attention_mask=True,
    )
    # Labels are an unshifted copy of the input ids; no shifting is done here
    # (the Hugging Face causal-LM head is documented to shift labels itself).
    #
    # Padding is identified through the attention mask rather than by
    # comparing against tokenizer.pad_token_id: the pad token is configured to
    # be the EOS token, so an id comparison would also mask every genuine
    # end-of-text token that appears in the data.
    tokenized_inputs["labels"] = [
        [token_id if is_real_token else -100 for token_id, is_real_token in zip(ids, mask)]
        for ids, mask in zip(tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"])
    ]
    return tokenized_inputs


In [None]:
# Read the three CSV splits from Google Drive; the result is keyed by split
# name ('train', 'validation', 'test').
split_csv_paths = {
    'train': '/content/drive/MyDrive/Milind-Project/train_context.csv',
    'validation': '/content/drive/MyDrive/Milind-Project/val_context.csv',
    'test': '/content/drive/MyDrive/Milind-Project/test_context.csv',
}
datasets = load_dataset('csv', data_files=split_csv_paths)


In [None]:
# Tokenize all datasets
tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Every path / model name used by this cell is defined exactly once here so
# the load, resume and save steps cannot drift apart.
BASE_MODEL_NAME = "EleutherAI/gpt-neo-1.3B"
OUTPUT_DIR = "/content/drive/MyDrive/Milind-Project/resultsv2"
LOGGING_DIR = '/content/drive/MyDrive/Milind-Project/logsv2'
RESUME_CHECKPOINT = "/content/drive/MyDrive/Milind-Project/resultsv2/checkpoint-50000"
FINAL_MODEL_DIR = "/content/drive/MyDrive/Milind-Project/Model-Neo2B"

# Load model.
# When the checkpoint directory exists, behave exactly as before: load the
# weights from it and resume training from it. On a fresh environment where
# the checkpoint is absent, fall back to the base model and train from the
# start instead of failing.
if os.path.isdir(RESUME_CHECKPOINT):
    model_source = RESUME_CHECKPOINT
    resume_from = RESUME_CHECKPOINT
else:
    print(f"Checkpoint not found at {RESUME_CHECKPOINT}; "
          f"starting from base model {BASE_MODEL_NAME}.")
    model_source = BASE_MODEL_NAME
    resume_from = None
model = GPTNeoForCausalLM.from_pretrained(model_source)

# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,           # The output directory (checkpoints are written here)
    num_train_epochs=2,              # Total number of training epochs
    per_device_train_batch_size=1,   # Batch size for training
    per_device_eval_batch_size=1,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir=LOGGING_DIR,         # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
    evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch
    save_steps=10000,                # Save a checkpoint every 10000 steps
    save_total_limit=3,              # Keep at most 3 checkpoints in output_dir
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)

# Train the model (resume_from is None when no checkpoint was found)
trainer.train(resume_from_checkpoint=resume_from)

# Persist the final model and its tokenizer side by side on Drive.
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

Map:   0%|          | 0/12262 [00:00<?, ? examples/s]

Map:   0%|          | 0/12262 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Epoch,Training Loss,Validation Loss
2,2.8712,


('/content/drive/MyDrive/Milind-Project/Model-Neo2B/tokenizer_config.json',
 '/content/drive/MyDrive/Milind-Project/Model-Neo2B/special_tokens_map.json',
 '/content/drive/MyDrive/Milind-Project/Model-Neo2B/vocab.json',
 '/content/drive/MyDrive/Milind-Project/Model-Neo2B/merges.txt',
 '/content/drive/MyDrive/Milind-Project/Model-Neo2B/added_tokens.json')

In [None]:
# Write a second copy of the trained model and tokenizer to a directory
# outside the Drive mount.
local_export_dir = "/content/Model-Full-Trained"
trainer.save_model(local_export_dir)
tokenizer.save_pretrained(local_export_dir)

('/content/Model-Full-Trained/tokenizer_config.json',
 '/content/Model-Full-Trained/special_tokens_map.json',
 '/content/Model-Full-Trained/vocab.json',
 '/content/Model-Full-Trained/merges.txt',
 '/content/Model-Full-Trained/added_tokens.json')