In [None]:
!pip install -U accelerate
!pip install -U transformers

In [None]:
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(f'Using Device: {device}')

Using Device: cuda


In [None]:
import os
import string
import re
from google.colab import files

# Folder whose (non-hidden) files are merged into the training corpus.
# NOTE(review): this path contains "The Story Generator" while the model and
# tokenizer paths in later cells use "Story Generator" — confirm which is right.
data_dir = "/content/drive/MyDrive/Colab Notebooks/The Story Generator/Dataset"
# Name of the merged corpus file; a relative path, so it is written to the
# current working directory.
output_file = "all_data.txt"

def is_hidden(filepath):
  """Return True when the final path component begins with a dot."""
  name = os.path.basename(filepath)
  return name[:1] == '.'

# Merge every visible regular file in data_dir into a single corpus file,
# re-flowing the text so that each ". "-terminated sentence ends a line.
# The output encoding is pinned to UTF-8 so it does not depend on the
# platform default.
with open(output_file, "w", encoding="utf-8") as outfile:
  # os.listdir returns entries in arbitrary order; sorting keeps the corpus
  # (and therefore the training data order) reproducible between runs.
  for filename in sorted(os.listdir(data_dir)):
    filepath = os.path.join(data_dir, filename)
    # Skip dotfiles and anything that is not a regular file (a sub-folder
    # would make open() raise).
    if is_hidden(filepath) or not os.path.isfile(filepath):
      continue
    # utf-8-sig transparently drops a leading BOM if the file has one.
    with open(filepath, "r", encoding='utf-8-sig') as infile:
      for line in infile:
        if not line.strip():
          continue
        # Replace the line break with a space. Using rstrip + " " (instead of
        # replace("\n", " ")) also adds the separator after a final line that
        # has no trailing newline, so it cannot fuse with the next file.
        clean = (line.rstrip("\n") + " ").replace('. ', '.\n')
        outfile.write(clean)

# Offer the merged corpus as a browser download (Colab helper).
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import GPT2Tokenizer,GPT2LMHeadModel,TrainingArguments,Trainer,DataCollatorWithPadding

from torch.utils.data import Dataset

# Load the starting checkpoint and its tokenizer from Google Drive.
model = GPT2LMHeadModel.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/Story Generator/gpt2default/model"
)
tokenizer = GPT2Tokenizer.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/Story Generator/gpt2default/tokenizer"
)

# When the tokenizer defines no padding token, reuse the end-of-sequence
# token (and its id) for padding.
if tokenizer.pad_token is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id
  tokenizer.pad_token = tokenizer.eos_token


In [None]:
class CustomDataset(Dataset):
  """Line-per-example language-modelling dataset read from a text file.

  Every non-blank line of ``file_path`` becomes one training example. Each
  example is tokenized on access, truncated/padded to ``block_size`` tokens,
  and returned together with a ``labels`` entry for causal-LM training.

  Args:
    tokenizer: Callable tokenizer; it is invoked with ``truncation``,
      ``max_length``, ``padding`` and ``return_tensors`` keyword arguments
      and must return a mapping with ``input_ids`` and ``attention_mask``.
    file_path: Path of the UTF-8 text file to read.
    block_size: Number of tokens every example is truncated/padded to.
  """

  def __init__(self, tokenizer, file_path, block_size):
    self.tokenizer = tokenizer
    # Previously accepted but ignored (the length was hard-coded to 128).
    self.block_size = block_size
    with open(file_path, "r", encoding="utf-8") as f:
      # Drop blank lines: they would tokenize to padding only and, once the
      # padding is masked out of the labels, contribute no training signal.
      self.text = [line for line in f.read().splitlines() if line.strip()]

  def __len__(self):
    """Return the number of non-blank lines (examples)."""
    return len(self.text)

  def __getitem__(self,idx):
    """Tokenize example ``idx`` and attach loss labels.

    Returns the tokenizer output with an added ``labels`` entry: a copy of
    ``input_ids`` in which every position whose ``attention_mask`` is 0 is
    set to -100.
    """
    tokenized_inputs = self.tokenizer(
        self.text[idx],
        truncation = True,
        max_length = self.block_size,
        padding = "max_length",
        return_tensors = "pt"
    )
    # NOTE(review): with return_tensors="pt" each field presumably keeps a
    # leading batch dimension of 1; left unchanged so existing callers see
    # the same shapes — confirm before squeezing.
    # Clone so that labels do not alias input_ids, then mask padding with
    # -100 (the index the loss ignores). Without this the model is trained to
    # predict padding, which here is the same id as the EOS token.
    labels = tokenized_inputs["input_ids"].clone()
    labels[tokenized_inputs["attention_mask"] == 0] = -100
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Build the training dataset from the merged corpus; the last argument is the
# dataset's block_size.
data = CustomDataset(tokenizer, "all_data.txt", 128)

# Collator that batches the per-example tokenizer outputs for the Trainer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# `evaluation_strategy` is deliberately not passed: the keyword was renamed to
# `eval_strategy` and is rejected by newer Transformers releases (which the
# `-U` install in the first cell pulls in). Its value here was "no", which is
# already the default, so omitting it works on old and new versions alike.
training_args = TrainingArguments(
    per_device_train_batch_size = 50,
    num_train_epochs = 4,
    learning_rate = 1e-4,
    output_dir = './trained',
    load_best_model_at_end = False,
    # Keep every field the dataset returns instead of letting the Trainer
    # drop columns.
    remove_unused_columns = False,
    push_to_hub = False,
    # Keep at most 5 checkpoints in output_dir.
    save_total_limit = 5,
)

In [None]:
# Assemble the Trainer from the model, arguments, dataset and collator
# prepared in the previous cells. No evaluation set is supplied.
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = data,
    eval_dataset = None,
)

# Start fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

Step,Training Loss
500,0.6898
1000,0.607
1500,0.5488
2000,0.4902
2500,0.4478
3000,0.4115
3500,0.3424
4000,0.3227
4500,0.3065
5000,0.2871


TrainOutput(global_step=12252, training_loss=0.3047974041930375, metrics={'train_runtime': 12547.4402, 'train_samples_per_second': 48.809, 'train_steps_per_second': 0.976, 'total_flos': 1.4219150593779302e+17, 'train_loss': 0.3047974041930375, 'epoch': 4.0})

In [None]:
# Write the fine-tuned weights to Google Drive.
model.save_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/Story Generator/finetuneddataeq/model"
)
# Write the tokenizer files next to them.
# NOTE(review): the tokenizer goes to a folder named "train", whereas the base
# tokenizer was loaded from a folder named "tokenizer" — confirm this is intended.
tokenizer.save_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/Story Generator/finetuneddataeq/train"
)

('/content/drive/MyDrive/Colab Notebooks/Story Generator/finetuneddataeq/train/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Story Generator/finetuneddataeq/train/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Story Generator/finetuneddataeq/train/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/Story Generator/finetuneddataeq/train/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/Story Generator/finetuneddataeq/train/added_tokens.json')

In [None]:
!zip -r trained.zip /content/trained

from google.colab import files
files.download('data.zip')
# files.download('log.zip')
files.download('trained.zip')



zip error: Nothing to do! (try: zip -r data.zip . -i /content/data)
  adding: content/trained/ (stored 0%)
  adding: content/trained/checkpoint-10500/ (stored 0%)
  adding: content/trained/checkpoint-10500/generation_config.json (deflated 24%)
  adding: content/trained/checkpoint-10500/training_args.bin (deflated 51%)
  adding: content/trained/checkpoint-10500/rng_state.pth (deflated 25%)
  adding: content/trained/checkpoint-10500/scheduler.pt (deflated 56%)
  adding: content/trained/checkpoint-10500/model.safetensors

In [None]:
import torch

# Re-select the compute device: CUDA when present, CPU otherwise.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(f'Using Device: {device}')

Using Device: cuda


In [None]:
# Switch the model to inference mode before generating.
model.eval()

prompt_text = "Nihara was a textile engineer in Sri Lanka and she loves chocolate"

# Tokenize the prompt once (it was previously tokenized twice, once per field)
# and move the tensors to whatever device the model currently lives on, so
# the cell also works when the model was not placed on the global `device`.
encoded = tokenizer(prompt_text, return_tensors = "pt").to(model.device)

# do_sample=True together with num_beams > 1 combines beam search with
# sampling; temperature and top_k shape the sampling distribution.
output = model.generate(
    input_ids = encoded.input_ids,
    attention_mask = encoded.attention_mask,
    pad_token_id = tokenizer.pad_token_id,
    max_length = 1000,
    num_beams = 2,
    min_length = 500,
    temperature = 2.5,
    top_k = 50,
    do_sample = True,
)


# Decode the first returned sequence, dropping special tokens.
generated_text = tokenizer.decode(output[0],skip_special_tokens = True)

print(generated_text)

Nihara was a textile engineer in Sri Lanka and she loves chocolate like a queen.” Nissen was a celebrated chef in Spain, known for his delicate creations that were never frozen and couldn’t develop a brown or red tinge once opened.” The three friends stared at each other, mouths open, but their faces still expressed genuine excitement to hear so inspiring a speaker deliver these words in their mother tongue.” And there they were, back on shore where their exploits had taken them and where they were truly wanted—UNLIMITED!” And there was no mistaking the sincerity of these three men’s sentiments! Indeed, the Nautilus hadn’t left the narrow, winding channel through which it had dived on the morning of July 2 while surveying its every move, an experience that they, and not the man, must have regretted at the time they were undertaking! And while these three gentlemen did indeed regret leaving their ocean companion stranded, it was undeniable that they wished no more than to resume their u

In [None]:
# Switch the model to inference mode before generating.
model.eval()

prompt_text = '''Nehara was so beautiful it defied explanation. I stood dumbstruck at her sheer presence and so'''

# Tokenize the prompt once (it was previously tokenized twice, once per field)
# and move the tensors to whatever device the model currently lives on, so
# the cell also works when the model was not placed on the global `device`.
encoded = tokenizer(prompt_text, return_tensors = "pt").to(model.device)

# Same settings as the previous generation cell except for a much wider beam
# (num_beams = 20).
output = model.generate(
    input_ids = encoded.input_ids,
    attention_mask = encoded.attention_mask,
    pad_token_id = tokenizer.pad_token_id,
    max_length = 1000,
    num_beams = 20,
    min_length = 500,
    temperature = 2.5,
    top_k = 50,
    do_sample = True,
)


# Decode the first returned sequence, dropping special tokens.
generated_text = tokenizer.decode(output[0],skip_special_tokens = True)

print(generated_text)

Nehara was so beautiful it defied explanation. I stood dumbstruck at her sheer presence and so fascinated by her beauty that I forgot my fear of heights and crept up a little to look down from the terrace I had occupied and marvel at the stillness and beauty of the stillness, and all around me the stars were going to and fro, going from left to right, and from right to left, and twinkling in the west and east, and then back again, and so on, all the way back to the house on the hill, and then I would marvel at the stillness and beauty of the place, until I was so dazzled that I forgot my fear of heights and crept down into the garden by the side of the river to look at the heather, and there I beheld the stillness and beauty of the place, until it was too late and I had forgotten my fear of heights, and stood there looking at it all day, and all the next morning, till it was too late, and I had forgotten my fear of trees and bushes, and crept back to the house on the hill, and marveled