In [None]:
!pip install -q transformers

In [None]:
from transformers import AutoModelForCausalLM, AdamW
import torch
from tqdm import tqdm  # Import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from IPython.display import FileLink
import pandas as pd
from transformers import set_seed

In [None]:
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id).to(torch_device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### ***Greedy***

In [None]:
# encode context the generation is conditioned on
model_inputs = tokenizer('I was taking a bath then', return_tensors='pt').to(torch_device)

# generate 40 new tokens
greedy_output = model.generate(**model_inputs, max_new_tokens=40)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I was taking a bath then I heard a noise. I looked up and saw a man in a white shirt and a black shirt. I looked up and saw a man in a white shirt and a black shirt. I looked up


### ***Top K***

In [None]:
model_inputs = tokenizer('I went to school', return_tensors='pt').to(torch_device)
sample_outputs = model.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
0: I went to school, and went back to work and stayed here to see if there were anything we could do. But my father said, 'Look, you can't. You're going to be living in the suburbs
1: I went to school," he says, laughing nervously.

He's had a little break in the last year, and as a young man who does not have any family on him, Mr Thompson decided to change course
2: I went to school, had a really good first college in town, got my degree, got up in the morning and then went out and bought myself a coffee," said his mom. "I'm kind of a coffee


### ***Training***

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
from transformers import GPT2Tokenizer
import pandas as pd

# # Load the dataset
dataset = pd.read_json("hf://datasets/MuskumPillerum/General-Knowledge/output.json")
dataset['text'] = dataset.apply(lambda row: f"Q: {row['Question']}  A: {row['Answer']}", axis=1)

# Save the preprocessed data to a text file
preprocessed_file_name = 'preprocessed_data.txt'
dataset['text'].to_csv(preprocessed_file_name, index=False, header=False)

print("Tokenized data has been saved.")


Tokenized data has been saved.


In [None]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset


def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)

  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )

  trainer.train()
  trainer.save_model()

In [None]:
# you need to set parameters
train_file_path = "/content/preprocessed_data.txt"
model_name = 'gpt2'
output_dir = '/content/drive/MyDrive/result'
overwrite_output_dir = False
per_device_train_batch_size = 128
num_train_epochs = 10
save_steps = 1000

In [None]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



### ***Prediction***

In [None]:
model_dir = '/content/drive/MyDrive/result'
# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_dir).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [None]:
# encode context the generation is conditioned on
model_inputs = tokenizer('Person A: Hi, how was your day?', return_tensors='pt').to(torch_device)

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model.generate(
    **model_inputs,
    max_new_tokens=150,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
0: Person A: Hi, how was your day?
Q: hi, how was your day? || A: i got really good at school.
Q: i got really good at school. || A: i'm not good at math.
Q: i'm not good at math. || A: well, your grades have gone up.
Q: well, your grades have gone up. || A: you should probably do something else. you're such a talented student.
Q: what did you do at school? || A: i went to chinese restaurant.
Q: i went to chinese restaurant. || A: did you like it?
Q: did you like it? || A: i loved it! how about yourself?
Q: i
1: Person A: Hi, how was your day?
Q: hi, how was your day? || A: my best was 100!
"Q: my best was 100! || A: that's amazing, thank you. i really wish i had never met you."
"Q: that's amazing, thank you. i really wish i had never met you. || A: you know me too, i wish i had. sometimes it's fun to meet new people."
Q: you know me too, i wish i had never met you. || A: of cou