### Reference

[How to fine-tune GPT-2 for beginners (Kaggle notebook)](https://www.kaggle.com/code/changyeop/how-to-fine-tune-gpt-2-for-beginners/notebook)

In [None]:
# Mount Google Drive at /content/drive so the cells below can read and write
# files stored there (this import only exists inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [None]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [None]:
# Load the cleaned parallel corpus from Drive; the bare `df` on the last line
# displays the frame (its `en` and `fr` columns are used below).
df = pd.read_csv(f'/content/drive/MyDrive/ML/final_project/Transformer_from_scratch/cleaned200k.csv')
df

Unnamed: 0,en,fr
0,changing lives changing society how it works t...,il a transforme notre vie il a transforme la s...
1,site map,plan du site
2,feedback,retroaction
3,credits,credits
4,francais,english
...,...,...
159064,june press conference in ottawa to release rep...,le juin conference de presse a ottawa pour le ...
159065,quality end of life care,des soins de fin de vie de qualite
159066,a progress report,rapport d etape
159067,june guest speaker at annual meeting children ...,le juin allocution a l assemblee generale annu...


### Reformat the parallel corpus into a prompt-style training text file

In [None]:
# Write one training example per sentence pair, using the layout
#   <startoftext>(english):{en}\n(french):{fr}<endoftext> \n\n
# The encoding is pinned to UTF-8 so the file content does not depend on the
# platform default, and the two columns are zipped directly instead of going
# through df.iterrows(), which builds a Series for every row.
with open('/content/drive/MyDrive/ML/final_project/LLM_prompt/Articles.txt', 'w', encoding='utf-8') as f:
  for en, fr in zip(df['en'], df['fr']):
    f.write(f'<startoftext>(english):{en}\n(french):{fr}<endoftext> \n\n')



In [None]:

def load_dataset(file_path, tokenizer, block_size = 128):
    """Wrap the text file at `file_path` in a `TextDataset`.

    The tokenizer, the path and the block size (128 unless overridden) are
    handed to the `TextDataset` constructor as keyword arguments, and the
    object it builds is returned unchanged.
    """
    dataset_kwargs = {
        'tokenizer': tokenizer,
        'file_path': file_path,
        'block_size': block_size,
    }
    return TextDataset(**dataset_kwargs)


def load_data_collator(tokenizer, mlm = False):
    """Build the `DataCollatorForLanguageModeling` used for training.

    `tokenizer` and the `mlm` flag (off by default) are forwarded as keyword
    arguments; the freshly constructed collator is returned.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  """Fine-tune a pretrained GPT-2 checkpoint on a plain-text training file.

  Args:
    train_file_path: path of the training text file, passed to `load_dataset`.
    model_name: name or path given to `from_pretrained` for both the
      tokenizer and the model.
    output_dir: directory that receives the tokenizer, the model and the
      Trainer output.
    overwrite_output_dir: forwarded to `TrainingArguments`.
    per_device_train_batch_size: forwarded to `TrainingArguments`.
    num_train_epochs: forwarded to `TrainingArguments`.
    save_steps: forwarded to `TrainingArguments`. The parameter used to be
      accepted but never passed on, so the caller's value had no effect.

  Returns:
    None. The results are the files written under `output_dir`.
  """
  # load pretrained tokenizer
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  # build the dataset and the collator from that tokenizer
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  # store the tokenizer in output_dir so the directory can later be used as a
  # tokenizer path as well as a model path
  tokenizer.save_pretrained(output_dir)

  # load the pretrained model and write its starting weights to output_dir
  model = GPT2LMHeadModel.from_pretrained(model_name)
  model.save_pretrained(output_dir)

  # specify training arguments
  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
          save_steps=save_steps,
      )
  # create a trainer
  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )
  # start training process
  trainer.train()
  # save the trained model
  trainer.save_model()

In [None]:
# Paths and hyperparameters handed to train() in the next cell.
train_file_path = '/content/drive/MyDrive/ML/final_project/LLM_prompt/Articles.txt'
model_name = 'gpt2'
# output_dir is also read by generate_translation() as the model/tokenizer path
output_dir = '/content/drive/MyDrive/ML/final_project/finetune_gpt'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 10.0
save_steps = 500

In [None]:

# Run fine-tuning with the paths and hyperparameters defined in the previous cell.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Step,Training Loss


In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """Return the `GPT2LMHeadModel` loaded from `model_path` via `from_pretrained`."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the `GPT2Tokenizer` loaded from `tokenizer_path` via `from_pretrained`."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

# (model, tokenizer) pairs already loaded for generation, keyed by model path.
# Loading from disk on every call made each generation pay the full load cost;
# clear this dict after re-training into the same directory to pick up new weights.
_GENERATION_ASSETS = {}


def _get_generation_assets(model_path):
    """Return the (model, tokenizer) pair for `model_path`, loading it only once."""
    if model_path not in _GENERATION_ASSETS:
        _GENERATION_ASSETS[model_path] = (
            load_model(model_path),
            load_tokenizer(model_path),
        )
    return _GENERATION_ASSETS[model_path]


def generate_translation(sequence, max_length):
    """Continue the prompt `sequence` with the fine-tuned model and return the text.

    The model and tokenizer are taken from the notebook-level `output_dir`
    and cached after the first call, so repeated calls do not reload them
    from disk.

    Args:
        sequence: prompt text; it is formatted into a string before encoding.
        max_length: passed as `max_length` to `model.generate` (in Hugging
            Face this limit covers the prompt tokens plus the generated ones).

    Returns:
        The decoded first output sequence, with special tokens skipped.
        Sampling is enabled, so results can differ between calls.
    """
    model, tokenizer = _get_generation_assets(output_dir)
    tokens = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        tokens,
        temperature = 0.2,
        do_sample=True,
        max_length=max_length,
        # the end-of-sequence id from the model config doubles as the pad id
        pad_token_id=model.config.eos_token_id,
        top_k=10,
        top_p=0.95,
    )
    return tokenizer.decode(final_outputs[0], skip_special_tokens=True)

In [None]:
# max_len is the value passed as max_length to generate_translation below.
# `sequence` is a sample prompt in the "(english):... (french):" layout of the
# training file; it stops right after "(french):" so the model continues from there.
# NOTE(review): this prompt has a space before "\n" that the training lines do not have.
max_len = 50
sequence = "<startoftext>(english):the oil price in france and england \n(french):" # oil price


In [None]:
# Time a single generate_translation call on the sample prompt (wall-clock seconds).
import time
start = time.time()
s = generate_translation(sequence, max_len)
end = time.time()
print('one sentence',end-start)

one sentence 6.372993469238281


In [None]:
# Display the raw generated string (prompt plus continuation).
s

'<startoftext>(english):the oil price in france and england \n(french):le releveau de la carte de l etre de la carte de l etre de la carte de l etre'

In [None]:
def translate_row(s):
  """Translate one English sentence and return only the generated French text.

  Args:
    s: either a bare English sentence, or a prompt that already contains the
      '(french):' marker (such a prompt is sent to the model unchanged).

  Returns:
    The text that follows the first '(french):' in the model output, cut at
    '<endoftext>' if present and stripped of surrounding whitespace.
  """
  text = str(s)
  # Training examples look like '<startoftext>(english):{en}\n(french):{fr}<endoftext>',
  # so a bare sentence has to be wrapped the same way before generation.
  if '(french):' in text:
    prompt = text
  else:
    prompt = f'<startoftext>(english):{text}\n(french):'
  raw_str = generate_translation(prompt, max_len)
  # Keep the first French segment: if the model runs on into another example,
  # the text after the *last* marker would belong to that made-up example.
  trans = raw_str.split('french):', 1)[-1]
  return trans.split('<endoftext>', 1)[0].strip()

In [None]:
# Quick check of translate_row on the previously generated string `s`.
translate_row(s)

'le releveau de la carte de l etre de la carte de l etre de la carte de l etre de'

In [None]:
# Evaluation set: the last 500 rows of the test CSV; the bare expression shows its shape.
test = pd.read_csv('/content/drive/MyDrive/ML/final_project/Transformer_from_scratch/byk_llama_test.csv').tail(500)
test.shape

(500, 2)

In [None]:
import datetime
from tqdm.notebook import tqdm
tqdm.pandas()  # enables Series.progress_apply with a progress bar


# Translate every English sentence of the evaluation set and time the whole run.
start = time.time()
test['trans'] = test['en'].progress_apply(translate_row)

end = time.time()
# Report the number of rows actually translated (`test`), not the size of the
# training frame `df`, which is what was printed before.
print('translating ', len(test) ,' lines take ', str(datetime.timedelta(seconds = end-start)))

  0%|          | 0/500 [00:00<?, ?it/s]



translating  159069  lines take  0:43:20.926341


In [None]:
# Display the evaluation rows together with the new `trans` column.
test

Unnamed: 0,en,fr,trans
48499,in the case of the appellant he found that for...,dans le cas de l appelante il a juge qu il ne ...,leur de la langue de l influence sur leur
48500,he did not proceed further to evaluate any oth...,il n a pas poursuivi l evaluation des autres c...,amendement par le casse de la casse de la cass...
48501,in assessing a mr derousie relied on certain c...,pour son evaluation du critere c m derousie s ...,le rating des mr de la carte etree les mr de l...
48502,he then reviewed the appellant s response and ...,il a ensuite examine la reponse de l appelante...,amendement par leur par leur de la recherche e...
48503,for example there were incomplete sentences in...,par exemple certaines phrases etaient incomple...,le recherche etree les autres autres de la rec...
...,...,...,...
48994,stock assessment of the european lobster homar...,l evaluation des stocks de homards d europe ho...,le livre etreeur de la recher
48995,which assumes that recruitment to the fishery ...,selon laquelle le recrutement pour la peche es...,le recherche et le recherche de la recherche d...
48996,yield curves show a clear maximum with a marke...,les courbes de la production montrent un net m...,le travail avec une scleroderienne de la scler...
48997,error file not found sorry but the file lvtsmp...,erreur fichier introuvable nous sommes desoles...,le temps dans le temps de la modele des modele...


In [None]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
def bleu(truth,pred):
    """Sentence-level BLEU of `pred` against the single reference `truth`.

    Both texts are split on whitespace before scoring. `sentence_bleu` works on
    token sequences, so a raw string would be iterated character by character
    and yield a character n-gram score instead of a word-level one.

    Args:
        truth: reference translation text.
        pred: generated translation text.

    Returns:
        The value returned by `sentence_bleu` for the tokenized pair.
    """
    reference_tokens = str(truth).split()
    hypothesis_tokens = str(pred).split()
    return sentence_bleu([reference_tokens], hypothesis_tokens) #, weights=(1, 0, 0, 0)
# Score every row: the `fr` column is the reference and the generated `trans`
# column is the hypothesis; the result is stored in a new `bleu` column.
test["bleu"] = test.apply(lambda x: bleu(x["fr"], x["trans"]), axis=1)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# Display the evaluation rows together with the per-row `bleu` column.
test

Unnamed: 0,en,fr,trans,bleu
48499,in the case of the appellant he found that for...,dans le cas de l appelante il a juge qu il ne ...,leur de la langue de l influence sur leur,5.924427e-02
48500,he did not proceed further to evaluate any oth...,il n a pas poursuivi l evaluation des autres c...,amendement par le casse de la casse de la cass...,1.244211e-01
48501,in assessing a mr derousie relied on certain c...,pour son evaluation du critere c m derousie s ...,le rating des mr de la carte etree les mr de l...,6.232779e-02
48502,he then reviewed the appellant s response and ...,il a ensuite examine la reponse de l appelante...,amendement par leur par leur de la recherche e...,5.459207e-02
48503,for example there were incomplete sentences in...,par exemple certaines phrases etaient incomple...,le recherche etree les autres autres de la rec...,1.220257e-01
...,...,...,...,...
48994,stock assessment of the european lobster homar...,l evaluation des stocks de homards d europe ho...,le livre etreeur de la recher,3.589985e-03
48995,which assumes that recruitment to the fishery ...,selon laquelle le recrutement pour la peche es...,le recherche et le recherche de la recherche d...,2.102298e-01
48996,yield curves show a clear maximum with a marke...,les courbes de la production montrent un net m...,le travail avec une scleroderienne de la scler...,1.765612e-01
48997,error file not found sorry but the file lvtsmp...,erreur fichier introuvable nous sommes desoles...,le temps dans le temps de la modele des modele...,1.464821e-01


In [None]:
# Save the evaluation rows with translations and scores to Drive (the index is not written).
test.to_csv(f'/content/drive/MyDrive/ML/final_project/finetune_gpt/finetune_gpt_trans.csv', index=False)

In [None]:
# Average of the per-row BLEU scores over the evaluation set.
test['bleu'].mean()

0.1405983786325191