In [1]:
# Mount Google Drive at /content/drive; the data, training file and model
# directory used by the later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel; the extras spec is quoted so the shell cannot glob "[torch]".
%pip install "transformers[torch]"

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [3]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from transformers import TextDataset, DataCollatorForLanguageModeling

from transformers import Trainer, TrainingArguments

In [31]:
# Read the English/French CSV from Drive and keep only its first 5000 rows.
df = pd.read_csv(
    '/content/drive/MyDrive/ML/final_project/LLM_prompt/ytx_llama_test.csv'
).head(5000)
df

Unnamed: 0,en,fr
0,procedures in order to properly use the acquis...,pour utiliser correctement la carte d achat le...
1,the cardholder must keep all related documents...,le detenteur de la carte doit conserver tous l...
2,advising the vendor when placing an order that...,avertir le vendeur lorsqu il passe une command...
3,in order to verify the monthly acquisition car...,pour verifier le releve mensuel de la carte d ...
4,recording and assigning a control number for e...,le detenteur de la carte doit enregistrer chaq...
...,...,...
4995,the military police complaints commission mpcc...,la commission d examen des plaintes concernant...
4996,as of may the mpcc had employees appointed und...,en mai la cppm comptait fonctionnaires nommes ...
4997,the report outlined weaknesses in the human re...,il soulignait des faiblesses chez le fournisse...
4998,top of page objectives and scope of the follow...,top of page objectifs et etendue de la verific...


In [32]:
# Write one training example per row of `df`, in the form
#   <startoftext>(english):{en}\n(french):{fr}<endoftext> \n\n
# The encoding is given explicitly so the accented French text does not depend
# on the platform's default encoding.
with open('/content/drive/MyDrive/ML/final_project/LLM_prompt/Articles.txt', 'w', encoding='utf-8') as f:
    # Walk the two columns in lockstep; iterrows() would build a Series per row.
    for en, fr in zip(df['en'], df['fr']):
        f.write(f'<startoftext>(english):{en}\n(french):{fr}<endoftext> \n\n')



In [33]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` from a text file.

    Args:
        file_path: Path of the text file, forwarded to ``TextDataset``.
        tokenizer: Tokenizer, forwarded to ``TextDataset``.
        block_size: Block size, forwarded to ``TextDataset`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(file_path=file_path, tokenizer=tokenizer, block_size=block_size)


def load_data_collator(tokenizer, mlm = False):
    """Build a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Tokenizer, forwarded to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(mlm=mlm, tokenizer=tokenizer)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a GPT-2 language model on a text file and save it to ``output_dir``.

    Args:
        train_file_path: Path of the training text file, passed to ``load_dataset``.
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            ``Trainer`` outputs.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint interval.

    Returns:
        None. The trained model is written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so that ``output_dir`` can later be
    # passed to ``from_pretrained`` for both of them.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    # Saves the not-yet-fine-tuned weights; trainer.save_model() below writes
    # to the same directory once training has finished.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never forwarded, so the caller's
        # checkpoint interval was silently ignored.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [36]:
# Training configuration -- set these before running the training cell.

# Base checkpoint to fine-tune.
model_name = 'gpt2'

# Input text file and the directory that receives the fine-tuned model.
train_file_path = '/content/drive/MyDrive/ML/final_project/LLM_prompt/Articles.txt'
output_dir = '/content/drive/MyDrive/ML/final_project/finetune_gpt'

# Values handed to train() under the same names.
per_device_train_batch_size = 8
num_train_epochs = 10.0
save_steps = 500
overwrite_output_dir = False

In [37]:
# Fine-tune using the parameters defined in the configuration cell.
# It takes about 30 minutes to train in Colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss


In [24]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [38]:
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` via ``from_pretrained(model_path)`` and return it."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` via ``from_pretrained(tokenizer_path)`` and return it."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path=None):
    """Sample a continuation of ``sequence`` from a saved model and print it.

    Args:
        sequence: Prompt text; it is converted with ``str()`` before tokenizing.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        model_path: Directory to load the model and tokenizer from. When
            ``None`` (the default) the notebook-level ``output_dir`` is used,
            as before.

    Returns:
        None. The decoded text is printed.
    """
    if model_path is None:
        # Backward-compatible fallback to the notebook-level setting.
        model_path = output_dir
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which is passed to generate() explicitly because the pad token is
    # set to the EOS token below.
    encoded = tokenizer(str(sequence), return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [42]:
# The prompt uses the same "<startoftext>(english):...\n(french):" layout as the
# lines written to the training file and stops right after "(french):", leaving
# the French part to be generated. max_len is passed to generate() as max_length.
max_len = 60
sequence = "<startoftext>(english):the oil price in france and england \n(french):" # oil price
generate_text(sequence, max_len) # oil price for July June which had been low at as low as was originally stated Prices have since resumed

<startoftext>(english):the oil price in france and england 
(french):citation article l un europ de la carte des services depose de l est de visiblue et de la carte dans le renseignement egalement ne
