In [1]:
!pip install --upgrade accelerate
!pip install peft

Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.9.0


In [22]:
# we upgraded `accelerate` just because to import Trainer API
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from glob import glob
from peft import LoraConfig, get_peft_model
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv('/kaggle/input/faq-ucf/faqsUcfDataset_2.csv')
val = pd.read_csv('/kaggle/input/faq-ucf/faqsUcfDataset_val.csv')

In [4]:
def preprocess(df):
    df["text"] = df["Q"] + " " + df["A"]
    df = df.drop(['Q', 'A'], axis=1)
    return df

In [5]:
train = preprocess(train)
val = preprocess(val)

In [6]:
# TODO : choose model name
MODEL_NAME = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = "<pad>"

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=128, truncation=True, padding="max_length")

In [8]:
tokenized_train = train.apply(tokenize_function, axis=1)
tokenized_val = val.apply(tokenize_function, axis=1)

In [9]:
def copy_input_ids(example):
    example["labels"] = example["input_ids"].copy()
    return example

In [10]:
tokenized_train = tokenized_train.apply(copy_input_ids)
tokenized_val = tokenized_val.apply(copy_input_ids)

In [11]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [12]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # task_type, token classification (TaskType.CAUSAL_LM)
    inference_mode=False,
    r=8,                           # r, the dimension of the low-rank matrices
    lora_alpha=16,                 # lora_alpha, scaling factor for the weight matrices
    lora_dropout=0.3,              # lora_dropout, dropout probability of the LoRA layers
    fan_in_fan_out=True,
    bias="lora_only"               # bias, set to only lora layers to train
    
)

In [13]:
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.23643136409814364


In [14]:
training_args = TrainingArguments(
    "gpt2-on-ucf-faq",
    
    num_train_epochs=300,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    dataloader_num_workers=2,

    evaluation_strategy = "steps",
    logging_strategy="steps",
    save_strategy="steps",
    eval_steps=150,
    logging_steps=150,
    save_steps=150,

    learning_rate=1e-3,
    weight_decay=0.01,
    save_total_limit=10,
    report_to='none',

    load_best_model_at_end=True,
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

In [16]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [17]:
train_output = trainer.train()
print(train_output)

Step,Training Loss,Validation Loss
150,2.1411,1.593605
300,1.349,1.148883
450,1.0175,0.958466
600,0.8261,0.855972
750,0.7116,0.807817
900,0.658,0.793972


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=900, training_loss=1.1172228410508898, metrics={'train_runtime': 321.616, 'train_samples_per_second': 68.094, 'train_steps_per_second': 2.798, 'total_flos': 1435534059110400.0, 'train_loss': 1.1172228410508898, 'epoch': 300.0})


In [31]:
# TODO input prompt
prompt = "How long does it take for an electronic transcript to arrive at UCF?"
encoded_prompt = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
encoded_prompt = encoded_prompt.to(trainer.model.device)

# prediction
output_sequences = trainer.model.generate(
    input_ids=encoded_prompt,
    max_length=128,
    min_length=1,
    temperature=1.,
    top_p=1.,
    do_sample=True,
    num_return_sequences=1,
    pad_token_id=tokenizer.pad_token_id,
)

generated_sequences = []

# decode prediction
for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
    generated_sequence = generated_sequence.tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=False)
    generated_sequences.append(text.strip())


In [32]:
generated_sequences

['How long does it take for an electronic transcript to arrive at UCF? Electronic transcripts typically arrive at the university within six to eight business days after being sent from the institution of origin. Non-referrals to UCF for assistance happen at approximately five business days following receipt of the final document. Once served, documents begin arriving at the institution of origin within five to ten business days after being sent. Document processing time varies by time of year: Non-referrals to a Florida State University system typically 2-4 business hours prior to application completion stage (around application completion stage and the start of every semester).<|endoftext|>']

In [37]:
while True:
    prompt = input("User > ")
    if prompt:
        if prompt == "bye":
            break
        encoded_prompt = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
        encoded_prompt = encoded_prompt.to(trainer.model.device)

        # prediction
        output_sequences = trainer.model.generate(
            input_ids=encoded_prompt,
            max_length=128,
            min_length=1,
            temperature=1.,
            top_p=1.,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
        )

        generated_sequences = []

        # decode prediction
        for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
            generated_sequence = generated_sequence.tolist()
            text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
            generated_sequences.append(text.strip().split("?")[1])
        print("Assistant > ", generated_sequences[0])

User >  When should I apply to UCF?


Assistant >   UCF recommend freshman applicants with a West Coast college degree apply early (between May and September of your senior year). Freshmen should apply early in November (or whenever possible during the term of your senior year), as described in the guidelines below. In some instances, the admission date may be different than the date of the senior year. If you delay admission, you may still be referred to the Joseph T. Davis Institute for Higher Education for an official evaluation. This process is available on the UCF website and is described in the Instructions for Transfer Applicants under Advanced Admissions. In some cases,


User >  Does UCF have on-campus housing?


Assistant >   Yes, All-campus housing is available for admission to UCF under the UCF Living Room and Living Room Unit. Beginning in spring, the Living Room Unit will be closed for student housing only once in the semester following summer term. In the summer term, all students will be housed in the Living Room Unit. At a minimum, one living relative will need to be available to housing. For detailed information regarding this change, please visit www.ucf.edu/livingroom or use the livingroomunderground to locate roommates.


User >  What is the admission rate of UCF?


Assistant >   UCF accepts applications for full time, for full time employees and for those students who are assigned an academic post-secondary education benefit. Our program is a work-in-progress and requires extensive participation by all students, primarily from outside of high school, through to college or university education evaluation.


User >  bye
