# Fine-tune a smaller GPT model

Fine-tune a smaller GPT-style model so that it produces summaries similar to those of the GPT-4o model.

## Notebook controls

In [1]:
# notebook controls
# When True, the cleanup cell deletes ./results/gpt, ./tuned_model, ./tuned_tokenizer and ./tmp.
remove_old_directories = True
# When True, the training cell fine-tunes the base model and saves it to ./tuned_model.
train_model = True
# When False, evaluate_df returns immediately without running inference or writing results.
evaluate_model = True

## Remove artifacts from previous run

In [2]:
import shutil

# Start from a clean slate: delete every directory written by training and testing.
if remove_old_directories:
    for stale_dir in ('./results/gpt', './tuned_model', './tuned_tokenizer', './tmp'):
        # ignore_errors=True keeps a missing directory from raising
        shutil.rmtree(stale_dir, ignore_errors=True)

## Select the base model

In [3]:
# Base checkpoint passed to `from_pretrained`; the commented-out lines are alternative checkpoints.
# model_name = "facebook/opt-350m"
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "openai-community/gpt2"
model_name = "EleutherAI/gpt-neo-125m"


## Train/fine-tune the gpt model to produce summaries 

In [4]:
from SharedUtils import trim_to_max_length

def hashtag_format(text, summary):
    """Build a "### Question / ### Answer" prompt for `text`.

    Args:
        text: The passage to summarize; it is embedded between double quotes.
        summary: The expected summary. When it is a non-empty string it is
            appended after the answer marker (training example). When it is
            empty or None, the prompt ends at the marker (inference prompt).

    Returns:
        The formatted prompt string.
    """
    prompt = f"### Question: Summarize the following which is surrounded by quotes \"{text}\"\n### Answer:"
    # A missing summary (None) is treated like an empty one instead of
    # raising TypeError from len(None).
    if summary is not None and len(summary) > 0:
        prompt += f" {summary}"
    return prompt

def format_text(text, summary):
    """Shorten `text` with `trim_to_max_length`, then wrap it and `summary` via `hashtag_format`."""
    return hashtag_format(trim_to_max_length(text), summary)

# Marker that ends the prompt; `hashtag_format` emits this exact text right before the summary.
response_template = "### Answer:"
    

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

if train_model:
    # Training examples come from the train split of the podcast summary CSV.
    dataset = load_dataset(
        "csv",
        data_files="./datasets/podcast_with_summary_train.csv",
        split="train",
    )

    # Load the tokenizer and the base checkpoint; the EOS token doubles as the padding token.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name)

    def formatting_prompts_func(example):
        """Turn a batch of rows into one prompt-plus-summary string per row."""
        texts = example['text_short']
        summaries = example['summary']
        return [format_text(texts[i], summaries[i]) for i in range(len(texts))]

    # Collator keyed on the answer marker
    # (presumably limits the loss to the completion part; see the TRL docs).
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

    training_args = TrainingArguments(
        # checkpoints and logs
        output_dir="./tmp",
        logging_dir='./logs',
        logging_steps=100,
        save_steps=500,
        save_total_limit=2,
        # evaluation schedule
        evaluation_strategy="steps",
        eval_steps=500,
        # batching
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,
        # optimization
        num_train_epochs=3,
        learning_rate=5e-5,
        weight_decay=0.01,
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
        formatting_func=formatting_prompts_func,
        data_collator=collator,
    )

    trainer.train()

    # Persist the fine-tuned weights and the tokenizer for the inference cells.
    trainer.save_model("./tuned_model")
    tokenizer.save_pretrained("./tuned_tokenizer")

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 24/24 [00:11<00:00,  2.11it/s]


{'train_runtime': 11.3831, 'train_samples_per_second': 67.205, 'train_steps_per_second': 2.108, 'train_loss': 2.289925734202067, 'epoch': 3.0}


## Test inference using the new fine-tuned model

In [6]:
import pandas as pd

# Build an inference prompt (no summary appended) from the first row of the test set.
df = pd.read_csv("./datasets/podcast_with_summary_test.csv")
first_text = df.loc[0, 'text']
expected_summary = df.loc[0, 'summary']

prompt = format_text(first_text, "")

# Show the prompt length, the prompt itself and the reference summary, one per line.
print(len(prompt), prompt, expected_summary, sep="\n")

1065
### Question: Summarize the following which is surrounded by quotes "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI, Landing AI, and the AI Fund, and was the chief scientist at Baidu. As a Stanford professor and with Coursera and Deep Learning AI, he has helped educate and inspire millions of students, including me. This is the Artificial Intelligence Podcast. If you enjoy it, subscribe on YouTube, give it five stars on Apple Podcast, support it on Patreon, or simply connect with me on Twitter at Lex Friedman, spelled F R I D M A N. As usual, I'll do one or two minutes of ads now and never any ads in the middle that can break the flow of the conversation. I hope that works for you and doesn't hurt the listening experience. This show is presented by Cash App, the number one finance ap

In [7]:
from SharedUtils import trim_to_last_punctuation

def parse_summary_from_response(response):
    """Extract the text that follows the answer marker in a generated response.

    Returns "" when the marker (the stripped `response_template`) does not
    occur in `response`; otherwise everything after its first occurrence,
    passed through `trim_to_last_punctuation`.
    """
    marker = response_template.strip()
    marker_pos = response.find(marker)
    if marker_pos == -1:
        # the response contains no answer section
        return ""
    answer = response[marker_pos + len(marker):]
    return trim_to_last_punctuation(answer)

In [8]:
# Value forwarded as `max_new_tokens` to the generator on every `run_inference` call.
max_new_tokens = 48

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from transformers import pipeline
import time

def create_generator(model_name, tokenizer_name, device=None):
    """Create a text-generation pipeline for a causal language model.

    Args:
        model_name: Name or path passed to `AutoModelForCausalLM.from_pretrained`.
        tokenizer_name: Name or path passed to `AutoTokenizer.from_pretrained`.
        device: Device handed to `pipeline`. When None (the default), "cuda"
            is used if a CUDA device is available and "cpu" otherwise, so the
            notebook also runs on machines without a GPU.

    Returns:
        The pipeline object, created with `truncation=True`.
    """
    if device is None:
        # Local import keeps this cell's import list unchanged.
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Reuse the EOS token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    return pipeline("text-generation", model=model, tokenizer=tokenizer, device=device, truncation=True)

def run_inference(generator, prompt, max_new_tokens, log_results):
    """Generate a completion for `prompt` and extract the summary from it.

    Args:
        generator: Callable invoked as
            `generator(prompt, max_new_tokens=..., num_return_sequences=1)`;
            it must return a sequence whose first item has a "generated_text" key.
        prompt: The prompt string to complete.
        max_new_tokens: Forwarded unchanged to `generator`.
        log_results: When truthy, print the full generated text.

    Returns:
        A tuple `(summary, elapsed_seconds)`, where `summary` is the result of
        `parse_summary_from_response` on the generated text and
        `elapsed_seconds` covers only the generator call.
    """
    # perf_counter is monotonic, so system clock adjustments cannot skew the timing.
    start_time = time.perf_counter()
    outputs = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    elapsed_time = time.perf_counter() - start_time

    generated_text = outputs[0]["generated_text"]
    if log_results:
        print(generated_text)
        print("\n")
    return parse_summary_from_response(generated_text), elapsed_time

In [10]:
# Load the checkpoint and tokenizer from ./tuned_model and ./tuned_tokenizer, then summarize the sample prompt.
generator_finetuned = create_generator(model_name="./tuned_model", tokenizer_name="./tuned_tokenizer")
summary, elapsed = run_inference(
    generator=generator_finetuned,
    prompt=prompt,
    max_new_tokens=max_new_tokens,
    log_results=True,
)

print(f"Summary: {summary}")

### Question: Summarize the following which is surrounded by quotes "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI, Landing AI, and the AI Fund, and was the chief scientist at Baidu. As a Stanford professor and with Coursera and Deep Learning AI, he has helped educate and inspire millions of students, including me. This is the Artificial Intelligence Podcast. If you enjoy it, subscribe on YouTube, give it five stars on Apple Podcast, support it on Patreon, or simply connect with me on Twitter at Lex Friedman, spelled F R I D M A N. As usual, I'll do one or two minutes of ads now and never any ads in the middle that can break the flow of the conversation. I hope that works for you and doesn't hurt the listening experience. This show is presented by Cash App, the number one finance app in 

In [11]:
# Run the same prompt through the untouched base checkpoint for comparison.
generator_original = create_generator(model_name=model_name, tokenizer_name=model_name)
summary, elapsed = run_inference(
    generator=generator_original,
    prompt=prompt,
    max_new_tokens=max_new_tokens,
    log_results=True,
)

print(f"Summary: {summary}")

### Question: Summarize the following which is surrounded by quotes "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI, Landing AI, and the AI Fund, and was the chief scientist at Baidu. As a Stanford professor and with Coursera and Deep Learning AI, he has helped educate and inspire millions of students, including me. This is the Artificial Intelligence Podcast. If you enjoy it, subscribe on YouTube, give it five stars on Apple Podcast, support it on Patreon, or simply connect with me on Twitter at Lex Friedman, spelled F R I D M A N. As usual, I'll do one or two minutes of ads now and never any ads in the middle that can break the flow of the conversation. I hope that works for you and doesn't hurt the listening experience. This show is presented by Cash App, the number one finance app in 

## Evaluate the results
Compare the fine-tuned model against the base model.

In [12]:
import os
# Make sure the parent results directory exists.
os.makedirs("./results", exist_ok=True)
# remove dir and all subdirs
# NOTE(review): this runs unconditionally, independent of `remove_old_directories` and `evaluate_model`.
shutil.rmtree("./results/gpt", ignore_errors=True)
# Recreate an empty ./results/gpt directory.
os.makedirs("./results/gpt", exist_ok=True)

In [13]:
import evaluate
import json
import time

def _score_and_save(label, file_prefix, predictions, references, total_time, out_dir, rouge, bleu):
    """Print and persist ROUGE, BLEU and timing for one model's predictions.

    Args:
        label: Heading printed before the metrics.
        file_prefix: Prefix of the three files written into `out_dir`.
        predictions: Generated summaries, one per evaluated row.
        references: Expected summaries, aligned with `predictions`.
        total_time: Accumulated inference time in seconds.
        out_dir: Existing directory that receives the result files.
        rouge: Metric object returned by `evaluate.load('rouge')`.
        bleu: Metric object returned by `evaluate.load('bleu')`.
    """
    print(label)
    rouge_results = rouge.compute(predictions=predictions, references=references)
    print(rouge_results)

    results_bleu = bleu.compute(predictions=predictions, references=references)
    print(results_bleu)

    print(f"Total time (seconds): {total_time}")
    print(f"Total time (minutes): {total_time / 60}")

    with open(f"{out_dir}/{file_prefix}_rouge_results.json", "w") as f:
        json.dump(rouge_results, f)
    with open(f"{out_dir}/{file_prefix}_bleu_results.json", "w") as f:
        json.dump(results_bleu, f)
    with open(f"{out_dir}/{file_prefix}_time.txt", "w") as f:
        f.write(f"{total_time}\n")


def evaluate_df(df, generator, generator_tuned, max_new_tokens, name):
    """Compare the base and the fine-tuned generator on every row of `df`.

    For each row, a prompt is built from the 'text_short' column and run
    through both generators. ROUGE and BLEU are computed against the
    'summary' column, then printed and written, together with the total
    inference time, to ./results/gpt/{name}/. The per-row summaries are
    written to summaries.csv in the same directory.

    Args:
        df: DataFrame with 'text_short' and 'summary' columns.
        generator: Base-model generator passed to `run_inference`.
        generator_tuned: Fine-tuned generator passed to `run_inference`.
        max_new_tokens: Forwarded to `run_inference`.
        name: Sub-directory name under ./results/gpt for this evaluation.

    Returns:
        A DataFrame with the columns 'summary', 'summary_orig' and
        'summary_tuned', or None when the notebook-level `evaluate_model`
        flag is False.
    """
    if not evaluate_model:
        return None

    out_dir = f"./results/gpt/{name}"
    os.makedirs(out_dir, exist_ok=True)

    total_time_orig = 0
    total_time_tuned = 0
    summaries_orig = []
    summaries_tuned = []
    reference_summaries = []

    # Iterate over values rather than labels so a non-default index works too.
    for text, expected_summary in zip(df['text_short'], df['summary']):
        prompt = format_text(text, "")

        summary_orig, orig_time = run_inference(generator, prompt, max_new_tokens, False)
        total_time_orig += orig_time
        summaries_orig.append(summary_orig)

        summary_tuned, tuned_time = run_inference(generator_tuned, prompt, max_new_tokens, False)
        total_time_tuned += tuned_time
        summaries_tuned.append(summary_tuned)

        reference_summaries.append(expected_summary)

    # Load each metric once and reuse it for both models.
    rouge = evaluate.load('rouge')
    bleu = evaluate.load('bleu')

    _score_and_save("Original Model", "gpt_basemodel", summaries_orig,
                    reference_summaries, total_time_orig, out_dir, rouge, bleu)
    _score_and_save("Tuned Model", "gpt_tunedmodel", summaries_tuned,
                    reference_summaries, total_time_tuned, out_dir, rouge, bleu)

    results_df = pd.DataFrame({
        'summary': reference_summaries,
        'summary_orig': summaries_orig,
        'summary_tuned': summaries_tuned
    })
    results_df.to_csv(f"{out_dir}/summaries.csv")
    # Callers assign the result, so hand the frame back instead of returning None.
    return results_df

In [14]:
# Evaluate both models on the test dataset CSV.
df = pd.read_csv("./datasets/podcast_with_summary_test.csv")
results_df = evaluate_df(
    df=df,
    generator=generator_original,
    generator_tuned=generator_finetuned,
    max_new_tokens=max_new_tokens,
    name="test_dataset",
)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Original Model
{'rouge1': 0.11662762065176122, 'rouge2': 0.031786847679097965, 'rougeL': 0.10042312952274254, 'rougeLsum': 0.09863156356162879}


Downloading builder script: 100%|██████████| 5.94k/5.94k [00:00<?, ?B/s]
Downloading extra modules: 4.07kB [00:00, 8.16MB/s]                   
Downloading extra modules: 100%|██████████| 3.34k/3.34k [00:00<?, ?B/s]


{'bleu': 0.0163049426861892, 'precisions': [0.14116532449076266, 0.024425989252564728, 0.006555723651033787, 0.003126628452318916], 'brevity_penalty': 1.0, 'length_ratio': 1.5625462620281274, 'translation_length': 2111, 'reference_length': 1351}
Total time (seconds): 21.7005558013916
Total time (minutes): 0.36167593002319337
Tuned Model
{'rouge1': 0.32541007175008846, 'rouge2': 0.12926667321440155, 'rougeL': 0.2824104823908641, 'rougeLsum': 0.2802352911554624}
{'bleu': 0.06376209772052989, 'precisions': [0.2912621359223301, 0.10020040080160321, 0.03778467908902691, 0.014989293361884369], 'brevity_penalty': 1.0, 'length_ratio': 1.5247964470762398, 'translation_length': 2060, 'reference_length': 1351}
Total time (seconds): 21.71223020553589
Total time (minutes): 0.3618705034255981


In [15]:
# Evaluate both models on the training dataset CSV.
df = pd.read_csv("./datasets/podcast_with_summary_train.csv")
results_df = evaluate_df(
    df=df,
    generator=generator_original,
    generator_tuned=generator_finetuned,
    max_new_tokens=max_new_tokens,
    name="train_dataset",
)

Original Model
{'rouge1': 0.11406947166757568, 'rouge2': 0.03524670853304594, 'rougeL': 0.09467139385076542, 'rougeLsum': 0.09525293320533855}
{'bleu': 0.018025981532378, 'precisions': [0.14202719406674907, 0.026802807913209957, 0.009234828496042216, 0.0030034129692832765], 'brevity_penalty': 1.0, 'length_ratio': 1.4803293687099726, 'translation_length': 8090, 'reference_length': 5465}
Total time (seconds): 83.93328380584717
Total time (minutes): 1.398888063430786
Tuned Model
{'rouge1': 0.3406716434427197, 'rouge2': 0.1517999227470468, 'rougeL': 0.29215083541823106, 'rougeLsum': 0.2917579534757738}
{'bleu': 0.08059614010230476, 'precisions': [0.3016712757003727, 0.12006946167204167, 0.05085179966696554, 0.02290783898305085], 'brevity_penalty': 1.0, 'length_ratio': 1.5218664226898444, 'translation_length': 8317, 'reference_length': 5465}
Total time (seconds): 83.88731384277344
Total time (minutes): 1.3981218973795573


In [16]:
# Final evaluation: score both models on the entire dataset.
df = pd.read_csv("./datasets/podcast_with_summary.csv")
results_df = evaluate_df(
    df=df,
    generator=generator_original,
    generator_tuned=generator_finetuned,
    max_new_tokens=max_new_tokens,
    name="whole_dataset",
)

Original Model
{'rouge1': 0.11476014096201276, 'rouge2': 0.03470648299934744, 'rougeL': 0.09530533245046108, 'rougeLsum': 0.0954583078943993}
{'bleu': 0.017698402932009097, 'precisions': [0.14184883834918147, 0.026310463468933416, 0.008679284743281397, 0.003028991778450887], 'brevity_penalty': 1.0, 'length_ratio': 1.49662558685446, 'translation_length': 10201, 'reference_length': 6816}
Total time (seconds): 106.34720301628113
Total time (minutes): 1.7724533836046854
Tuned Model
{'rouge1': 0.3381712395919966, 'rouge2': 0.1465574398520959, 'rougeL': 0.2902232533451762, 'rougeLsum': 0.28947029045098505}
{'bleu': 0.07736636474037178, 'precisions': [0.29960489544184254, 0.11612646649433286, 0.04825957490502105, 0.021337579617834394], 'brevity_penalty': 1.0, 'length_ratio': 1.5224471830985915, 'translation_length': 10377, 'reference_length': 6816}
Total time (seconds): 106.43535470962524
Total time (minutes): 1.773922578493754
