# Fine-tune a smaller GPT model

Fine-tune a smaller GPT model to produce summaries like those produced by the gpt-4o model.

## Notebook controls

In [1]:
# Notebook controls: switches that let individual stages be skipped on a re-run.
# NOTE: with remove_old_directories=True and train_model=False, ./tuned_model is
# deleted and not rebuilt, so loading the tuned model later in the notebook fails.

# When True, the directories written by a previous run are deleted up front.
remove_old_directories = True
# When True, the base model is fine-tuned and the tuned model/tokenizer are saved.
train_model = True
# When True, evaluate_df scores both models; when False it returns immediately.
evaluate_model = True

## Remove artifacts from previous run

In [2]:
import shutil

# Wipe everything a previous training/testing run left on disk.
# ignore_errors=True makes a missing directory a no-op instead of an error.
if remove_old_directories:
    for stale_dir in ('./results/gpt', './tuned_model', './tuned_tokenizer', './tmp'):
        shutil.rmtree(stale_dir, ignore_errors=True)

## Select the base model

In [3]:
# Base checkpoint id passed to from_pretrained for fine-tuning; the same id is
# loaded again later in the notebook as the untuned baseline for comparison.
# Alternatives that were tried are kept commented out — enable exactly one.
# model_name = "facebook/opt-350m"
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "openai-community/gpt2"
model_name = "EleutherAI/gpt-neo-125m"


## Train/fine-tune the GPT model to produce summaries

In [4]:
from SharedUtils import trim_to_max_length

def hashtag_format(text, summary):
    """Build the "### Question / ### Answer" prompt for one document.

    Args:
        text: The document to summarize; it is embedded between double quotes.
        summary: The answer to append after the "### Answer:" marker. Pass an
            empty string to leave the answer open (inference-time prompt).

    Returns:
        The prompt string, ending either with "### Answer:" or with
        "### Answer: <summary>".
    """
    question_and_marker = f"### Question: Summarize the following which is surrounded by quotes \"{text}\"\n### Answer:"
    # Nothing to append: return the open-ended prompt as is.
    if len(summary) == 0:
        return question_and_marker
    return f"{question_and_marker} {summary}"

def format_text(text, summary):
    """Trim `text` with trim_to_max_length, then wrap it in the hashtag prompt.

    Args:
        text: Raw document text.
        summary: Summary to append after the answer marker, or "" for an
            open-ended prompt.

    Returns:
        The prompt string produced by hashtag_format for the trimmed text.
    """
    return hashtag_format(trim_to_max_length(text), summary)

# Marker that precedes the answer in every prompt built by hashtag_format;
# it must stay identical to the literal used there.
response_template = "### Answer:"
    

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

if train_model:
    # Training rows are read from CSV; the formatter below consumes the
    # 'text_short' and 'summary' columns.
    dataset = load_dataset("csv", data_files="./datasets/podcast_with_summary_train.csv", split="train")

    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    def formatting_prompts_func(example):
        """Turn a batch of rows into one prompt-plus-summary string per row."""
        return [
            format_text(txt, example['summary'][i])
            for i, txt in enumerate(example['text_short'])
        ]

    # Collator keyed on response_template — presumably limits the loss to the
    # tokens after the "### Answer:" marker; confirm against the TRL docs.
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

    training_args = TrainingArguments(
        # where checkpoints and logs go
        output_dir="./tmp",
        logging_dir='./logs',
        logging_steps=100,
        # evaluation / checkpoint cadence
        evaluation_strategy="steps",
        eval_steps=500,
        save_steps=500,
        save_total_limit=2,
        # batch sizing
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,
        # optimisation
        num_train_epochs=3,
        learning_rate=5e-5,
        weight_decay=0.01,
    )

    # NOTE(review): eval_dataset is the training set itself, so any evaluation
    # loss reported by the trainer is not a held-out measurement.
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
        formatting_func=formatting_prompts_func,
        data_collator=collator,
    )

    trainer.train()

    # Persist the tuned weights and the tokenizer for the inference cells.
    trainer.save_model("./tuned_model")
    tokenizer.save_pretrained("./tuned_tokenizer")

Generating train split: 255 examples [00:00, 1382.12 examples/s]
Map: 100%|██████████| 255/255 [00:00<00:00, 7083.00 examples/s]
Map: 100%|██████████| 255/255 [00:00<00:00, 7727.05 examples/s]
100%|██████████| 24/24 [00:09<00:00,  2.40it/s]


{'train_runtime': 9.9907, 'train_samples_per_second': 76.572, 'train_steps_per_second': 2.402, 'train_loss': 2.3021672566731772, 'epoch': 3.0}


## Test inference using the new fine-tuned model

In [6]:
import pandas as pd

df = pd.read_csv("./datasets/podcast_with_summary_test.csv")
# The first test row is the smoke-test example for the inference cells.
first_text = df.loc[0, 'text']
expected_summary = df.loc[0, 'summary']

# An empty summary leaves the prompt open right after the answer marker.
prompt = format_text(first_text, "")

# Prompt length, the prompt itself, then the reference summary, one per line.
print(len(prompt), prompt, expected_summary, sep="\n")

1065
### Question: Summarize the following which is surrounded by quotes "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI, Landing AI, and the AI Fund, and was the chief scientist at Baidu. As a Stanford professor and with Coursera and Deep Learning AI, he has helped educate and inspire millions of students, including me. This is the Artificial Intelligence Podcast. If you enjoy it, subscribe on YouTube, give it five stars on Apple Podcast, support it on Patreon, or simply connect with me on Twitter at Lex Friedman, spelled F R I D M A N. As usual, I'll do one or two minutes of ads now and never any ads in the middle that can break the flow of the conversation. I hope that works for you and doesn't hurt the listening experience. This show is presented by Cash App, the number one finance ap

In [7]:
from SharedUtils import trim_to_last_punctuation

def parse_summary_from_response(response):
    """Extract the text that follows the answer marker in a generated response.

    Args:
        response: Full generated text (prompt plus continuation).

    Returns:
        Everything after the first occurrence of the stripped
        response_template, passed through trim_to_last_punctuation; "" when
        the marker does not occur in `response`.
    """
    marker = response_template.strip()
    try:
        marker_at = response.index(marker)
    except ValueError:
        # The marker never appears, so there is no answer section to extract.
        return ""
    # Keep only what comes after the marker itself.
    answer = response[marker_at + len(marker):]
    return trim_to_last_punctuation(answer)

In [8]:
# Upper bound on the number of tokens generated per prompt; this value is
# handed to run_inference (and from there to the generation pipeline).
max_new_tokens = 48

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from transformers import pipeline
import time

def create_generator(model_name, tokenizer_name, device=None):
    """Build a text-generation pipeline from a model and a tokenizer.

    Args:
        model_name: Hub id or local directory passed to
            AutoModelForCausalLM.from_pretrained.
        tokenizer_name: Hub id or local directory passed to
            AutoTokenizer.from_pretrained.
        device: Device for the pipeline (e.g. "cuda", "cpu"). When None
            (default), "cuda" is used if a CUDA device is available and "cpu"
            otherwise, so the notebook also runs on machines without a GPU.

    Returns:
        A "text-generation" pipeline with truncation enabled.
    """
    if device is None:
        # Imported locally so the cell's import list does not need to change.
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    return pipeline("text-generation", model=model, tokenizer=tokenizer, device=device, truncation=True)

def run_inference(generator, prompt, max_new_tokens, log_results):
    """Generate one continuation for `prompt` and extract the summary from it.

    Args:
        generator: Callable text-generation pipeline; it is called with the
            prompt and must return a list whose first item has a
            "generated_text" entry.
        prompt: Prompt text to continue.
        max_new_tokens: Maximum number of tokens to generate.
        log_results: When truthy, print the full generated text.

    Returns:
        A `(summary, elapsed_seconds)` tuple: the summary parsed by
        parse_summary_from_response and the time spent in the generator call.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    outputs = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    elapsed_time = time.perf_counter() - started

    generated = outputs[0]["generated_text"]
    if log_results:
        print(generated)
        print("\n")
    return parse_summary_from_response(generated), elapsed_time

In [10]:
# Load the fine-tuned model and tokenizer saved by the training cell and run
# the example prompt through it, logging the raw generation.
generator_finetuned = create_generator(
    model_name="./tuned_model",
    tokenizer_name="./tuned_tokenizer",
)
summary, elapsed = run_inference(
    generator_finetuned,
    prompt,
    max_new_tokens,
    log_results=True,
)

print("Summary: " + summary)

### Question: Summarize the following which is surrounded by quotes "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI, Landing AI, and the AI Fund, and was the chief scientist at Baidu. As a Stanford professor and with Coursera and Deep Learning AI, he has helped educate and inspire millions of students, including me. This is the Artificial Intelligence Podcast. If you enjoy it, subscribe on YouTube, give it five stars on Apple Podcast, support it on Patreon, or simply connect with me on Twitter at Lex Friedman, spelled F R I D M A N. As usual, I'll do one or two minutes of ads now and never any ads in the middle that can break the flow of the conversation. I hope that works for you and doesn't hurt the listening experience. This show is presented by Cash App, the number one finance app in 

In [11]:
# Same example prompt through the untuned base checkpoint, for comparison.
generator_original = create_generator(
    model_name=model_name,
    tokenizer_name=model_name,
)
summary, elapsed = run_inference(
    generator_original,
    prompt,
    max_new_tokens,
    log_results=True,
)

print("Summary: " + summary)

### Question: Summarize the following which is surrounded by quotes "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI, Landing AI, and the AI Fund, and was the chief scientist at Baidu. As a Stanford professor and with Coursera and Deep Learning AI, he has helped educate and inspire millions of students, including me. This is the Artificial Intelligence Podcast. If you enjoy it, subscribe on YouTube, give it five stars on Apple Podcast, support it on Patreon, or simply connect with me on Twitter at Lex Friedman, spelled F R I D M A N. As usual, I'll do one or two minutes of ads now and never any ads in the middle that can break the flow of the conversation. I hope that works for you and doesn't hurt the listening experience. This show is presented by Cash App, the number one finance app in 

## Evaluate the results
Compare the fine-tuned model against the base model.

In [12]:
import os
# Start every evaluation from an empty ./results/gpt directory: make sure the
# parent exists, drop any previous contents, then recreate the directory.
gpt_results_dir = "./results/gpt"
os.makedirs("./results", exist_ok=True)
shutil.rmtree(gpt_results_dir, ignore_errors=True)
os.makedirs(gpt_results_dir, exist_ok=True)

In [None]:
from SharedUtils import evaluate_and_save_metrics

def _report_model_metrics(title, name, model_label, reference_summaries, summaries, total_time):
    """Score one model's summaries via evaluate_and_save_metrics and print the results."""
    print(title)
    rouge_results, bleu_results = evaluate_and_save_metrics(
        "gpt",
        name,
        model_label,
        reference_summaries,
        summaries,
        total_time
    )
    print(rouge_results)
    print(bleu_results)
    print(f"Total time (seconds): {total_time}")
    print(f"Total time (minutes): {total_time / 60}")


def evaluate_df(df, generator, generator_tuned, max_new_tokens, name):
    """Summarize every row with the base and the tuned model and score both.

    Args:
        df: DataFrame with 'text_short' (input text) and 'summary'
            (reference summary) columns.
        generator: Pipeline for the original (base) model.
        generator_tuned: Pipeline for the fine-tuned model.
        max_new_tokens: Maximum number of tokens generated per summary.
        name: Run name; used for the ./results/gpt/<name> directory and passed
            to evaluate_and_save_metrics.

    Returns:
        A DataFrame with columns 'summary', 'summary_orig' and 'summary_tuned'
        (also written to ./results/gpt/<name>/summaries.csv), or None when the
        notebook-level `evaluate_model` switch is off.
    """
    if not evaluate_model:
        return None

    os.makedirs(f"./results/gpt/{name}", exist_ok=True)

    total_time_orig = 0
    total_time_tuned = 0
    summaries_orig = []
    summaries_tuned = []
    reference_summaries = []

    # Iterate by position so a frame with a non-default index also works.
    for text, expected_summary in zip(df['text_short'], df['summary']):
        prompt = format_text(text, "")

        summary, orig_time = run_inference(generator, prompt, max_new_tokens, False)
        total_time_orig += orig_time
        summaries_orig.append(summary)

        summary, tuned_time = run_inference(generator_tuned, prompt, max_new_tokens, False)
        total_time_tuned += tuned_time
        summaries_tuned.append(summary)

        reference_summaries.append(expected_summary)

    _report_model_metrics(
        "Original Model", name, "gpt_basemodel",
        reference_summaries, summaries_orig, total_time_orig
    )
    _report_model_metrics(
        "Tuned Model", name, "gpt_tunedmodel",
        reference_summaries, summaries_tuned, total_time_tuned
    )

    results_df = pd.DataFrame({
        'summary': reference_summaries,
        'summary_orig': summaries_orig,
        'summary_tuned': summaries_tuned
    })
    results_df.to_csv(f"./results/gpt/{name}/summaries.csv")
    # Callers assign the result, so hand the frame back instead of returning None.
    return results_df

In [14]:
# Score both models on the test split.
df = pd.read_csv("./datasets/podcast_with_summary_test.csv")
results_df = evaluate_df(
    df,
    generator=generator_original,
    generator_tuned=generator_finetuned,
    max_new_tokens=max_new_tokens,
    name="test_dataset",
)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Original Model
{'rouge1': 0.12072427950463045, 'rouge2': 0.03475482812254653, 'rougeL': 0.10260111806969327, 'rougeLsum': 0.09871728854113355}
{'bleu': 0.022280129157186978, 'precisions': [0.14163903363334912, 0.027357107962872496, 0.011094301563287948, 0.005732152162584679], 'brevity_penalty': 1.0, 'length_ratio': 1.5706845238095237, 'translation_length': 2111, 'reference_length': 1344}
Total time (seconds): 22.27033281326294
Total time (minutes): 0.37117221355438235
Tuned Model
{'rouge1': 0.3413474058315226, 'rouge2': 0.1329295600864056, 'rougeL': 0.2926705947648975, 'rougeLsum': 0.2858684721017868}
{'bleu': 0.06701134410781397, 'precisions': [0.3012170385395537, 0.11268343815513626, 0.04067245119305857, 0.014606741573033709], 'brevity_penalty': 1.0, 'length_ratio': 1.4672619047619047, 'translation_length': 1972, 'reference_length': 1344}
Total time (seconds): 22.206040859222412
Total time (minutes): 0.3701006809870402


In [15]:
# Score both models on the rows the tuned model was trained on.
df = pd.read_csv("./datasets/podcast_with_summary_train.csv")
results_df = evaluate_df(
    df,
    generator=generator_original,
    generator_tuned=generator_finetuned,
    max_new_tokens=max_new_tokens,
    name="train_dataset",
)

Original Model
{'rouge1': 0.11395003781241367, 'rouge2': 0.03189152393209295, 'rougeL': 0.09362619108654371, 'rougeLsum': 0.09284479701316148}
{'bleu': 0.014962002400896252, 'precisions': [0.14165636588380717, 0.024250159540523293, 0.00712401055408971, 0.0020477815699658703], 'brevity_penalty': 1.0, 'length_ratio': 1.4970392301998519, 'translation_length': 8090, 'reference_length': 5404}
Total time (seconds): 86.42873072624207
Total time (minutes): 1.4404788454373678
Tuned Model
{'rouge1': 0.3451608840168716, 'rouge2': 0.14402487851848422, 'rougeL': 0.29473183017642296, 'rougeLsum': 0.29215741789876115}
{'bleu': 0.07406983945577654, 'precisions': [0.30342187126106723, 0.11403184005923732, 0.045871559633027525, 0.018964836033188465], 'brevity_penalty': 1.0, 'length_ratio': 1.5466321243523315, 'translation_length': 8358, 'reference_length': 5404}
Total time (seconds): 86.33567023277283
Total time (minutes): 1.4389278372128804


In [16]:
# Final evaluation over the complete dataset file.
# NOTE(review): this file appears to contain the training rows as well, so the
# scores below are not a purely held-out estimate — confirm.
df = pd.read_csv("./datasets/podcast_with_summary.csv")
results_df = evaluate_df(
    df,
    generator=generator_original,
    generator_tuned=generator_finetuned,
    max_new_tokens=max_new_tokens,
    name="whole_dataset",
)

Original Model
{'rouge1': 0.11545779292410834, 'rouge2': 0.03249250870475698, 'rougeL': 0.09485084723429923, 'rougeLsum': 0.09372360966943771}
{'bleu': 0.016755679752100527, 'precisions': [0.14165277913930008, 0.024893746205221615, 0.007947296873366098, 0.0028126352228472523], 'brevity_penalty': 1.0, 'length_ratio': 1.5117071724955542, 'translation_length': 10201, 'reference_length': 6748}
Total time (seconds): 106.51281809806824
Total time (minutes): 1.775213634967804
Tuned Model
{'rouge1': 0.344476235076636, 'rouge2': 0.14191126098656004, 'rougeL': 0.29394949113422897, 'rougeLsum': 0.29158740800529104}
{'bleu': 0.07278369651905087, 'precisions': [0.30300096805421106, 0.11377484766756568, 0.044882377218324394, 0.01813720260322202], 'brevity_penalty': 1.0, 'length_ratio': 1.5308239478363959, 'translation_length': 10330, 'reference_length': 6748}
Total time (seconds): 106.94402575492859
Total time (minutes): 1.7824004292488098
