In [1]:
# notebook controls
remove_old_directories = False
train_model = False
evaluate_model = True

In [2]:
import shutil

# remove all directories from training and testing
if (remove_old_directories):
    shutil.rmtree('./results/gpt', ignore_errors=True)
    shutil.rmtree('./tuned_model', ignore_errors=True)
    shutil.rmtree('./tuned_tokenizer', ignore_errors=True)
    shutil.rmtree('./tmp', ignore_errors=True)

In [3]:
# model_name = "facebook/opt-350m"
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "openai-community/gpt2"
model_name = "EleutherAI/gpt-neo-125m"


In [4]:
max_length = 512

def trim_to_last_punctuation(text):
    # attempt to trim any of the cut off sentences
    # reverse find the last punctuation
    last_punctuation = -1
    for p in ['.', '!', '?']:
        last_punctuation = text.rfind(p)
        if last_punctuation != -1:
            break
    if last_punctuation != -1:
        text = text[:last_punctuation + 1]
    return text

def hashtag_format(text, summary):
    prompt = f"### Question: Summarize the following which is surrounded by quotes \"{text}\"\n### Answer:"
    if (len(summary) > 0):
        prompt += f" {summary}"
    return prompt

def format_text(text, summary):
    limited_text = text[:max_length]
    limited_text = trim_to_last_punctuation(limited_text)
    return hashtag_format(limited_text, summary)

response_template = " ### Answer:"
    

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

if (train_model):
    dataset = load_dataset("csv", data_files="./datasets/podcast_with_summary_train.csv", split="train")

    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    max_length = 1024

    def formatting_prompts_func(example):
        output_texts = []
        for i in range(len(example['text'])):
            txt = example['text'][i]
            text = format_text(txt, example['summary'][i])
            output_texts.append(text)
        
        return output_texts

    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir="./tmp",
        evaluation_strategy="steps",
        eval_steps=500,
        logging_dir='./logs',
        logging_steps=100,
        save_steps=500,
        save_total_limit=2,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        learning_rate=5e-5,
        gradient_accumulation_steps=8
        )

    trainer = SFTTrainer(
        model,
        train_dataset=dataset,
        eval_dataset=dataset,
        args=training_args,
        formatting_func=formatting_prompts_func,
        data_collator=collator,
    )

    trainer.train()

    trainer.save_model("./tuned_model")
    tokenizer.save_pretrained("./tuned_tokenizer")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import pandas as pd

df = pd.read_csv("./datasets/podcast_with_summary_test.csv")
first_text = df['text'][0]
expected_summary = df['summary'][0]

prompt = format_text(first_text, "")

print(len(prompt))
print(prompt)
print(expected_summary)

576
### Question: Summarize the following which is surrounded by quotes "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI, Landing AI, and the AI Fund, and was the chief scientist at Baidu. As a Stanford professor and with Coursera and Deep Learning AI, he has helped educate and inspire millions of students, including me. This is the Artificial Intelligence Podcast."
### Answer:
.楽The AI Fund focuses on building new companies from scratch that leverage AI technology. The Fund also aims to address challenges such as bias in AI and the ethical use of technology. The Fund helps startups integrate machine learning into their business and scale their AI efforts for maximum impact.


In [7]:
def parse_summary_from_response(response):
    # find the start token
    response_template_trimmed = response_template.strip()
    start_token = response.find(response_template_trimmed)
    if start_token == -1:
        return ""
    # trim from end of the start token to the end of the response
    result = response[start_token + len(response_template_trimmed):]
    result = trim_to_last_punctuation(result)    
    return result

In [8]:
max_new_tokens = 48

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from transformers import pipeline

def create_generator(model_name, tokenizer_name):
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenizer.pad_token = tokenizer.eos_token
    return pipeline("text-generation", model=model, tokenizer=tokenizer, device="cuda", truncation=True)

def run_inference(generator, prompt, max_new_tokens, log_results=False):
    generated_text = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    # generated_text = generator(prompt, max_length=1024, num_return_sequences=1)
    if log_results:
        print(generated_text[0]["generated_text"])
        print("\n")
    return parse_summary_from_response(generated_text[0]["generated_text"])

In [10]:
generator_finetuned = create_generator("./tuned_model", "./tuned_tokenizer")
summary = run_inference(generator_finetuned, prompt, max_new_tokens, True)

print("Summary: " + summary)

### Question: Summarize the following which is surrounded by quotes "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI, Landing AI, and the AI Fund, and was the chief scientist at Baidu. As a Stanford professor and with Coursera and Deep Learning AI, he has helped educate and inspire millions of students, including me. This is the Artificial Intelligence Podcast."
### Answer: Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general, discusses the impact of artificial intelligence and its potential for improving education, innovation, and society. He discusses the


Summary:  Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space 

In [11]:
generator_original = create_generator(model_name, model_name)
summary = run_inference(generator_original, prompt, max_new_tokens, True)

print("Summary: " + summary)

### Question: Summarize the following which is surrounded by quotes "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI, Landing AI, and the AI Fund, and was the chief scientist at Baidu. As a Stanford professor and with Coursera and Deep Learning AI, he has helped educate and inspire millions of students, including me. This is the Artificial Intelligence Podcast."
### Answer: "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI


Summary:  "The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technolo

In [12]:
import os
os.makedirs("./results", exist_ok=True)
# remove dir and all subdirs
shutil.rmtree("./results/gpt", ignore_errors=True)
os.makedirs("./results/gpt", exist_ok=True)

In [13]:
import evaluate
import json

def evaluate_df(df, generator, generator_tuned, max_new_tokens, name):
    if (not evaluate_model):
        return

    os.makedirs(f"./results/gpt/{name}", exist_ok=True)

    summaries_orig = []
    summaries_tuned = []
    reference_summaries = []
    for i in range(len(df)):
        text = df['text'][i]
        expected_summary = df['summary'][i]
        prompt = format_text(text, "")
        summary = run_inference(generator, prompt, max_new_tokens)
        summaries_orig.append(summary)
        summary = run_inference(generator_tuned, prompt, max_new_tokens)
        summaries_tuned.append(summary)
        reference_summaries.append(expected_summary)

    rouge = evaluate.load('rouge')
    results = rouge.compute(predictions=summaries_orig, references=reference_summaries)
    print("Original Model")
    print(results)
    with open(f"./results/gpt/{name}/gpt_basemodel_rouge_results.json", "w") as f:
        json.dump(results, f)

    results = rouge.compute(predictions=summaries_tuned, references=reference_summaries)
    print("Tuned Model")
    print(results)
    with open(f"./results/gpt/{name}/gpt_tuned_rouge_results.json", "w") as f:
        json.dump(results, f)

    results_df = pd.DataFrame({
        'summary': reference_summaries,
        'summary_orig': summaries_orig,
        'summary_tuned': summaries_tuned
    })
    results_df.to_csv(f"./results/gpt/{name}/summaries.csv")

In [15]:
df = pd.read_csv("./datasets/podcast_with_summary_test.csv")
results_df = evaluate_df(df, generator_original, generator_finetuned, max_new_tokens, "test_dataset")

Original Model
{'rouge1': 0.0961155934905295, 'rouge2': 0.01812604849107987, 'rougeL': 0.07455373229382672, 'rougeLsum': 0.06043839586568904}
Tuned Model
{'rouge1': 0.07933848323030451, 'rouge2': 0.01512984165246287, 'rougeL': 0.05914637566199343, 'rougeLsum': 0.059186080745931305}


In [16]:
df = pd.read_csv("./datasets/podcast_with_summary_train.csv")
results_df = evaluate_df(df, generator_original, generator_finetuned, max_new_tokens, "train_dataset")

Original Model
{'rouge1': 0.09169151977706039, 'rouge2': 0.017574309230125557, 'rougeL': 0.07038032658676768, 'rougeLsum': 0.0593643230362514}
Tuned Model
{'rouge1': 0.08183894650298472, 'rouge2': 0.019099755191932608, 'rougeL': 0.061673981088244054, 'rougeLsum': 0.061566912565636206}
