# Generate Target Summaries

Use OpenAI GPT-4o to generate the target summaries our models will be trained on.

In [1]:
# %pip install openai
# %pip install --upgrade openai

## Read the Kaggle dataset

In [2]:
import pandas as pd

# Directory that holds the input CSV; later cells also write their outputs here.
dataset_dir = 'datasets'

# Load the podcast dataset and preview the first rows.
source_csv = f'{dataset_dir}/podcast.csv'
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,text,text_length,sentiment,word_count,unique_word_count,pos_tags,entities,clean_text
0,"As part of MIT course 6S099, Artificial Genera...",74068,0.171118,13424,2726,"[('As', 'IN'), ('part', 'NN'), ('of', 'IN'), (...","[('MIT', 'ORG'), ('Artificial General Intellig...",part mit course artificial general intelligenc...
1,As part of MIT course 6S099 on artificial gene...,57788,0.116574,10217,2474,"[('As', 'IN'), ('part', 'NN'), ('of', 'IN'), (...","[('MIT', 'ORG'), ('Christoph Koch', 'PERSON'),...",part mit course artificial general intelligenc...
2,"You've studied the human mind, cognition, lang...",33940,0.106534,5989,1762,"[('You', 'PRP'), (""'ve"", 'VBP'), ('studied', '...","[('Plato', 'ORG'), ('B.', 'NORP'), ('Nietzsche...",studied human mind cognition language vision e...
3,What difference between biological neural netw...,34228,0.106092,5993,1528,"[('What', 'WP'), ('difference', 'NN'), ('betwe...","[('First', 'ORDINAL'), ('One', 'CARDINAL'), ('...",difference biological neural networks artifici...
4,The following is a conversation with Vladimir ...,36171,0.125654,6374,1481,"[('The', 'DT'), ('following', 'NN'), ('is', 'V...","[('Vladimir Vapnik', 'PERSON'), ('VC', 'PERSON...",following conversation vladimir vapnik co inve...


In [3]:
# Keep only the transcript text and drop rows where it is missing.
# The explicit .copy() gives `df` its own data: later cells add new columns
# to this frame, and on pandas versions without copy-on-write, assigning to a
# frame derived by slicing can emit SettingWithCopyWarning.
# The original row labels are kept on purpose (no reset_index), because later
# cells look rows up by label.
df = df[['text']].dropna().copy()

## Generate the target summaries by calling OpenAI

In [None]:
import os

from openai import OpenAI

# Never store a literal key in the notebook: read it from the environment
# (e.g. `export OPENAI_API_KEY=...` before starting Jupyter).
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    # Fail fast here rather than letting every later request fail
    # authentication one by one.
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

client = OpenAI(
    api_key=api_key,
)

In [5]:
from SharedUtils import trim_to_max_length

# Derive a shortened copy of every transcript in a new `text_short` column.
# NOTE(review): trim_to_max_length is called with its defaults; the actual
# length limit lives in SharedUtils — confirm there.
df['text_short'] = df['text'].apply(trim_to_max_length)
df.head()

Unnamed: 0,text,text_short
0,"As part of MIT course 6S099, Artificial Genera...","As part of MIT course 6S099, Artificial Genera..."
1,As part of MIT course 6S099 on artificial gene...,As part of MIT course 6S099 on artificial gene...
2,"You've studied the human mind, cognition, lang...","You've studied the human mind, cognition, lang..."
3,What difference between biological neural netw...,What difference between biological neural netw...
4,The following is a conversation with Vladimir ...,The following is a conversation with Vladimir ...


In [6]:
# Inspect the shortened transcript stored under row label 126.
df.loc[126, 'text_short']

"The following is a conversation with Joe Rogan that we recorded after my recent appearance on his podcast, The Joe Rogan Experience. Joe has been a inspiration to me and I thank to millions of people for just being somebody who puts love out there in the world and being genuinely curious about wild ideas from chimps and psychedelics to quantum mechanics and artificial intelligence. Like many of you, I've been a fan of his podcast for over a decade and now, somehow, miraculously, am humbled to be able to call him a friend. If you enjoy this thing, subscribe on YouTube, review it with five stars on Apple Podcast, follow on Spotify, support on Patreon, or connect with me on Twitter at Lex Friedman. Today's sponsors are Neuro, Eight Sleep, Dollar Shave Club, and Olive Garden, home of the Unlimited Breadsticks and Brian Redband's favorite restaurant. Check out the first three of the sponsors in the description to get a discount and to support this podcast."

In [7]:
prompt = "Write a short summary, limited to 20 words, for the following podcast transcript: "

def generate_summary(text, index, model="gpt-4o"):
    """Ask the OpenAI chat API for a short summary of one transcript.

    The module-level ``prompt`` followed by ``text`` wrapped in double quotes
    is sent as a single user message through the module-level ``client``.
    No truncation happens here; callers pass text that is already short
    enough for the model.

    Args:
        text: Transcript text to summarise. Must be a string.
        index: Row label of ``text``; only used in the error message.
        model: Chat model name. Defaults to "gpt-4o", the model this
            notebook uses for its target summaries.

    Returns:
        The model's reply with every "Summary:" label removed and
        surrounding whitespace stripped, or None when the input is not a
        string, the request fails, or the model returns no text. Failures
        are printed rather than raised so a long batch run keeps going.
    """
    # Reject non-string input up front: the error path below reports
    # len(text), which would itself raise for values such as None or NaN.
    if not isinstance(text, str):
        print(f"Error generating summary for text: expected str, got {type(text).__name__}, Index {index}")
        return None

    try:
        prompt_to_send = prompt + "\"" + text + "\""
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt_to_send,
                }
            ],
            model=model,
        )
        response_summary = chat_completion.choices[0].message.content
        if response_summary is None:
            # Route an empty reply through the same reporting path as any
            # other failure.
            raise ValueError("model returned no text content")
        # Remove the "Summary:" label and the whitespace it leaves behind.
        return response_summary.replace("Summary:", "").strip()
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # continue with the remaining rows.
        print(f"Error generating summary for text: {e}, Index {index}")
        print(f"Text Length: {len(text)}")
        return None

# Smoke-test the summariser on the row with label 126 before running the full batch.
print(generate_summary(df.loc[126, 'text_short'], 126))

Lex Friedman discusses his admiration for Joe Rogan, promoting his podcast and sponsors, including Neuro, Eight Sleep, and Dollar Shave Club.


In [8]:
import time
start_time = time.time()

# Iterate over (row label, text) pairs so each call receives its own label
# directly. Looking the label up by comparing the text against the whole
# column rescans every row per call and reports the first matching label
# when two rows share the same text.
df['summary'] = [
    generate_summary(text, row_label)
    for row_label, text in df['text_short'].items()
]

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken to generate {df.shape[0]} summaries: {elapsed_time:.2f} seconds")

Time taken to generate 319 summaries: 321.52 seconds


In [9]:
# Report the same wall-clock measurement in minutes.
elapsed_time_minutes = elapsed_time / 60
summary_count = len(df)
print(f"Time taken to generate {summary_count} summaries: {elapsed_time_minutes:.2f} minutes")

Time taken to generate 319 summaries: 5.36 minutes


In [10]:
# Second, independent pass over the same texts. Comparing the two passes
# (ROUGE, BLEU, etc.) measures how much GPT-4o agrees with itself.
# Iterate over (row label, text) pairs so each call receives its own label
# directly, instead of rescanning the column for a matching text on every
# row (slow, and wrong for duplicate texts).
df['summary2'] = [
    generate_summary(text, row_label)
    for row_label, text in df['text_short'].items()
]

In [11]:
# Preview the first five rows, now including both summary columns.
df.head(n=5)

Unnamed: 0,text,text_short,summary,summary2
0,"As part of MIT course 6S099, Artificial Genera...","As part of MIT course 6S099, Artificial Genera...",Podcast features MIT's Max Tegmark discussing ...,MIT course features Max Tegmark discussing AI'...
1,As part of MIT course 6S099 on artificial gene...,As part of MIT course 6S099 on artificial gene...,Podcast features conversation with Christoph K...,Christoph Koch discusses his influential work ...
2,"You've studied the human mind, cognition, lang...","You've studied the human mind, cognition, lang...","The podcast explores the meaning of life, sugg...","The podcast explores the meaning of life, sugg..."
3,What difference between biological neural netw...,What difference between biological neural netw...,The podcast explores the mysterious biological...,The podcast explores the mysterious credit ass...
4,The following is a conversation with Vladimir ...,The following is a conversation with Vladimir ...,"Lex Friedman interviews Vladimir Vapnik, discu...","A conversation with Vladimir Vapnik on AI, lea..."


In [12]:
# Persist the full frame (texts plus both summary passes) without the index column.
summary_csv = f'{dataset_dir}/podcast_with_summary.csv'
df.to_csv(summary_csv, index=False)

In [13]:
# also save a train and test dataset
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the same rows land in each file on every run.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Write each split next to the full dataset, again without the index column.
for split_name, split_df in (('train', train), ('test', test)):
    split_df.to_csv(f'{dataset_dir}/podcast_with_summary_{split_name}.csv', index=False)

## Generate baseline metrics that can be compared against

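Calculate ROUGE and BLEU between the two independent GPT-4o passes, using the first pass (`summary`) as the reference and the second pass (`summary2`) as the candidate.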

In [None]:
from SharedUtils import evaluate_and_save_metrics

# The first pass is the reference and the second pass is the candidate.
reference_summaries = df['summary'].to_list()
generated_summaries = df['summary2'].to_list()

# elapsed_time is the wall-clock time of the first pass only; the second pass
# was not timed.
# NOTE(review): the three string arguments presumably label the saved
# results — confirm against SharedUtils.evaluate_and_save_metrics.
metrics = evaluate_and_save_metrics(
    "openai-gpt-4o",
    "baseline",
    "gpt_4o",
    reference_summaries,
    generated_summaries,
    elapsed_time,
)
rouge_results, bleu_results = metrics

print(rouge_results, bleu_results, sep="\n")


{'rouge1': 0.5523705292308793, 'rouge2': 0.29373887479712146, 'rougeL': 0.4728234984742418, 'rougeLsum': 0.4719001178605867}
{'bleu': 0.23449828267202746, 'precisions': [0.5632099129665142, 0.28622291021671825, 0.16984204526950009, 0.11044314668498798], 'brevity_penalty': 1.0, 'length_ratio': 1.0045939537640782, 'translation_length': 6779, 'reference_length': 6748}
