In [1]:
# %pip install openai
# %pip install --upgrade openai

## Read the Kaggle dataset

In [32]:
# for Colab only
import sys

from google.colab import drive

# mount Google Drive so the project folder and datasets are reachable
drive.mount('/content/drive')

# Make the project folder importable (presumably where SharedUtils lives - confirm).
# Guarded so that re-running this cell does not append the same path again.
project_dir = '/content/drive/MyDrive/ColabNotebooks/AAI590_Capstone'
if project_dir not in sys.path:
    sys.path.append(project_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
# for colab
# folder on the mounted Drive that holds the CSV files this notebook reads and writes
dataset_dir = '/content/drive/MyDrive/ColabNotebooks/AAI590_Capstone/Datasets'

In [34]:
import pandas as pd

# commented out for colab
# dataset_dir = 'datasets'

# load the cleaned podcast CSV from dataset_dir and preview the first rows
podcast_csv = f'{dataset_dir}/podcast_cleaned.csv'
df = pd.read_csv(podcast_csv)
df.head()

Unnamed: 0,clean_text,text_length,sentiment,word_count,unique_word_count,pos_tags,entities
0,part mit course artificial general intelligenc...,42635,0.165822,6070,1810,"[('part', 'NN'), ('mit', 'NN'), ('course', 'NN...","[('max', 'PERSON'), ('two', 'CARDINAL'), ('fir..."
1,part mit course artificial general intelligenc...,33308,0.112002,4565,1616,"[('part', 'NN'), ('mit', 'NN'), ('course', 'NN...","[('first', 'ORDINAL'), ('first', 'ORDINAL'), (..."
2,studied human mind cognition language vision e...,20779,0.092404,2823,1213,"[('studied', 'VBN'), ('human', 'JJ'), ('mind',...","[('plato', 'ORG'), ('power nietzsche', 'ORG'),..."
3,difference biological neural networks artifici...,20780,0.101862,2823,990,"[('difference', 'NN'), ('biological', 'JJ'), (...","[('first', 'ORDINAL'), ('one', 'CARDINAL'), ('..."
4,following conversation vladimir vapnik co inve...,21541,0.120489,2959,875,"[('following', 'VBG'), ('conversation', 'NN'),...","[('soviet union', 'GPE'), ('moscow', 'GPE'), (..."


In [4]:
# keep only the cleaned transcript column and drop rows where it is missing
# df = df[['text']]
df = df[['clean_text']].dropna()

In [5]:
# preview the first five rows after the column selection
df.head(n=5)

Unnamed: 0,clean_text
0,part mit course artificial general intelligenc...
1,part mit course artificial general intelligenc...
2,studied human mind cognition language vision e...
3,difference biological neural networks artifici...
4,following conversation vladimir vapnik co inve...


## Generate the target summaries by calling OpenAI

In [20]:
import os
from getpass import getpass

from openai import OpenAI

# Do not paste the secret key into the notebook: a saved or shared notebook would leak it.
# Read it from the OPENAI_API_KEY environment variable, and fall back to a hidden
# interactive prompt when the variable is unset or empty.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = OpenAI(
    api_key=api_key,
)

In [7]:
!pip install evaluate
!pip install rouge_score



In [8]:
from SharedUtils import trim_to_max_length

# run every cleaned transcript through trim_to_max_length and keep the result
# in its own column, leaving the full transcript untouched
# df['text_short'] = df['text'].apply(lambda x: trim_to_max_length(x))
df['text_short'] = df['clean_text'].apply(trim_to_max_length)
df.head()

Unnamed: 0,clean_text,text_short
0,part mit course artificial general intelligenc...,part mit course artificial general intelligenc...
1,part mit course artificial general intelligenc...,part mit course artificial general intelligenc...
2,studied human mind cognition language vision e...,studied human mind cognition language vision e...
3,difference biological neural networks artifici...,difference biological neural networks artifici...
4,following conversation vladimir vapnik co inve...,following conversation vladimir vapnik co inve...


In [9]:
# inspect the trimmed transcript stored under index label 126
df.loc[126, 'text_short']

'following conversation joe rogan recorded recent appearance podcast joe rogan experience joe inspiration thank millions people somebody puts love world genuinely curious wild ideas chimps psychedelics quantum mechanics artificial intelligence like many fan podcast decade somehow miraculously humbled able call friend enjoy thing subscribe youtube review five stars apple podcast follow spotify support patreon connect twitter lex friedman today sponsors neuro eight sleep dollar shave club olive garden home unlimited breadsticks brian redband favorite restaurant check first three sponsors description get discount support podcast usually full ad reads never ads middle time go straight conversation bit guitar first unlimited breadsticks dollar shave club ponder mortality afraid death think sometimes mean pop head sometimes fact mean everything goes great less years left everything goes great like car accidents injuries could happen today could last day could kind stoic thing meditate death 

In [22]:
prompt = "Write a short summary, limited to 20 words, for the following podcast transcript: "

def generate_summary(text, index, model="gpt-3.5-turbo"):
    """Ask the OpenAI chat API for a short summary of one podcast transcript.

    The request is a single user message made of the module-level ``prompt``
    followed by ``text`` wrapped in double quotes. The text is not truncated
    here, so callers must pass text that fits the model's context window.

    Args:
        text: Transcript text to summarize. Must be a ``str``.
        index: Row label of the transcript; only used in error messages.
        model: Name of the chat model to call. Defaults to "gpt-3.5-turbo".

    Returns:
        The summary with any "Summary:" label and surrounding whitespace
        removed, or ``None`` when the text is not a string, the API call
        fails, or the response carries no text content.
    """
    # Reject non-string input (e.g. NaN from a missing cell) up front: the
    # error report below calls len(text) and would itself raise on it.
    if not isinstance(text, str):
        print(f"Error generating summary for text: expected str, got {type(text).__name__}, Index {index}")
        return None

    try:
        prompt_to_send = prompt + "\"" + text + "\""
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt_to_send,
                }
            ],
            model=model,
        )
        response_summary = chat_completion.choices[0].message.content
        # the API may return a message without text content
        if response_summary is None:
            raise ValueError("the model returned no text content")
        # remove "Summary:" from the response, plus the whitespace it leaves behind
        return response_summary.replace("Summary:", "").strip()
    except Exception as e:
        # Best effort: one failed row must not abort a whole batch of API calls,
        # so report the failure and let the caller see None for this row.
        print(f"Error generating summary for text: {e}, Index {index}")
        print(f"Text Length: {len(text)}")
        return None

# generate a summary for the row labelled 126 as a quick sanity check
sample_index = 126
print(generate_summary(df['text_short'][sample_index], sample_index))

Inspirational chat with Joe Rogan on wild ideas, psychedelics, artificial intelligence, and pondering mortality. Subscribe and follow for more.


In [23]:
import time

start_time = time.time()

# Iterate over (index label, text) pairs so each row reports its own label.
# Looking the label up again by comparing every text against the whole column
# costs O(n^2) and returns the first match, i.e. the wrong label whenever two
# rows hold the same text.
df['summary'] = [
    generate_summary(text, idx) for idx, text in df['text_short'].items()
]

end_time = time.time()
# wall-clock seconds for this pass; reused by later cells
elapsed_time = end_time - start_time
print(f"Time taken to generate {df.shape[0]} summaries: {elapsed_time:.2f} seconds")

Time taken to generate 319 summaries: 173.10 seconds


In [24]:
# express the elapsed wall-clock time in minutes instead of seconds
SECONDS_PER_MINUTE = 60
elapsed_time_minutes = elapsed_time / SECONDS_PER_MINUTE
summary_count = df.shape[0]
print(f"Time taken to generate {summary_count} summaries: {elapsed_time_minutes:.2f} minutes")

Time taken to generate 319 summaries: 2.89 minutes


In [25]:
# do a second pass to generate a second round of summaries
# we can use these to gather metrics (rouge, bleu, etc) on 4o summaries against itself
# NOTE(review): the comment above says 4o, but generate_summary calls "gpt-3.5-turbo"
# by default - confirm which model these summaries are meant to come from.
# Iterate over (index label, text) pairs so each row reports its own label; the
# previous per-row lookup was O(n^2) and picked the wrong label for duplicate texts.
df['summary2'] = [
    generate_summary(text, idx) for idx, text in df['text_short'].items()
]

In [26]:
# preview the first five rows with both summary columns
df.head(n=5)

Unnamed: 0,clean_text,text_short,summary,summary2
0,part mit course artificial general intelligenc...,part mit course artificial general intelligenc...,"MIT physicist Max Tegmark discusses AGI, exist...","Max Tegmark, MIT professor, explores existenti..."
1,part mit course artificial general intelligenc...,part mit course artificial general intelligenc...,"Neurobiology expert discusses consciousness, i...",Seminal neurobiologist discusses consciousness...
2,studied human mind cognition language vision e...,studied human mind cognition language vision e...,"Podcast discusses human cognitive evolution, f...","Podcast explores human cognition, evolution, p..."
3,difference biological neural networks artifici...,difference biological neural networks artifici...,Exploring the deep mysteries and potential imp...,Differences between biological and artificial ...
4,following conversation vladimir vapnik co inve...,following conversation vladimir vapnik co inve...,Vladimir Vapnik discusses support vector machi...,"Conversation with Vladimir Vapnik, co-inventor..."


In [30]:
# write the frame, including both summary columns, to CSV without the index
# df.to_csv(f'{dataset_dir}/podcast_with_summary.csv', index=False)
summary_csv = f'{dataset_dir}/podcast_cleaned_with_summary.csv'
df.to_csv(summary_csv, index=False)

In [31]:
# also save a train and test dataset
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
train, test = train_test_split(df, random_state=42, test_size=0.2)

# train.to_csv(f'{dataset_dir}/podcast_with_summary_train.csv', index=False)
# test.to_csv(f'{dataset_dir}/podcast_with_summary_test.csv', index=False)
train_csv = f'{dataset_dir}/podcast_cleaned_with_summary_train.csv'
test_csv = f'{dataset_dir}/podcast_cleaned_with_summary_test.csv'
train.to_csv(train_csv, index=False)
test.to_csv(test_csv, index=False)

## Generate baseline metrics that can be compared against

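Calculate the ROUGE and BLEU scores for 4o summaries vs. 4o summaries: the first pass (`summary`) is the reference and the second pass (`summary2`) is the candidate.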

In [29]:
from SharedUtils import evaluate_and_save_metrics

# first-pass summaries act as the references, second-pass summaries as the candidates
reference_summaries = df['summary'].to_list()
generated_summaries = df['summary2'].to_list()

# NOTE(review): the labels below say gpt-4o, while generate_summary in this notebook
# calls "gpt-3.5-turbo" - confirm which model produced the saved summaries.
# NOTE(review): elapsed_time was measured around the first pass only - confirm that
# this is the duration evaluate_and_save_metrics is meant to record.
metrics = evaluate_and_save_metrics(
    "openai-gpt-4o",
    "baseline",
    "gpt_4o",
    reference_summaries,
    generated_summaries,
    elapsed_time,
)
rouge_results, bleu_results = metrics

for result in (rouge_results, bleu_results):
    print(result)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'rouge1': 0.4993096626853485, 'rouge2': 0.2440082378154983, 'rougeL': 0.42121686595882435, 'rougeLsum': 0.4208269907189529}
{'bleu': 0.1936020727084029, 'precisions': [0.5213316547768919, 0.24421669106881405, 0.134234372600215, 0.08220284237726098], 'brevity_penalty': 1.0, 'length_ratio': 1.0018217488789238, 'translation_length': 7149, 'reference_length': 7136}
