<a href="https://colab.research.google.com/github/mshumer/gpt-podcast/blob/main/gpt_podcast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# gpt-podcast
By Matt Shumer (https://twitter.com/mattshumer_)

Github repo: https://github.com/mshumer/gpt-podcast

Generate a fictional podcast in minutes with AI.

To generate a podcast:
1.  In the second cell, add in your OpenAI and ElevenLabs keys (make sure you have a paid ElevenLabs account).
2. Fill in the podcast participants, topic, and number of turns in the third cell.
3. Replace `speaker_one_clip`, `speaker_one_voice_description`, `speaker_two_clip`, `speaker_two_voice_description` with relevant YouTube videos and voice descriptions.
4. Run all the cells! After some time, your podcast file (`podcast.mp3`) should appear in the filesystem.

This is not for commercial use -- purely for research and fun!

In [None]:
# Pin the major versions this notebook is written against:
#   - `openai.ChatCompletion` (used below) was removed in openai 1.0.
#   - `moviepy.editor` and `clip.subclip` (used below) were removed/renamed in moviepy 2.0.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install "openai<1" elevenlabs pydub pytube "moviepy<2"

In the next cell, add your OpenAI and ElevenLabs keys. You need a paid ElevenLabs account for this to work!

In [None]:
import os
import time

import openai
import requests
from IPython.display import Audio
from pydub import AudioSegment
# Prefer keys supplied through environment variables so real credentials never
# have to be saved inside the notebook. If a variable is not set, the original
# placeholder is used, so pasting a key in place of the placeholder still works.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR OPENAI KEY HERE")
eleven_labs_api_key = os.environ.get("ELEVENLABS_API_KEY", "YOUR ELEVENLABS KEY HERE") # make sure you have a paid account!

# Enter the names of the podcast participants here, and add a topic to discuss. Finally, choose how many turns (back-and-forth conversation changes) you want the podcast to have.

(note -- for more than 12 turns, you may want to switch from `gpt-4` to `gpt-4-32k`).

In [None]:
# Names of the two participants. Each name is interpolated into the system
# prompts ("You are {name}...") and into the per-turn audio filenames.
speaker_one = "Joe Rogan"
speaker_two = "Rick Sanchez, of Rick and Morty"

# Subject of the conversation; interpolated into both speakers' system prompts.
topic = "Artificial Intelligence"

# Number of rounds. In every round speaker_one speaks first and speaker_two
# replies, so the podcast contains 2 * number_of_turns messages in total.
number_of_turns = 7

# Clone the relevant voices.

Replace with a YouTube clip of the first speaker you want to use.

In [None]:
# YouTube URL whose audio track is downloaded and used as the voice sample for speaker_one.
speaker_one_clip = "https://www.youtube.com/watch?v=CM_LWxh33Z8" # a video of speaker_one speaking
# Free-text description sent along with the sample as the voice's `description` field.
speaker_one_voice_description = "American male, deep voice."

Replace with a YouTube clip of the second speaker you want to use.

In [None]:
# YouTube URL whose audio track is downloaded and used as the voice sample for speaker_two.
speaker_two_clip = "https://www.youtube.com/watch?v=GKPcHInn14c" # a video of speaker_two speaking
# Free-text description sent along with the sample as the voice's `description` field.
speaker_two_voice_description = "A bit gruff and raspy, often slurred due to his constant state of inebriation. His speech is punctuated with frequent burps and stammers. He speaks with a cynical and sarcastic tone, often sounding dismissive or condescending. His voice also has a certain manic energy to it, reflecting his chaotic personality."

Now let's create the voices!

In [None]:
import os
from pytube import YouTube
from moviepy.editor import *
import requests

def download_and_trim_audio(clip_url, filename, max_size):
    """Download a YouTube clip's audio track and save it as ``{filename}.mp3``.

    The first audio-only stream is downloaded to ``{filename}.webm`` and
    converted to MP3. If the MP3 is larger than ``max_size``, the clip is cut
    from the start to a duration proportional to ``max_size / file_size`` and
    the MP3 is written again.

    Args:
        clip_url: URL of the YouTube video to pull audio from.
        filename: Output path without extension.
        max_size: Maximum size of the MP3, in megabytes.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    source_path = f"{filename}.webm"
    mp3_path = f"{filename}.mp3"

    yt = YouTube(clip_url)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        # `.first()` yields None for an empty query; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError(f"No audio-only stream found for {clip_url}")
    stream.download(filename=source_path)

    audio = AudioFileClip(source_path)
    try:
        audio.write_audiofile(mp3_path)

        file_size = os.path.getsize(mp3_path) / (1024 * 1024)  # bytes -> MB
        initial_duration = audio.duration

        if file_size > max_size:
            # Size is assumed to scale linearly with duration, so keep the
            # leading fraction of the clip that should fit within max_size.
            new_duration = (max_size / file_size) * initial_duration
            trimmed = audio.subclip(0, new_duration)
            trimmed.write_audiofile(mp3_path)

            final_duration = trimmed.duration
            print(f"Initial duration: {initial_duration:.2f} seconds")
            print(f"Final duration: {final_duration:.2f} seconds")
            print(f"Trimmed: {initial_duration - final_duration:.2f} seconds")
    finally:
        # Release the underlying reader even if conversion fails; the original
        # code never closed the clip.
        audio.close()

def upload_to_api(filename, name, description, api_key):
    """Upload ``{filename}.mp3`` to ElevenLabs as a new voice and return its id.

    Args:
        filename: Path of the MP3 sample without the ``.mp3`` extension.
        name: Display name for the new voice.
        description: Free-text description stored with the voice.
        api_key: ElevenLabs API key, sent in the ``xi-api-key`` header.

    Returns:
        The ``voice_id`` field of the JSON response.

    Raises:
        RuntimeError: If the API responds with a non-success status code.
    """
    url = "https://api.elevenlabs.io/v1/voices/add"
    headers = {
      "Accept": "application/json",
      "xi-api-key": api_key,
    }
    data = {
        'name': name,
        # NOTE: the accent label is fixed to "American" for every voice.
        'labels': '{"accent": "American"}',
        'description': description,
    }
    mp3_path = f"{filename}.mp3"

    # Keep the file open only for the duration of the request so the handle is
    # always closed (the original leaked it).
    with open(mp3_path, 'rb') as audio_file:
        files = [
            ('files', (mp3_path, audio_file, 'audio/mpeg')),
        ]
        response = requests.post(url, headers=headers, data=data, files=files)

    if not response.ok:
        # Surface the API's own error text instead of a bare KeyError below.
        raise RuntimeError(
            f"ElevenLabs voice upload failed ({response.status_code}): {response.text}"
        )
    return response.json()['voice_id']

# First speaker: fetch the sample (capped at 9 MB), then register it as a voice.
download_and_trim_audio(clip_url=speaker_one_clip, filename='speaker_one', max_size=9)
voice_one_id = upload_to_api(
    filename='speaker_one',
    name='Podcast Voice #1',
    description=speaker_one_voice_description,
    api_key=eleven_labs_api_key,
)

# Second speaker: same two steps with the second clip and description.
download_and_trim_audio(clip_url=speaker_two_clip, filename='speaker_two', max_size=9)
voice_two_id = upload_to_api(
    filename='speaker_two',
    name='Podcast Voice #2',
    description=speaker_two_voice_description,
    api_key=eleven_labs_api_key,
)

In [None]:
def _strip_stage_directions(text):
    """Remove known stage-direction markers so they are not read aloud by the TTS voice."""
    for marker in ('*(burps)*', '*(laughs)*', '*laughs and burps*', '*belches and laughs*'):
        text = text.replace(marker, '')
    return text


def _build_turn_messages(history, speaker, system_prompt, is_last_turn):
    """Build the chat messages for one speaker: history from their point of view, then instructions."""
    # The speaker's own past lines become "assistant" messages; everything
    # else is presented as "user" input.
    messages = [
        {"role": "assistant" if entry["role"] == speaker else "user", "content": entry["content"]}
        for entry in history
    ]
    messages.append(system_prompt)
    if is_last_turn:
        # Sent with the final round's requests so the speakers actually see it.
        messages.append({"role": "system", "content": "This is your last turn to speak, wrap it up."})
    return messages


def _retry_once(action, delay_seconds=30):
    """Run ``action``; if it raises, wait ``delay_seconds`` and run it one more time."""
    try:
        return action()
    except Exception as error:  # not a bare except: Ctrl-C / kernel interrupt must still work
        print(f"Request failed ({error!r}); retrying in {delay_seconds} seconds...")
        time.sleep(delay_seconds)
        return action()


def _synthesize_speech(text, voice, filename):
    """Convert ``text`` to speech with the given ElevenLabs voice id and write the MP3 to ``filename``."""
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice}"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": eleven_labs_api_key,
    }
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }
    tts_response = requests.post(url, json=data, headers=headers)
    if not tts_response.ok:
        # Without this check an error body would be written to the .mp3 file
        # and only fail later, confusingly, when the file is decoded.
        raise RuntimeError(
            f"ElevenLabs text-to-speech failed ({tts_response.status_code}): {tts_response.text}"
        )
    with open(filename, 'wb') as f:
        for chunk in tts_response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)


def generate_podcast(name1, name2, name1_voice, name2_voice, topic, num_turns, model="gpt-4"):
    """Generate a two-person podcast and save it as ``podcast.mp3``.

    For each of ``num_turns`` rounds, ``name1`` and then ``name2`` produce one
    chat-completion message, which is printed, converted to speech with that
    speaker's ElevenLabs voice, saved as ``{name}_turn_{i}.mp3`` and appended
    to the running audio with a 100 ms pause between segments.

    Args:
        name1: Name of the first speaker (speaks first in every round).
        name2: Name of the second speaker.
        name1_voice: ElevenLabs voice id used for ``name1``.
        name2_voice: ElevenLabs voice id used for ``name2``.
        topic: Subject of the conversation.
        num_turns: Number of rounds; must be at least 1.
        model: Chat model name passed to ``openai.ChatCompletion.create``.

    Returns:
        An ``IPython.display.Audio`` object for ``podcast.mp3``.

    Raises:
        ValueError: If ``num_turns`` is less than 1.
        RuntimeError: If text-to-speech still fails after one retry.
    """
    if num_turns < 1:
        # With zero rounds there would be no audio to export.
        raise ValueError("num_turns must be at least 1")

    def system_prompt_for(me, other):
        return {"role": "system", "content": f"You are {me}. You are recording a podcast with {other} about {topic}. Talk as naturally as possible -- use the language {me} would actually use. Don't just blindly agree — debate, discuss, and have fun! Respond with one message per turn. Don't include anything other than your response."}

    speakers = [
        (name1, name1_voice, system_prompt_for(name1, name2)),
        (name2, name2_voice, system_prompt_for(name2, name1)),
    ]

    conversation_history = []
    full_audio = None
    pause = AudioSegment.silent(duration=100)  # 100ms pause

    for i in range(num_turns):
        is_last_turn = i == num_turns - 1
        for name, voice, system_prompt in speakers:
            messages = _build_turn_messages(conversation_history, name, system_prompt, is_last_turn)
            response = _retry_once(lambda: openai.ChatCompletion.create(
                model=model,
                messages=messages,
                presence_penalty=.7,
            ))
            # Strip stage directions on every path (previously only done on retry).
            message = _strip_stage_directions(response.choices[0].message['content'])
            print(f"{name}: {message}")
            conversation_history.append({"role": name, "content": message})

            # Generate and save audio for the message
            filename = f'{name}_turn_{i}.mp3'
            _retry_once(lambda: _synthesize_speech(message, voice, filename))

            # Short delay kept from the original flow; presumably it also
            # spaces out consecutive API requests.
            time.sleep(3)

            # Concatenate audio, placing the pause between segments only.
            new_audio = AudioSegment.from_mp3(filename)
            full_audio = new_audio if full_audio is None else full_audio + pause + new_audio

    # Export full audio
    full_audio.export("podcast.mp3", format="mp3")

    # Play the audio in the notebook
    return Audio("podcast.mp3")

# Now, run this cell to generate the podcast!

In [None]:
# Runs the whole pipeline. The call is the cell's last expression, so the
# Audio object it returns for podcast.mp3 is displayed as the cell output.
generate_podcast(speaker_one, speaker_two, voice_one_id, voice_two_id, topic, number_of_turns)