# Problem statement

Generate a personalized newsletter every week that summarizes each podcast episode released in that week. It would include information about the guest on the episode, the main topics discussed as well as some highlights. It would work by collecting a list of RSS feeds from the user and on a periodic basis, process the latest episodes and generate the newsletter. 

# Approach

- Part 1: use a Speech to Text model from OpenAI called Whisper to transcribe the podcast.
- Part 2: use a Large Language Model from OpenAI to build the information extraction functionality to get insights from the podcast.
- Part 3: use ChatGPT from OpenAI as your coding assistant to create and deploy a front-end that allows users to experience the end-to-end functionality.



Define the function `download_podcast_episode` that takes in the RSS feed and downloads the latest podcast episode as an MP3 file.

In [None]:
import os

def download_podcast_episode(rss_url, local_path="/content/"):
  """Download the latest episode of a podcast feed as an MP3 file.

  The first entry of the parsed feed is treated as the latest episode, and
  its `audio/mpeg` link is streamed to `<local_path>/podcast_episode.mp3`.

  Args:
    rss_url: URL of the podcast RSS feed.
    local_path: Directory the MP3 is written to; created if missing.

  Returns:
    A tuple `(episode_path, podcast_title, episode_title, episode_image)`.
    `episode_image` is an empty string when the feed declares no image.

  Raises:
    ValueError: If the feed has no entries or the latest entry has no
      `audio/mpeg` link.
    requests.HTTPError: If the audio download returns an error status.
  """
  import feedparser
  import requests
  from pathlib import Path

  print ("Starting Podcast Transcription Function")
  print ("Feed URL: ", rss_url)

  # Read from the RSS Feed URL
  intelligence_feed = feedparser.parse(rss_url)
  if not intelligence_feed.entries:
    raise ValueError(f"No episodes found in RSS feed: {rss_url}")
  latest_episode = intelligence_feed.entries[0]

  podcast_title = intelligence_feed['feed']['title']
  episode_title = latest_episode['title']
  # Not every feed declares an image; fall back to an empty string.
  episode_image = intelligence_feed['feed'].get('image', {}).get('href', "")

  # Links without a type or href are skipped instead of raising KeyError.
  # As before, the last matching link wins when several are present.
  episode_url = None
  for item in latest_episode.get('links', []):
    if item.get('type') == 'audio/mpeg' and item.get('href'):
      episode_url = item.get('href')
  if episode_url is None:
    raise ValueError(f"No audio/mpeg link found for the latest episode of: {rss_url}")

  episode_name = "podcast_episode.mp3"
  print ("RSS URL read and episode URL: ", episode_url)

  # Setup the path variable
  p = Path(local_path)
  p.mkdir(parents=True, exist_ok=True)
  episode_path = p.joinpath(episode_name)

  # Download the latest podcast episode
  print ("Downloading the podcast episode")
  # A timeout keeps a stalled server from hanging the notebook forever.
  with requests.get(episode_url, stream=True, timeout=60) as r:
    r.raise_for_status()
    with open(episode_path, 'wb') as f:
      for chunk in r.iter_content(chunk_size=8192):
        if chunk:
          f.write(chunk)

  print ("Podcast Episode downloaded")
  return episode_path, podcast_title, episode_title, episode_image

Next, define the transcription function, `transcribe_podcast_episode`, which uses the Distil-Whisper model to transcribe the podcast episode.

In [68]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

def transcribe_podcast_episode(episode_path):
  """Transcribe a podcast audio file with the Distil-Whisper model.

  Loads `distil-whisper/distil-medium.en`, runs it on the first CUDA device
  in float16 when one is available (CPU in float32 otherwise), and
  transcribes the audio in 15-second chunks.

  Args:
    episode_path: Path (str or `pathlib.Path`) of the audio file to
      transcribe.

  Returns:
    The transcript text of the episode.
  """
  # Run the transcription process
  print (episode_path)

  device = "cuda:0" if torch.cuda.is_available() else "cpu"
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

  model_id = "distil-whisper/distil-medium.en"

  model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
  )
  model.to(device)

  processor = AutoProcessor.from_pretrained(model_id)

  pipe = pipeline(
      "automatic-speech-recognition",
      model=model,
      tokenizer=processor.tokenizer,
      feature_extractor=processor.feature_extractor,
      max_new_tokens=128,
      chunk_length_s=15,
      batch_size=16,
      torch_dtype=torch_dtype,
      device=device,
  )

  # Transcribe the file the caller passed in rather than a hard-coded
  # location; str() lets a pathlib.Path be used as the filename.
  result = pipe(str(episode_path))
  podcast_transcript = result["text"]
  print ("Podcast Transcription Completed")
  return podcast_transcript

Finally add all the information extraction functions - `get_podcast_summary`, `get_podcast_guest` and `get_podcast_highlights` which each make calls to the OpenAI LLM.

In [69]:
from openai import OpenAI
from google.colab import userdata

# Shared OpenAI client used by every extraction function below; the API key
# is looked up under 'OPENAI_API_KEY' through Colab's `userdata` helper.
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

def get_podcast_summary(podcast_transcript):
  """Ask the chat model for a concise newsletter-style summary.

  The full transcript is appended to a fixed copywriter prompt and sent as a
  single user message to `gpt-3.5-turbo-16k`.

  Args:
    podcast_transcript: Transcript text of the episode.

  Returns:
    The content of the first completion choice.
  """
  summary_prompt = """
  You are an expert copywriter who is responsible for publishing newsletters with thousands of subscribers. You recently listened to a great podcast
  and want to share a summary of it with your readers. Please write the summary of this podcast making sure to cover the important aspects that were
  discussed and please keep it concise.
  The transcript of the podcast is provided below.
  """
  conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": summary_prompt + podcast_transcript},
  ]
  completion = client.chat.completions.create(
    model="gpt-3.5-turbo-16k",
    messages=conversation,
  )
  return completion.choices[0].message.content

def get_podcast_guest(podcast_transcript):
  """Identify the podcast guest and fetch a short Wikipedia summary.

  The chat model is forced to call the `get_podcast_guest_information`
  function so that the guest's name, organization and title come back as
  structured arguments. The name, organization and title are then combined
  into a Wikipedia page lookup.

  Args:
    podcast_transcript: Transcript text of the episode.

  Returns:
    A dict with the keys 'name', 'org', 'title' and 'summary'. All four are
    "Not Available" when no guest name could be extracted; 'summary' alone
    is "Not Available" when the Wikipedia page is missing or ambiguous.
  """
  import wikipedia
  import json

  def _clean(value):
    # Normalise a model-supplied argument to a stripped string ("" if absent).
    return str(value).strip() if value else ""

  # Only the first 10000 characters are sent -- presumably to stay within
  # the model's context window (TODO confirm).
  request = podcast_transcript[:10000]
  completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": request}],
    functions=[
    {
        "name": "get_podcast_guest_information",
        "description": "Get information on the podcast guest using their full name and the name of the organization they are part of to search for them on Wikipedia or Google",
        "parameters": {
            "type": "object",
            "properties": {
                "guest_name": {
                    "type": "string",
                    "description": "The full name of the guest who is speaking in the podcast",
                },
                "guest_organization": {
                    "type": "string",
                    "description": "The full name of the organization that the podcast guest belongs to or runs",
                },
                "guest_title": {
                    "type": "string",
                    "description": "The title, designation or role of the podcast guest in their organization",
                },
            },
            "required": ["guest_name"],
        },
    }],
    function_call={"name": "get_podcast_guest_information"})
  response_message = completion.choices[0].message

  # Extract the relevant information
  function_args = {}
  if response_message.function_call:
    try:
      function_args = json.loads(response_message.function_call.arguments)
    except json.JSONDecodeError:
      # The model does not always emit valid JSON arguments.
      print("Could not parse the guest information returned by the model.")
    if not isinstance(function_args, dict):
      function_args = {}

  podcast_guest = _clean(function_args.get("guest_name"))
  podcast_guest_org = _clean(function_args.get("guest_organization"))
  podcast_guest_title = _clean(function_args.get("guest_title"))

  # A missing OR empty name means there is nothing to look up; previously an
  # empty name fell through to a blank Wikipedia search.
  if not podcast_guest:
    return {
      'name': "Not Available",
      'org': "Not Available",
      'title': "Not Available",
      'summary': "Not Available",
    }

  search_query = " ".join(
    part for part in (podcast_guest, podcast_guest_org, podcast_guest_title) if part
  )
  try:
    guest_page = wikipedia.page(search_query, auto_suggest=True)
    podcast_guest_summary = guest_page.summary
  except wikipedia.exceptions.PageError:
    print(f'The page for guest "{podcast_guest}" does not exist on Wikipedia.')
    podcast_guest_summary = "Not Available"
  except wikipedia.exceptions.DisambiguationError as e:
    print(f'The page for guest "{podcast_guest}" is ambiguous. Possible matches are:')
    print(e.options)
    podcast_guest_summary = "Not Available"

  return {
    'name': podcast_guest,
    'org': podcast_guest_org,
    'title': podcast_guest_title,
    'summary': podcast_guest_summary,
  }

def get_podcast_highlights(podcast_transcript):
  """Ask the chat model to pick highlight statements from a transcript.

  The full transcript is appended to a fixed podcast-editor prompt and sent
  as a single user message to `gpt-3.5-turbo-16k`.

  Args:
    podcast_transcript: Transcript text of the episode.

  Returns:
    The content of the first completion choice; the prompt asks for a
    dash-prefixed list of highlights.
  """
  highlights_prompt = """
  You are a podcast editor and producer. You are provided with the transcript of a podcast episode and have to identify the 5 most significant moments in the podcast as highlights.
  - Each highlight needs to be a statement by one of the podcast guests
  - Each highlight has to be impactful and an important takeaway from this podcast episode
  - Each highlight must be concise and make listeners want to hear more about why the podcast guest said that
  - The highlights that you pick must be spread out throughout the episode

  Provide only the highlights and nothing else. Provide the full sentence of the highlight and format it as follows -

  - Highlight 1 of the podcast
  - Highlight 2 of the podcast
  - Highlight 3 of the podcast
  """
  conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": highlights_prompt + podcast_transcript},
  ]
  completion = client.chat.completions.create(
    model="gpt-3.5-turbo-16k",
    messages=conversation,
  )
  return completion.choices[0].message.content

Lastly, create the `process_podcast` function, which runs the entire pipeline and returns the output as a dictionary.

In [70]:
def process_podcast(url):
  """Run the whole pipeline for one RSS feed and collect the results.

  Downloads the latest episode, transcribes it, then derives the summary,
  guest information and highlights from the transcript.

  Args:
    url: URL of the podcast RSS feed.

  Returns:
    A dict with the keys 'podcast_title', 'episode_title', 'episode_image',
    'podcast_summary', 'podcast_guest', 'podcast_highlights' and
    'podcast_transcription'.
  """
  episode_file, show_title, latest_title, cover_image = download_podcast_episode(url)
  transcript = transcribe_podcast_episode(episode_file)

  summary = get_podcast_summary(transcript)
  guest = get_podcast_guest(transcript)
  highlights = get_podcast_highlights(transcript)

  return {
    'podcast_title': show_title,
    'episode_title': latest_title,
    'episode_image': cover_image,
    'podcast_summary': summary,
    'podcast_guest': guest,
    'podcast_highlights': highlights,
    'podcast_transcription': transcript,
  }

To test the entire process, we can call it directly with a podcast RSS URL of your choice.

Please make sure that any podcast you choose has episodes in the 25-30 minute range.

In [None]:
%%time
# Run the full pipeline (download, transcribe, extract) on a sample RSS feed.
# `out` holds the resulting dictionary and is saved to JSON in a later cell.
out = process_podcast("https://feeds.megaphone.fm/HS6260485755")

Additional helper function to save the produced podcast summaries and other information into a JSON file that can be retrieved by the front-end.

In [73]:
import json
# Persist the processed episode in /content/, the folder that is later
# scanned for *.json files to populate the front-end.
with open("/content/podcast-1.json", "w") as outfile:
  json.dump(out, outfile)

# Create a Gradio front-end application that displays the podcast information


In [87]:
import os
import json

def create_dict_from_json_files(folder_path):
    """Load every saved podcast JSON file in a folder, keyed by podcast title.

    Files are read in sorted name order, so when two files share a podcast
    title the later name deterministically wins. Entries that are not
    regular files, cannot be parsed as JSON, or do not hold an object with a
    'podcast_title' key are skipped, so unrelated `.json` files in the
    folder do not abort loading.

    Args:
        folder_path: Directory to scan (non-recursively) for `.json` files.

    Returns:
        A dict mapping each podcast title to its loaded podcast-info dict.
    """
    data_dict = {}

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                podcast_info = json.load(file)
        except (OSError, ValueError) as err:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping {file_path}: {err}")
            continue

        # Only objects that carry a podcast title are podcast summaries.
        if not isinstance(podcast_info, dict) or 'podcast_title' not in podcast_info:
            continue
        data_dict[podcast_info['podcast_title']] = podcast_info

    return data_dict

In [88]:
# Snapshot of the podcasts already processed and saved under /content/;
# built once here, so files saved afterwards are not picked up automatically.
available_podcast_info = create_dict_from_json_files('/content/')

In [97]:
def generate_podcast_episode_html(podcast_info):
    """Render one processed podcast as a self-contained HTML page.

    Every value is HTML-escaped before interpolation because the title and
    image URL come from an external RSS feed and the summary, highlights and
    guest name come from model output. The highlights text is split into
    lines, leading bullet markers are removed, and each non-empty line
    becomes one `<li>` element.

    Args:
        podcast_info: Dict with the keys 'podcast_title', 'podcast_summary',
            'episode_image', 'podcast_highlights' and 'podcast_guest' (itself
            a dict with a 'name' key).

    Returns:
        The HTML document as a string.
    """
    import html

    title = html.escape(str(podcast_info['podcast_title']))
    summary = html.escape(str(podcast_info['podcast_summary']))
    image_src = html.escape(str(podcast_info['episode_image']))
    guest = html.escape(str(podcast_info['podcast_guest']['name']))

    # The highlights arrive as "- text" lines; a <ul> needs <li> children.
    highlight_items = []
    for line in str(podcast_info['podcast_highlights']).splitlines():
        text = line.strip().lstrip('-*\u2022').strip()
        if text:
            highlight_items.append(f"<li>{html.escape(text)}</li>")
    highlight = "\n                ".join(highlight_items)

    # Define the HTML content as a string with placeholders for variables.
    # The highlights/guest panels set their own dark text colour: they would
    # otherwise inherit white from <body> on a near-white background.
    html_content = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{title}</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background: linear-gradient(135deg, #6e8efb, #a777e3);
            color: #fff;
            margin: 0;
            padding: 0;
            height: 100vh;
        }}

        .container {{
            max-width: 960px;
            margin: 20px auto;
            background-color: rgba(255, 255, 255, 0.8);
            border-radius: 10px;
            padding: 20px;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        }}

        .episode-title {{
            font-size: 24px;
            font-weight: bold;
            margin-bottom: 20px;
            color: #333;
        }}

        .episode-summary {{
            margin-bottom: 20px;
            color: #444;
        }}

        .episode-image {{
            width: 100%;
            max-width: 100%;
            height: auto;
            border-radius: 5px;
        }}

        .episode-highlights,
        .episode-guest {{
            margin-top: 20px;
            background-color: rgba(255, 255, 255, 0.9);
            padding: 10px;
            border-radius: 5px;
            color: #444;
        }}

        .episode-highlights h2,
        .episode-guest h2 {{
            font-size: 18px;
            font-weight: bold;
            margin-bottom: 10px;
            color: #5a5a5a;
        }}

        ul {{
            list-style-type: disc;
            margin-left: 20px;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1 class="episode-title">{title}</h1>
        <div class="episode-summary">
            <p>
                {summary}
            </p>
        </div>
        <img class="episode-image" src="{image_src}" alt="{title}">
        <div class="episode-highlights">
            <h2>Podcast Highlights</h2>
            <ul>
                {highlight}
            </ul>
        </div>
        <div class="episode-guest">
            <h2>Podcast Guest</h2>
            <p>
                {guest}
            </p>
        </div>
    </div>
</body>
</html>
    """

    return html_content

In [95]:
def show_podcast_info(podcast_name):
  """Render the stored page for a previously processed podcast.

  Args:
    podcast_name: Podcast title used as the key in `available_podcast_info`.

  Returns:
    The HTML page for that podcast.
  """
  return generate_podcast_episode_html(available_podcast_info[podcast_name])

def process_podcast_info(rss_url):
  """Process the latest episode of an RSS feed and render it as HTML.

  Args:
    rss_url: URL of the podcast RSS feed.

  Returns:
    The HTML page built from the freshly processed episode.
  """
  return generate_podcast_episode_html(process_podcast(rss_url))

In [None]:
import gradio as gr

# Two-path UI: pick an already-processed podcast from the dropdown, or paste
# an RSS URL to process a new one. Both paths render into the same HTML pane.
with gr.Blocks() as demo:
    # Choices are the podcast titles loaded into `available_podcast_info`.
    podcast_name = gr.Dropdown(available_podcast_info.keys(), label="podcast_name")
    show_podcast_button = gr.Button("Show Podcast Summary")
    podcast_url = gr.Textbox(label="podcast_url")
    process_podcast_button = gr.Button("Process Podcast")
    output= gr.HTML(label="podcast_info")
    # Show a stored summary for the selected podcast.
    show_podcast_button.click(fn=show_podcast_info, inputs=podcast_name, outputs=output, api_name="show_podcast_info")
    # Run the full download/transcribe/extract pipeline for the given feed URL.
    process_podcast_button.click(fn=process_podcast_info, inputs=podcast_url, outputs=output, api_name="process_podcast_info")

demo.launch(debug=True)