In [None]:
import os
from huggingface_hub import InferenceClient
from PyPDF2 import PdfReader
from transformers import pipeline
from datasets import load_dataset
import torch
import soundfile as sf

  from .autonotebook import tqdm as notebook_tqdm


### Script Generation

In [3]:
def get_pdf_content(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Opens ``pdf_path`` with ``PdfReader`` and concatenates the result of
    ``extract_text()`` for each page, in page order, with no separator
    inserted between pages.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The concatenated page texts (an empty string when there are no pages).
    """
    reader = PdfReader(pdf_path)
    return "".join(page.extract_text() for page in reader.pages)

In [4]:
def get_llm_response(prompt):
    """Send a chat prompt to DeepSeek-R1 and return the raw completion.

    A fresh ``InferenceClient`` (provider ``novita``) is created on every
    call, authenticated with the ``HF_TOKEN_READ`` environment variable.

    Args:
        prompt: The chat messages, forwarded unchanged as ``messages``.

    Returns:
        The completion object returned by ``chat.completions.create``;
        callers read the text from ``.choices[0].message.content``.

    Raises:
        KeyError: If ``HF_TOKEN_READ`` is not set in the environment.
    """
    hf_token = os.environ["HF_TOKEN_READ"]
    llm_client = InferenceClient(provider="novita", api_key=hf_token)

    return llm_client.chat.completions.create(
        model="deepseek-ai/DeepSeek-R1-0528",
        messages=prompt,
    )

In [None]:
# Location of the research paper that the podcast is built from.
in_folder_path = "../data"
in_file_name = "research_paper.pdf"
in_file_path = "/".join([in_folder_path, in_file_name])

# Full text of the paper; reused as the user message of both LLM prompts.
extracted_content = get_pdf_content(in_file_path)

# Step 1: ask the model for five discussion questions grounded in the paper.
# The system-prompt text is sent to the model verbatim, so the literal
# (including its embedded indentation) is kept exactly as authored.
podcast_question_gen_prompt = [
    {
        "role": "system",
        "content": """Generate 5 thoughtful and engaging questions for a podcast, based only on the core topics from the provided research paper.

                                                        The audience includes both technical professionals and business managers. So, write the questions in a clear, jargon-free way—balancing accuracy with simplicity.

                                                        Focus on the main contributions, methods, or challenges highlighted in the paper. Questions should spark discussion and make the topic approachable for a wide audience.""",
    },
    {"role": "user", "content": extracted_content},
]
raw_podcast_questions = get_llm_response(podcast_question_gen_prompt)

# The model's reply carries its reasoning before a closing </think> tag; keep
# only what follows the last tag. strip() removes the blank lines left right
# after the tag so they are not interpolated into the script-generation prompt.
podcast_questions = (
    raw_podcast_questions
    .choices[0]
    .message.content.split("</think>")[-1]
    .strip()
)

# Step 2: turn the paper plus the generated questions into a monologue script.
# The system prompt is an f-string: {podcast_questions} is substituted in
# place. The literal text (including its embedded indentation) is sent to the
# model verbatim and is kept exactly as authored.
podcast_script_gen_prompt = [
    {
        "role": "system",
        "content": f"""Generate a podcast script based on the research paper content provided by the user.

                                    Use the following structure:
                                    - Introduction: Briefly introduce the research topic in simple, layman-friendly language.
                                    - Core Discussion: Using the questions below, generate clear and concise answers derived from the user content. Maintain a conversational and engaging tone, suitable for both technical professionals and non-experts.

                                    Questions:
                                    {podcast_questions}

                                    - Conclusion: Summarize the conversation and conclude the podcast script.

                                    Ensure the entire script is easy to understand, avoids heavy jargon, and flows like a real podcast episode. Format the script as a monologue podcast where a single speaker walks the audience through the questions and answers."""
    },
    {
        "role": "user",
        "content": extracted_content
    },
]

raw_podcast_script = get_llm_response(podcast_script_gen_prompt)

# Keep only the text after the model's closing </think> tag. strip() removes
# the blank lines that follow the tag, which would otherwise sit at the top of
# the saved transcript and count toward the text-to-speech length cap.
final_podcast_script = (
    raw_podcast_script.choices[0].message.content.split("</think>")[-1].strip()
)

# Save the generated script as <paper name>.txt under ../transcripts.
out_folder_path = "../transcripts"
# Create the folder on first run; open(..., "w") does not create parent directories.
os.makedirs(out_folder_path, exist_ok=True)

# splitext drops only the final extension, so "paper.v2.pdf" -> "paper.v2"
# (split(".")[0] would have truncated it to "paper").
out_file_name = os.path.splitext(in_file_name)[0] + ".txt"
# NOTE: from here on out_folder_path holds the full transcript *file* path;
# the variable name is kept unchanged from the original cell.
out_folder_path = f"{out_folder_path}/{out_file_name}"

# The script may contain non-ASCII punctuation (dashes, curly quotes); an
# explicit UTF-8 encoding avoids UnicodeEncodeError on platforms whose default
# text encoding is not UTF-8.
with open(out_folder_path, "w", encoding="utf-8") as f:
    f.write(final_podcast_script)

#### Exploring the output

In [None]:
# Show the model's reasoning (everything before the first </think> tag)
# followed by the cleaned-up questions that were actually used.
print("---- Thought Process ----")
print(raw_podcast_questions.choices[0].message.content.partition("</think>")[0])
print("\n---- Actual Output ----")
print(podcast_questions)

---- Thought Process ----
<think>
Hmm, the user wants me to generate podcast questions based on the "Attention Is All You Need" paper for a mixed technical and business audience. Let me analyze the paper's core contributions first.

The paper introduces the Transformer architecture which replaces RNNs/CNNs entirely with attention mechanisms. Key innovations include multi-head attention, positional encodings, and the encoder-decoder structure. The results show significant improvements in translation quality (28.4 BLEU for EN-DE) with much faster training times (3.5 days vs weeks).

For the podcast format, I need to avoid technical jargon while keeping questions accurate. The audience includes both engineers and managers, so questions should bridge technical concepts with business impact. 

First question could focus on the fundamental shift - asking why attention alone suffices instead of RNNs. Second question should explain multi-head attention simply using the "different perspectives"

In [None]:
# Show the model's reasoning (everything before the first </think> tag)
# followed by the final script that was written to the transcript file.
print("---- Thought Process ----")
print(raw_podcast_script.choices[0].message.content.partition("</think>")[0])
print("\n---- Actual Output ----")
print(final_podcast_script)

---- Thought Process ----
<think>
Alright, let me tackle this podcast script request about the "Attention Is All You Need" paper. The user wants a conversational yet informative monologue covering five specific questions, avoiding heavy jargon while keeping it engaging for both technical and non-technical audiences. 

First, I need to distill the paper's essence: it introduced the Transformer architecture that relies solely on attention mechanisms, ditching RNNs/CNNs. The key innovations are multi-head attention, positional encoding, and the encoder-decoder structure. The paper demonstrates superior translation results with faster training times.

For the introduction, I'll start with a relatable analogy about language processing to hook listeners. Then I'll address each question systematically:
1) For "why attention is all you need," I'll contrast sequential processing in RNNs with Transformer's parallel attention, using the "spotlight" metaphor from the paper's visualizations.
2) Exp

### Audio Generation

In [21]:
def generate_audio(podcast_script, max_chars=600, speaker_index=7306):
    """Synthesize speech for the beginning of a podcast script.

    Loads a speaker x-vector from the ``Matthijs/cmu-arctic-xvectors``
    validation split and runs the ``microsoft/speecht5_tts`` text-to-speech
    pipeline on at most ``max_chars`` characters of ``podcast_script``.

    Args:
        podcast_script: The script text to read out.
        max_chars: Maximum number of *characters* (not words) passed to the
            model. The original code capped the input at 600 because of a
            model input-length limitation. ``None`` disables the cap.
        speaker_index: Row of the x-vector dataset used as the voice.

    Returns:
        The pipeline output; callers read its ``"audio"`` and
        ``"sampling_rate"`` entries.
    """
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

    speaker_embeddings = embeddings_dataset[speaker_index]["xvector"]
    # unsqueeze(0) adds a leading dimension to the speaker vector.
    speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)
    pipe = pipeline("text-to-speech", model="microsoft/speecht5_tts")

    # Cap the input length in characters.
    tts_text = podcast_script[:max_chars]

    # If the cut landed inside a word, back up to the previous whitespace so
    # the audio does not end on a half-pronounced fragment. A text with no
    # whitespace at all keeps the plain hard cut.
    cut_mid_word = (
        len(podcast_script) > len(tts_text)
        and bool(tts_text)
        and not tts_text[-1].isspace()
        and not podcast_script[len(tts_text)].isspace()
    )
    if cut_mid_word:
        pieces = tts_text.rsplit(None, 1)
        if len(pieces) == 2:
            tts_text = pieces[0]

    speech = pipe(tts_text, forward_params={"speaker_embeddings": speaker_embeddings})

    return speech

In [None]:
# Synthesize speech for the final script.
podcast_audio = generate_audio(final_podcast_script)

# Save the audio as <paper name>.wav under ../audio.
out_audio_path = "../audio"
# Create the folder on first run so the write below does not fail on a
# missing directory.
os.makedirs(out_audio_path, exist_ok=True)

# splitext drops only the final extension, so "paper.v2.pdf" -> "paper.v2"
# (split(".")[0] would have truncated it to "paper").
out_audio_file_name = os.path.splitext(in_file_name)[0] + ".wav"
# NOTE: despite its name, out_audio_folder_path is the full path of the .wav file.
out_audio_folder_path = f"{out_audio_path}/{out_audio_file_name}"

sf.write(out_audio_folder_path, podcast_audio["audio"], samplerate=podcast_audio["sampling_rate"])