In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain.tools import Tool
from langchain.agents import initialize_agent, AgentType

In [2]:
def parse_url(url: str) -> str:
    """
    Extract video ID from URL.

    Args: 
        url(str): youtube video url

    Returns:
        Youtube video's video ID
    
    """
    if "=" in url:
        return url.split("=")[-1]

    return url


def get_text_from_video(url: str) -> str:
    """
    Get transcript text from YouTube video.

    Args:
        url(str): youtube video url

    Returns:
        Youtube video's transcripted text
    
    """
    video_id = parse_url(url)
    
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = " ".join([entry["text"] for entry in transcript])
        transcript_text = transcript_text.replace("\n", " ").replace("'", "")
        return transcript_text
    except Exception as e:
        return f"Failed to retrieve transcript: {str(e)}"

In [11]:
dstax = get_text_from_video("https://www.youtube.com/watch?v=zmLQ0yBaN3U")

In [4]:
with open('dimon.txt',"w") as f:
    f.write(dimon)

In [12]:
dstax

'Failed to retrieve transcript: \nCould not retrieve a transcript for the video https://www.youtube.com/watch?v=zmLQ0yBaN3U! This is most likely caused by:\n\nSubtitles are disabled for this video\n\nIf you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!'

In [18]:
#load text from file
with open("xgove_team_call.txt", "r") as f:
    transcript = f.read()

In [6]:
def create_chunks(transcript_text: str) -> list:
    """
    Split transcript text into processable chunks.

    Args:
        transcript_text (str): Youtube video's transcripted text

    Returns:
        processable chunks
    
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_text(transcript_text)
    return chunks

In [7]:
chunks = create_chunks(dimon)

In [8]:
chunks

['Failed to retrieve transcript: \nCould not retrieve a transcript for the video https://www.youtube.com/watch?v=2s! This is most likely caused by:\n\nSubtitles are disabled for this video\n\nIf you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!']

In [22]:
chunks[0]

"You can start after I share the screen. Alright, Marta, you're good to go. Hello, good morning, good afternoon, good evening everyone. Welcome to our enablement session. Thank you so much for taking your precious time to join us today. Today we will be exploring the new key features of WatsonX governance along with the new sales tactics defined along with around of them. So with you today, if we can move to the next slide. My name is Marta Świątkiewicz-Tańska. I am Worldwide WatsonX governance sales leader and I'm thrilled to be joined by our worldwide cross function team. And we have a yet field that we are joined also by our guest speaker, Mohamed, who will share with us a fabulous one IBM win story. Before we start, let's go through very quick rundown. What is on our agenda for today? So we will start with quick recap. What is available available today in terms of software, SaaS, WatsonX governance deployment option. Then we will go into our product enhancements. And as you can"

In [23]:
def get_summary(chunks: list) -> str:
    """
    Summarize text chunks and create a single summary.
    
    Args:
        chunks (list): processable chunks of transcriptted text

    Returns:
        A single summary for youtube video
    """
    llm = OllamaLLM(model="llama3")

    template = """Text: {text}
    Goal: Summarize given text.
    Answer: """

    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | llm

    summaries = [chain.invoke({"text": chunk}) for chunk in chunks]
    
    combined_summary = " ".join(summaries)
    
    # Create final summary
    final_summary_prompt = ChatPromptTemplate.from_template(
        "Multiple summaries: {summaries}\nGoal: Create a coherent single summary.\nAnswer: "
    )
    final_summary_chain = final_summary_prompt | llm
    final_summary = final_summary_chain.invoke({"summaries": combined_summary})
    
    return final_summary

In [24]:
ds_summ = get_summary(chunks)

In [10]:
ds_summ

"Here's a concise summary:\n\nA 50-year-old individual embarks on a new journey, transitioning from mere survival to thriving. After facing various life challenges, including divorce, single parenthood, and significant life changes, they've realized the importance of living intentionally. To share their experiences and connect with others who may be facing similar struggles, they're starting a vlog that documents their self-discovery, adventure, and growth. The speaker aims to inspire others by showcasing everyday moments, sharing lessons learned from parenting, and exploring healthy living practices. By doing so, they hope to demonstrate that even small moments can hold beauty and significance, and that it's never too late to start making positive changes in life."

In [25]:
with open("xgov_summ.txt", "w") as f:
    f.write(ds_summ)

In [26]:
def extract_topics(chunks:list) -> list:
    """
    Extract main topics from text chunks.
    
    Args:
        chunks (list): processable chunks of transcriptted text
    
    Returns:
        Main topic list
    """
    llm = OllamaLLM(model="llama3")

    template = """Text: {text}
    Goal: Extract main topics from the given text.
    Answer: List the key topics separated by commas."""

    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | llm

    topics_list = [chain.invoke({"text": chunk}) for chunk in chunks]

    # Combine topics from different chunks
    all_topics = set()
    for topics in topics_list:
        # Split comma-separated topics and clean whitespace
        topic_items = [t.strip() for t in topics.split(",")]
        all_topics.update(topic_items)

    # Remove empty elements
    all_topics = {topic for topic in all_topics if topic}
    
    return list(all_topics)

In [27]:
tops = extract_topics(chunks)

In [28]:
len(tops)
with open("xgov_topics.txt", "w") as f:
    f.write("\n".join(tops))        

In [29]:
def extract_quotes(chunks:list) -> list:
    """
    Extract important quotes from text chunks.
    
    Args:
        chunks (list): processable chunks of transcriptted text
    
    Returns:
        important quotes list
    """
    llm = OllamaLLM(model="llama3")
    template = """Text: {text}
    Goal: Extract the most important quote from this text.
    Answer: Provide the quote as plain text."""

    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | llm

    quotes = [chain.invoke({"text": chunk}) for chunk in chunks]
    
    # Filter duplicate or empty quotes
    unique_quotes = []
    seen_quotes = set()
    
    for quote in quotes:
        # Normalize quote (clean whitespace and compare lowercase)
        normalized = quote.strip().lower()
        if normalized and normalized not in seen_quotes:
            unique_quotes.append(quote.strip())
            seen_quotes.add(normalized)
    
    return unique_quotes

In [30]:
quotes = extract_quotes(chunks)

In [31]:
with open("xgov_quotes.txt", "w") as f:
    f.write("\n".join(quotes))

In [32]:
quotes

['"Welcome to our enablement session. Thank you so much for taking your precious time to join us today."',
 'Here is the most important quote extracted from the text:\n\n"We have very interesting agenda for today."\n\nThis quote stands out as it sets the tone for the meeting and hints at the exciting topics that will be covered.',
 '"What we can see on this slide, this is our IBM Business Value Institute point of view where we are shifting from experimenting to scaling and optimization of governance and innovating all the related use cases."',
 'Here is the most important quote:\n\n"end to end governance program platform, which is consolidated around risk and compliance and model management capabilities."',
 '"We expect in second half of this year, the WatsonX governance standard plan, which is combining risk console and model management capabilities."',
 '"One of the core elements that our clients are looking to drive towards is the idea of being able to manage and understand their co