In [1]:
import os
from dotenv import load_dotenv

# Load settings from a .env file (if one is found) into the process
# environment; the OPENAI_API_KEY lookup below reads from that environment.
load_dotenv()

# API key for the OpenAI-backed path; None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Model to run. Names starting with "gpt" select the OpenAI classes further
# down; any other name selects the Ollama classes.
MODEL = "phi3"
# Alternatives kept for quick switching:
# MODEL = "mixtral:8x7b"
# MODEL = "llama2"

# Video whose audio track gets transcribed.
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=al5AUq_bxMc"

In [2]:
from langchain_community.llms import Ollama
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.embeddings import OllamaEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings

# Choose the chat model and the embedding model from the model name.
if not MODEL.startswith("gpt"):
    # Ollama path: the same MODEL is used for generation and for embeddings.
    model = Ollama(model=MODEL)
    embeddings = OllamaEmbeddings(model=MODEL)
else:
    # OpenAI path: the chat model gets the key explicitly.
    model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)
    # NOTE(review): no key is passed here; presumably it is picked up from
    # the OPENAI_API_KEY environment variable -- confirm.
    embeddings = OpenAIEmbeddings()



In [3]:
from langchain_core.output_parsers import StrOutputParser

# Output parser used as the last step of every chain in this notebook.
parser = StrOutputParser()

# Minimal chain: the model followed by the parser.
# NOTE: `chain` is rebound several times in later cells.
chain = model | parser 

In [4]:
from langchain.prompts import ChatPromptTemplate

# Prompt that tells the model to answer from the supplied context only and
# to reply "I don't know" otherwise. {context} and {question} are the two
# placeholders filled in on every call.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
# Render the prompt once with sample values to inspect the resulting text.
prompt.format(context="Reema has two daughters sita and gita", question="Who is sita's sister?")

'Human: \nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Reema has two daughters sita and gita\n\nQuestion: Who is sita\'s sister?\n'

In [5]:
# Rebind `chain`: fill the prompt, send it to the model, parse the reply.
chain = prompt | model | parser
# Sanity check with a hand-written context; the input dict supplies both
# template placeholders.
chain.invoke({"context":"Reema has two daughters sita and gita", "question":"Who is sita's sister?"})

'Gita'

In [6]:
import subprocess
import os
import whisper
import tempfile

# NOTE(review): this repeats the YOUTUBE_VIDEO assignment from the first cell
# (same URL); consider keeping a single definition.
YOUTUBE_VIDEO = 'https://www.youtube.com/watch?v=al5AUq_bxMc'

# Function to download audio using yt-dlp
def download_audio(url, output_path):
    """Extract the audio of ``url`` as MP3 by running the ``yt-dlp`` CLI.

    Args:
        url: Address of the video to fetch.
        output_path: Value handed to yt-dlp's ``-o`` option.

    Returns:
        None. yt-dlp's captured stdout is printed on success.

    Raises:
        subprocess.CalledProcessError: yt-dlp exited with a non-zero status.
            Its captured stderr is printed before the error is re-raised.
    """
    command = ['yt-dlp', '-x', '--audio-format', 'mp3', '-o', output_path, url]
    try:
        # check=True turns a non-zero exit status into CalledProcessError;
        # both output streams are captured as text rather than echoed live.
        completed = subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print("yt-dlp error output:", e.stderr)
        raise
    print("yt-dlp output:", completed.stdout)

# Downloading and transcribing is skipped when a transcript is already on disk.
if os.path.exists("transcription.txt"):
    print("Transcription file already exists.")
else:
    # The audio only needs to live until it has been transcribed, so it is
    # written into a temporary directory that is removed afterwards.
    with tempfile.TemporaryDirectory() as scratch_dir:
        audio_path = os.path.join(scratch_dir, "audio.mp3")
        try:
            download_audio(YOUTUBE_VIDEO, audio_path)

            # The "base" Whisper model is fast but not the most accurate one.
            whisper_model = whisper.load_model("base")
            whisper_output = whisper_model.transcribe(audio_path, fp16=False)
            transcription = whisper_output["text"].strip()

            with open("transcription.txt", "w") as transcript_file:
                transcript_file.write(transcription)

            print("Transcription saved to transcription.txt")
        except subprocess.CalledProcessError:
            # Only a failed yt-dlp run is handled here; download_audio has
            # already printed yt-dlp's stderr at this point.
            print("Failed to download and process the audio.")


    


Transcription file already exists.


In [9]:
# Read the whole transcript file back into memory.
with open("transcription.txt") as file:
    transcription = file.read()

# Show only the first 50 characters as a quick check.
transcription[:50]

'Hi guys, today we are going to look at a leadcode '

In [10]:
from langchain_community.document_loaders import TextLoader

# Load the transcript file again, this time through LangChain's TextLoader.
# The displayed result is a one-element list holding a Document whose
# metadata records the source file name.
loader = TextLoader("transcription.txt")
text_documents = loader.load()
text_documents

[Document(metadata={'source': 'transcription.txt'}, page_content='Hi guys, today we are going to look at a leadcode problem contains duplicate. In this problem we have an integer array numbs and we need to find whether that integer array contains any duplicate value or not. If it contains any duplicate value we need to return true, else we need to return false. To solve this problem we are going to use a set of integer. Set basically have a property to contain only non distinct value and the search time of a set is big of one. So it will give the constant search time. So let us iterate over this integer array and here we are going to check if set contains this value n then we will return false. Otherwise we will simply add the element into the set. It does not contain any return value so from here we will return false. So here we need to return true if it contains a duplicate then we are returning true if not then we are returning false. Let us try to run. Submit.')]

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded document(s) into overlapping chunks for embedding.
# NOTE(review): chunk_size=50 / chunk_overlap=10 (presumably counted in
# characters -- confirm against the splitter's length function) leaves only a
# few words per chunk. The answers printed by the retrieval chains further
# down drift away from the transcript, so a larger chunk_size is worth trying.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
documents = text_splitter.split_documents(text_documents)

In [15]:
from langchain_community.vectorstores import DocArrayInMemorySearch

# Build the vector store from the chunks, using the `embeddings` object
# selected together with the model.
vectorstore = DocArrayInMemorySearch.from_documents(documents, embeddings)




In [29]:
# Retriever view of the vector store; this is what the chains below pipe
# the question into to obtain the context.
retriever = vectorstore.as_retriever()

In [32]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Retrieval chain that takes the question as a plain string.
#
# The mapping at the head of the chain hands the value passed to `invoke`,
# unchanged, to both branches: `retriever` uses it as the search query and
# RunnablePassthrough forwards it into the prompt's {question} slot. The
# input must therefore be the question text itself. Wrapping it in a dict
# would send that dict to the retriever and render the dict's repr in the
# prompt.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)
result = chain.invoke("What is this video about?")
print(result)

The video discusses a method to check if an attempt to submit something includes duplicates and the associated lead code issue that arises when there are big sets involved, potentially causing problems with submission timing. The focus seems to be on troubleshooting or identifying issues within these procedures of submitting leads containing codes in large datasets.


In [33]:
from operator import itemgetter

# Retrieval chain that takes a dict input of the form {"question": ...}.
# The question is pulled out of the dict once per branch: piped into the
# retriever to produce the context, and passed as-is into the prompt.
fetch_question = itemgetter("question")

chain = (
    {"context": fetch_question | retriever, "question": fetch_question}
    | prompt
    | model
    | parser
)

result = chain.invoke({"question": "What is this video about?"})
print(result)

This YouTube video seems to be explaining how set times are calculated for leads in music or performance settings based on a code they call "leadcode." They mention that if certain conditions met by the leadtime value, it returns true (presumably indicating acceptable timing), otherwise false. However, I need more information about 'leadcode' and its calculations to give an accurate description of this video content.
