In [3]:
# Add /app to the module search path so that modules located there can be imported.
import sys
sys.path.append('/app')

In [95]:
!pip install pytube==15.0.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pytube==15.0.0
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0
[0m

In [4]:
# Watch-page URL of the YouTube video this notebook processes.
url = 'https://www.youtube.com/watch?v=oFSyNdQf5uk&ab_channel=LexFridman'

In [5]:
# Helpers from the project's youtube_io module.
from youtube_io import get_transcription, chunk_with_overlap, extract_youtube_video_id

# Extract the video id from the URL, fetch its transcription, then split it into chunks.
# Each chunk is later iterated as a sequence of entries exposing a 'text' key.
video_id = extract_youtube_video_id(url)
transcription = get_transcription(video_id)
# NOTE(review): 200 and 50 are presumably the chunk size and the overlap between
# consecutive chunks — confirm against chunk_with_overlap's signature.
transcription_chunks = chunk_with_overlap(transcription, 200, 50)

In [15]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from tqdm.auto import tqdm

# Chat model shared by every LLM call in this notebook (chunk summarisation and
# question answering); temperature is pinned to 0.
model = ChatOpenAI(model='gpt-3.5-turbo-1106', temperature=0)

  from .autonotebook import tqdm as notebook_tqdm


ChatGPT conversation used to design the prompts below: https://chat.openai.com/share/642ed416-d428-4cf8-9279-8a8de6ec5a71

In [56]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Prompt used to condense a single transcription chunk. The `{transcription}`
# placeholder is filled in by make_prediction with the chunk's joined text.
transcription_summary_template = PromptTemplate.from_template(
"""
Generate a condensed version of the following transcription, adhering strictly to these requirements:

% Requirements:
1. Start directly with the core messages and facts, avoiding any introductory phrases such as "Summary:" or "The speaker discusses...".
2. Exclude narrative fluff, interpretations, or indirect commentary, focusing exclusively on the essential information distilled from the original text.
3. Produce a concise, straightforward summary optimized for analysis and indexing in a vector database, facilitating the construction of a RAG system. The output should seamlessly integrate into database entries without the need for further editing to remove contextual introductions.

Transcription:
{transcription}
"""
)

# One string per chunk: the 'text' field of every entry in the chunk, newline-separated.
joined_texts = [
    '\n'.join(entry['text'] for entry in chunk)
    for chunk in transcription_chunks
]

def make_prediction(j_text):
    """Summarise one chunk of joined transcript text.

    Fills ``transcription_summary_template`` with ``j_text`` and returns
    whatever ``model.predict`` produces for the resulting prompt.
    """
    return model.predict(
        transcription_summary_template.format(transcription=j_text)
    )

# Summarise every chunk concurrently.
# as_completed yields futures in completion order, not submission order, so each
# result is stored under the index of the chunk it belongs to. chunk_summaries is
# then assembled in transcript order, which keeps it deterministic and keeps the
# positional ids assigned later meaningful.
summaries_by_index = {}
with ThreadPoolExecutor() as executor:
    future_to_index = {
        executor.submit(make_prediction, j_text): index
        for index, j_text in enumerate(joined_texts)
    }

    for future in tqdm(as_completed(future_to_index), total=len(joined_texts)):
        try:
            summaries_by_index[future_to_index[future]] = future.result()
        except Exception as exc:
            # Best effort: a failed chunk is reported and left out of the summaries.
            print(f'Generated an exception: {exc}')

# Successful summaries only, ordered by the position of their source chunk.
chunk_summaries = [summaries_by_index[index] for index in sorted(summaries_by_index)]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:10<00:00,  1.79it/s]


In [65]:
# Spot-check a single summary (position 7 in chunk_summaries).
print(chunk_summaries[7])

The protests against Israel have seen a diverse group of people, including Jewish, Muslim, and Indigenous groups, calling for freedom and an end to the genocide in Gaza. Former IDF soldiers have been seen spraying Palestinian protestors with skunk water, causing health issues. The protests are not anti-Semitic, but anti-occupation. There has been a rise in anti-Semitism and anti-Muslim hate in the US. Benjamin Netanyahu has committed himself to the erasure of Palestinian people and land.


In [66]:
import chromadb

# Chroma client backed by on-disk storage at /app/db.
chroma_client = chromadb.PersistentClient(path='/app/db')

In [67]:
# Show which collections the client currently reports.
print(chroma_client.list_collections())

[Collection(name=ofsyndqf5uk)]


In [108]:
from pytube import YouTube

def get_yt_metadata(video_id):
    """Return the title and author of a YouTube video.

    Builds the watch-page URL for ``video_id``, wraps it in a pytube
    ``YouTube`` object and reads its ``title`` and ``author`` attributes.

    Returns:
        dict with the keys ``"title"`` and ``"author"``.
    """
    watch_url = 'https://www.youtube.com/watch?v={}'.format(video_id)
    video = YouTube(watch_url)
    return dict(title=video.title, author=video.author)

# One collection per video: named after the lower-cased video id, with the
# video's title/author passed as collection metadata.
# NOTE(review): get_yt_metadata(video_id) is evaluated on every run of this cell,
# even when the collection already exists — confirm that is intended.
# NOTE(review): lower-casing maps ids that differ only by letter case to the same
# collection name — confirm this cannot mix two videos.
collection = chroma_client.get_or_create_collection(name=video_id.lower(), metadata=get_yt_metadata(video_id))

In [109]:
# Fill the collection only when it holds nothing yet; each summary is stored
# under an id equal to its position in chunk_summaries (as a string).
if collection.count() == 0:
    print(f"The collection for video ID {video_id} is empty.")
    ids = list(map(str, range(len(chunk_summaries))))
    collection.add(ids=ids, documents=chunk_summaries)

In [89]:
def answer_main_question(collection, question, llm, n_results=10):
    """Answer ``question`` from summaries retrieved out of a vector collection.

    The function works in three steps:
      1. ``llm`` rewrites ``question`` into a compact keyword search query.
      2. ``collection`` is queried with that search query for ``n_results`` documents.
      3. ``llm`` answers the original ``question`` given the retrieved documents.

    Args:
        collection: object exposing ``query(query_texts=..., n_results=...)`` whose
            result maps ``'documents'`` to one list of documents per query text.
        question: the user's question, as plain text.
        llm: object exposing ``predict(prompt)``.
        n_results: number of documents to request from ``collection``.

    Returns:
        Whatever ``llm.predict`` returns for the final answering prompt.
    """
    prompt_to_dataquestion_template = PromptTemplate.from_template(
        """
Create a search query from the given prompt, focusing solely on essential keywords and facts. 
This query will be used to retrieve specific information from a database, so it must be concise and packed with relevant terms.

Prompt:
{prompt}
        """
    )

    query = llm.predict(prompt_to_dataquestion_template.format(prompt=question))

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
    )

    # One list of documents comes back per query text; only one query was sent.
    docs = results['documents'][0]

    # Render the documents as a bulleted text block. Interpolating the list
    # itself would put its Python repr (brackets, quotes, escaped characters)
    # into the prompt instead of the summaries' actual text.
    summaries = '\n' + '\n'.join(f'- {doc}' for doc in docs)

    vdb_query_prompt_template = PromptTemplate.from_template(
        """
Giving these summaries from the research {summaries}
Answer the following question: {question}
        """
    )

    prompt = vdb_query_prompt_template.format(
        summaries=summaries,
        question=question,
    )

    return llm.predict(prompt)

In [90]:
# End-to-end check: ask for an overall summary of the video, requesting 10
# stored chunk summaries from the collection as context.
question = "What was video about? Give comprehensive sammary"
answer_main_question(collection, question, model, 10)

"The video was about the ongoing oppression and violence against Palestinians, particularly in the context of the Israeli-Palestinian conflict. It discussed the lack of attention given to the stabbing of a six-year-old Palestinian boy, Wadea, compared to the immediate coverage of other incidents. The video highlighted the hypocrisy in the response to Palestinian lives being lost and criticized the US government's role in funding and supporting Israel's actions. It emphasized the need for the world to act against apartheid and occupation, and called for the US to stop funding the conflict and to remove itself from the peace process. The video also discussed the impact of Islamophobia in the United States, the significance of Masjid Al-Aqsa for Muslims, and the ongoing crisis in Palestine. It addressed the biased media coverage, the plight of Palestinians, and the lack of effective international bodies of justice to hold Israel accountable for its actions. Additionally, it mentioned the 

In [112]:
import streamlit as st

In [113]:
# Experiment: call st.selectbox directly from the notebook kernel to inspect its return value.
select_box = st.selectbox("Select a collection", ['1', '2'])

2024-02-13 17:58:43.027 
  command:

    streamlit run /usr/local/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [117]:
# Check the type of the value st.selectbox returned.
type(select_box)

str

# Get collections

In [1]:
import chromadb

In [2]:
# Chroma client backed by the same on-disk storage path (/app/db) used earlier in the notebook.
chroma_client = chromadb.PersistentClient(path='/app/db')

In [7]:
# Names of all collections reported by the client; the bare expression on the
# last line displays the list as the cell output.
collection_names = [item.name for item in chroma_client.list_collections()]
collection_names

['ofsyndqf5uk']