In [14]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, WebBaseLoader, YoutubeLoader, DirectoryLoader, TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone
from openai import OpenAI
import numpy as np
from dotenv import load_dotenv
import tiktoken
import os

In [15]:
# Load variables from ../../../.env.local into the process environment.
load_dotenv(dotenv_path='../../../.env.local')

# Read each credential; os.getenv yields None when a variable is not set.
pinecone_api_key, openai_api_key, openrouter_api_key, huggingface_api_token = (
    os.getenv(var_name)
    for var_name in (
        'PINECONE_API_KEY',
        'OPENAI_API_KEY',
        'OPENROUTER_API_KEY',
        'HUGGINGFACE_API_TOKEN',
    )
)

In [16]:
# Backend selector read by the later cells that branch between OpenAI and HuggingFace/OpenRouter.
model_choice = 'openai' # 'openai' or 'huggingface'

#### Initialize OpenAI or HuggingFace Client

In [17]:
# Build the embedding model and the chat/embedding client for the selected backend.
if model_choice == 'openai':
    embed_model = "text-embedding-3-small"
    # Pin the LangChain embedder to the same model that the raw OpenAI client uses
    # for query embeddings. Left at the library default, documents would be indexed
    # with a different model than the one used to embed queries, and similarity
    # scores between the two vector spaces are meaningless.
    embeddings = OpenAIEmbeddings(model=embed_model)
    openai_client = OpenAI()
elif model_choice == 'huggingface':
    hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    openrouter_client = OpenAI( # Llama 3.1 API
        base_url="https://openrouter.ai/api/v1",
        api_key=openrouter_api_key
        )
else:
    # Fail here with a clear message instead of a NameError in some later cell.
    raise ValueError(f"Unsupported model_choice: {model_choice!r} (expected 'openai' or 'huggingface')")

  warn_deprecated(


#### Initialize Text Splitter

In [18]:
# Encoding used to measure chunk sizes in tokens rather than characters.
tokenizer = tiktoken.get_encoding('p50k_base')


def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module-level `tokenizer`."""
    # disallowed_special=() disables tiktoken's special-token check for this call.
    return len(tokenizer.encode(text, disallowed_special=()))


# chunk_size / chunk_overlap are counted in tokens because length_function is tiktoken_len.
# Separators are tried in order: paragraph break, line break, space, then single characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=2000,
    chunk_overlap=100,
)

#### Embeddings

In [19]:
def get_embedding(text):
    """Return the embedding vector for `text` from the backend selected by `model_choice`.

    Args:
        text: The string to embed.

    Returns:
        The embedding produced by the active backend: the first item of the
        OpenAI embeddings response, or the result of `hf_embeddings.embed_query`.

    Raises:
        ValueError: If `model_choice` is neither 'openai' nor 'huggingface'.
    """
    if model_choice == 'openai':
        # Use the shared `embed_model` setting so this helper cannot drift from
        # the model configured when the OpenAI client was initialised.
        response = openai_client.embeddings.create(input=text, model=embed_model)
        return response.data[0].embedding
    if model_choice == 'huggingface':
        return hf_embeddings.embed_query(text)

    # Previously this fell through to `return embedding` with the name unbound.
    raise ValueError(f"Unsupported model_choice: {model_choice!r} (expected 'openai' or 'huggingface')")

def cosine_similarity_between_words(word1, word2):
    """Embed two texts, print both embeddings, and return their cosine similarity.

    Args:
        word1: First text to embed.
        word2: Second text to embed.

    Returns:
        The single similarity score taken from the 1x1 result matrix.
    """
    # cosine_similarity expects 2-D inputs, so each vector becomes a one-row matrix.
    vec_a, vec_b = (
        np.array(get_embedding(item)).reshape(1, -1)
        for item in (word1, word2)
    )

    print("Embedding for Word 1:", vec_a)
    print("\nEmbedding for Word 2:", vec_b)

    score_matrix = cosine_similarity(vec_a, vec_b)
    return score_matrix[0][0]


# Example usage: compare two short greetings.
sentence1, sentence2 = "Great to finally meet!", "Nice to meet you"

similarity = cosine_similarity_between_words(sentence1, sentence2)
print(f"\n\nCosine similarity between '{sentence1}' and '{sentence2}': {similarity:.4f}")

Embedding for Word 1: [[-0.00409901 -0.02935333 -0.06173278 ...  0.0112379  -0.00698414
  -0.00348519]]

Embedding for Word 2: [[-0.00058948 -0.05813902 -0.06282136 ...  0.01669302 -0.01365682
  -0.01385192]]


Cosine similarity between 'Great to finally meet!' and 'Nice to meet you': 0.6680


#### Load YouTube video and get transcript

In [20]:
# Load the video's transcript as documents; add_video_info=True also attaches
# video metadata (title, author, ...) that later cells read from `metadata`.
loader = YoutubeLoader.from_youtube_url("https://www.youtube.com/watch?v=ArcI4A5nvBo", add_video_info=True)
data = loader.load()

# Dump the loaded documents for inspection.
print(data)

[Document(metadata={'source': 'ArcI4A5nvBo', 'title': 'iOS 18 Hands-On: Top 5 Features!', 'description': 'Unknown', 'view_count': 5754471, 'thumbnail_url': 'https://i.ytimg.com/vi/ArcI4A5nvBo/hq720.jpg', 'publish_date': '2024-07-15 00:00:00', 'length': 766, 'author': 'Marques Brownlee'}, page_content='(shapes springing)\n(groovy hip-hop music) - All right, it\'s mid-2024,\nso you know what that means, another new version of iOS\ncoming to an iPhone near you. So, I\'ve been testing the\nnewest version of iOS 18 on my iPhone for the\npast couple weeks now. It\'s been in beta, and now\nthat the public beta\'s out, you too can test it on an iPhone before it comes out for\neveryone in September. But there is some interesting\nstuff in this version. Now, last year, I\nremember talking about how almost every single one\nof those new features was super ecosystem-based. Like, they would depend on\nhaving a friend with an iPhone or having some other\nApple device in your life, but it\'s kinda th

In [21]:
# Break the loaded documents into chunks using the token-based splitter configured earlier.
texts = text_splitter.split_documents(data)

In [22]:
# Display the chunked documents (the notebook renders the last expression).
texts

[Document(metadata={'source': 'ArcI4A5nvBo', 'title': 'iOS 18 Hands-On: Top 5 Features!', 'description': 'Unknown', 'view_count': 5754471, 'thumbnail_url': 'https://i.ytimg.com/vi/ArcI4A5nvBo/hq720.jpg', 'publish_date': '2024-07-15 00:00:00', 'length': 766, 'author': 'Marques Brownlee'}, page_content="(shapes springing)\n(groovy hip-hop music) - All right, it's mid-2024,\nso you know what that means, another new version of iOS\ncoming to an iPhone near you. So, I've been testing the\nnewest version of iOS 18 on my iPhone for the\npast couple weeks now. It's been in beta, and now\nthat the public beta's out, you too can test it on an iPhone before it comes out for\neveryone in September. But there is some interesting\nstuff in this version. Now, last year, I\nremember talking about how almost every single one\nof those new features was super ecosystem-based. Like, they would depend on\nhaving a friend with an iPhone or having some other\nApple device in your life, but it's kinda the opp

#### Insert data in Pinecone

In [23]:
# Namespace passed to the later insert and query calls.
namespace = "youtube-videos"

# Each backend has its own index, paired with the embedder that fills it.
if model_choice == 'openai':
    index_name = "youtube-video-chatbot-openai"
    e = embeddings
elif model_choice == 'huggingface':
    index_name = "youtube-video-chatbot-huggingface"
    e = hf_embeddings

# Vector-store handle bound to the chosen index and embedder.
vectorstore = PineconeVectorStore(embedding=e, index_name=index_name)

In [24]:
# Show every chunk (metadata, then text) fenced above and below by a dashed separator.
for document in texts:
    print(
        "\n\n\n\n----",
        f"{document.metadata} {document.page_content}",
        '\n\n\n\n----',
        sep="\n",
    )





----
{'source': 'ArcI4A5nvBo', 'title': 'iOS 18 Hands-On: Top 5 Features!', 'description': 'Unknown', 'view_count': 5754471, 'thumbnail_url': 'https://i.ytimg.com/vi/ArcI4A5nvBo/hq720.jpg', 'publish_date': '2024-07-15 00:00:00', 'length': 766, 'author': 'Marques Brownlee'} (shapes springing)
(groovy hip-hop music) - All right, it's mid-2024,
so you know what that means, another new version of iOS
coming to an iPhone near you. So, I've been testing the
newest version of iOS 18 on my iPhone for the
past couple weeks now. It's been in beta, and now
that the public beta's out, you too can test it on an iPhone before it comes out for
everyone in September. But there is some interesting
stuff in this version. Now, last year, I
remember talking about how almost every single one
of those new features was super ecosystem-based. Like, they would depend on
having a friend with an iPhone or having some other
Apple device in your life, but it's kinda the opposite with these. They're all just ba

In [25]:
# Pick the embedder that matches the selected backend.
if model_choice == 'openai':
    e = embeddings
elif model_choice == 'huggingface':
    e = hf_embeddings

# Text stored (and embedded) for each chunk: a source/title header followed by the chunk itself.
chunk_texts = [
    f"Source: {t.metadata['source']}, Title: {t.metadata['title']} \n\nContent: {t.page_content}"
    for t in texts
]

# Deterministic ids make this cell idempotent: re-running it overwrites the same
# vectors instead of inserting another copy of every chunk under new ids.
chunk_ids = [f"{t.metadata['source']}-{i}" for i, t in enumerate(texts)]

vectorstore_from_texts = PineconeVectorStore.from_texts(
    chunk_texts,
    e,
    ids=chunk_ids,
    index_name=index_name,
    namespace=namespace,
)

#### Perform RAG

In [26]:
# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=pinecone_api_key)

# Handle to the index selected for the active backend; queried directly below.
pinecone_index = pc.Index(index_name)

In [27]:
# Question to answer from the indexed content.
query = "What is this video about?"

In [28]:
# Embed the question with the SAME model that produced the indexed vectors;
# otherwise the nearest-neighbour search compares unrelated vector spaces.
if model_choice == 'openai':
    raw_query_embedding = openai_client.embeddings.create(input=[query], model="text-embedding-3-small")
    query_embedding = raw_query_embedding.data[0].embedding
elif model_choice == 'huggingface':
    # `openai_client` does not exist in this mode, and the index was filled via
    # `hf_embeddings`, so the query must go through that same embedder.
    query_embedding = hf_embeddings.embed_query(query)
else:
    raise ValueError(f"Unsupported model_choice: {model_choice!r} (expected 'openai' or 'huggingface')")

In [29]:
# Fetch up to 10 nearest vectors from the namespace, including their stored metadata.
top_matches = pinecone_index.query(vector=query_embedding, top_k=10, include_metadata=True, namespace=namespace)

In [30]:
# Pull the chunk text out of each match's metadata ('text' key); this list becomes the LLM prompt context.
contexts = [item['metadata']['text'] for item in top_matches['matches']]

In [31]:
# Prompt body: the retrieved chunks separated by dashed rules, then the question.
# The [:10] slice keeps at most ten chunks, the same number requested via top_k=10.
augmented_query = "\n" + "\n\n-------\n\n".join(contexts[ : 10]) + "\n-------\n\n\n\n\nMY QUESTION:\n" + query

In [32]:
# System prompt framing the assistant's role (plain string: there is nothing to interpolate).
primer = """You are a personal assistant. Answer any questions I have about the YouTube Video provided. 
You always answer questions based on the information provided in the video.
"""

# The conversation is identical for both backends: system primer, then context + question.
messages = [
    {"role": "system", "content": primer},
    {"role": "user", "content": augmented_query},
]

if model_choice == 'openai':
    res = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
    )
elif model_choice == 'huggingface':
    res = openrouter_client.chat.completions.create(
        model="meta-llama/llama-3.1-8b-instruct:free",
        messages=messages,
    )
else:
    # Without this branch `res` would be unbound and the next line would raise NameError.
    raise ValueError(f"Unsupported model_choice: {model_choice!r} (expected 'openai' or 'huggingface')")

answer = res.choices[0].message.content

In [33]:
# Display the model's reply.
answer

"The video is a hands-on review of the top five features of iOS 18, which is the latest version of Apple's operating system for iPhones. The presenter discusses their experience testing the public beta of iOS 18 and highlights features that are focused on enhancing the iPhone experience, rather than being dependent on other Apple devices. \n\nThe five key features discussed include:\n\n1. **New Control Center** - A more colorful and paginated Control Center with customizable controls and new options for managing media playback.\n\n2. **Passwords App** - A standalone app that organizes and simplifies password management, including options for two-factor authentication and pass keys.\n\n3. **Home Screen Customization** - New options for arranging app icons freely on the home screen and customizing themes, although there are some challenges with the new tinting features.\n\n4. **Various Small Enhancements** - Features that improve user experience, such as game mode for gaming performance,

#### Putting it all together

In [34]:
def perform_rag_openai(query, top_k=10):
    """Answer `query` with retrieval-augmented generation over the Pinecone index.

    The query is embedded with OpenAI, the nearest chunks are fetched from
    `pinecone_index` in `namespace`, and the chunk texts plus the question are
    sent to the chat model.

    Args:
        query: The user's question; must be a non-blank string.
        top_k: Maximum number of chunks to retrieve as context (default 10).

    Returns:
        The text content of the first chat completion choice.

    Raises:
        ValueError: If `query` is not a string or contains only whitespace.
    """
    # Reject unusable input before spending any API calls on it.
    if not isinstance(query, str) or not query.strip():
        raise ValueError("query must be a non-empty string")

    raw_query_embedding = openai_client.embeddings.create(
        input=query,
        model="text-embedding-3-small"
    )
    query_embedding = raw_query_embedding.data[0].embedding

    top_matches = pinecone_index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )

    # Get the list of retrieved texts (already capped at top_k by the query itself)
    contexts = [item['metadata']['text'] for item in top_matches['matches']]

    augmented_query = "\n" + "\n\n-------\n\n".join(contexts) + "\n-------\n\n\n\n\nMY QUESTION:\n" + query

    # Modify the prompt below as needed to improve the response quality
    system_prompt = """You are an expert personal assistant. Answer any questions I have about the Youtube Video provided. You always answer questions based only on the context that you have been provided.
    """

    res = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": augmented_query}
        ]
    )

    return res.choices[0].message.content


In [35]:
# Run the end-to-end helper on a sample question and display its answer.
perform_rag_openai("What is this video about?")

'The video titled "iOS 18 Hands-On: Top 5 Features!" discusses the new features of the iOS 18, which is set to be released for iPhones in September 2024 after being in beta testing. The creator shares their experience of using the new version and highlights five key features of iOS 18, which include:\n\n1. **New Control Center** - A more colorful and paginated Control Center with enhanced controls and customization options.\n2. **Passwords App** - A new app that consolidates password management previously found in settings, allowing for easier access and sharing.\n3. **Home Screen Customization** - Enhanced capabilities for arranging app icons and new options for theme settings (dark/light modes).\n4. **Helpful Smaller Features** - Includes improvements in the Photos app, game mode for better performance during gaming, and Shazam integration.\n5. **Calculator with Math Notes** - A new calculator feature that allows users to write equations by hand and get solutions dynamically, along w

#### RAG on PDF

In [None]:
# NOTE: "/content/..." is a Colab-style absolute path; point it at a PDF that exists in your environment.
loader = PyPDFLoader("/content/Harry Potter and the Sorcerers Stone.pdf") # Insert the path to a PDF here
data = loader.load()

# Print a summary rather than the full text of every page, which would flood the output.
print(f"Loaded {len(data)} page documents")

text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=100,
        length_function=tiktoken_len,
        separators=["\n\n", "\n", " ", ""]
    )

texts = text_splitter.split_documents(data)

# Insert all the chunks from the PDF into Pinecone.
# - A PDF document may have no 'title' in its metadata, so fall back to the file
#   name instead of raising KeyError.
# - Use `e`, the embedder selected for the active backend, so the vectors match
#   the index they are written to (a hard-coded `embeddings` only exists in OpenAI mode).
vectorstore_from_texts = PineconeVectorStore.from_texts(
    [
        f"Source: {t.metadata['source']}, Title: {t.metadata.get('title') or os.path.basename(t.metadata['source'])} \n\nContent: {t.page_content}"
        for t in texts
    ],
    e,
    index_name=index_name,
    namespace=namespace,
)

# After this, all the code is the same from the Perform RAG section of this notebook
# Since the data from the PDF is now stored in Pinecone, you can perform RAG over it the same way as the YouTube video