## Working Test

In [None]:
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

load_dotenv()

# Loading The PDF File and Splitting it into Pages
loader = PyPDFLoader("2205.15868v1-CogVideo-Large-scale Pretraining for Text-to-Video.pdf")
pages = loader.load_and_split()

# Chunking the Pages into fixed size chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
documents = text_splitter.split_documents(pages)

# Converting the documents into embeddings and storing them in a FAISS Vector Store
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectordb = FAISS.from_documents(documents, embedding=embedding)
store_name = loader.source[:-4]

query = ("What is the main idea of the paper? WHat are the math formulas used in this paper")

docs = vectordb.similarity_search(query=query, k=5)
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")
chain = load_qa_chain(llm=llm, chain_type="stuff")
response = chain.run(input_documents=docs, question=query)
print(response)

In [None]:
# # CHAINS
# https://python.langchain.com/v0.1/docs/modules/chains/

# # DOCUMENT LOADERS
# https://python.langchain.com/v0.2/docs/integrations/document_loaders/


## Testing Class

In [2]:
from langchain_community.document_loaders import (PyPDFLoader, 
                                                  Docx2txtLoader, 
                                                  UnstructuredExcelLoader, 
                                                  UnstructuredPowerPointLoader,
                                                  TextLoader,
                                                  CSVLoader,
                                                  YoutubeLoader)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI 
import os
import chromadb
files = [
"files/2205.15868v1-CogVideo-Large-scale Pretraining for Text-to-Video.pdf",
'files/IM-Report.docx',
"files/UNIT-2.pptx",
"files/Hperverge OT list final.xlsx",
"files/hey.txt",
"files/test.md",
]
file_path = files[2]
print(file_path)

class Model:
    def __init__(self, llm, embeddings):
        self.llm = llm
        self.embeddings = embeddings


class ScanDocuments:
    def __init__(self):
        pass
    
    def upload_single_file(self, path):
        if path.endswith(".pdf"):
            loader = PyPDFLoader(path)
        elif path.endswith(".docx"):
            loader = Docx2txtLoader(path)
        elif path.endswith(".xlsx"):
            loader = UnstructuredExcelLoader(path, encoding = 'UTF-8')
        elif path.endswith(".pptx"):
            loader = UnstructuredPowerPointLoader(path, encoding = 'UTF-8')
        elif path.endswith(".txt") or path.endswith(".md"):
            loader = TextLoader(path, encoding = 'UTF-8')
        elif path.endswith(".csv"):
            loader = CSVLoader(path, encoding = 'UTF-8')
        data = loader.load_and_split()
        return data
    
    def upload_url(self, url):
        if "www.youtube.com" in url:
            loader =  YoutubeLoader.from_youtube_url(url,add_video_info=True)
        data = loader.load_and_split()
        # elif url.
        return data
        
    def upload_zip_file(self):
        pass
    def process_document(self, data):
        doc_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
        documents = doc_splitter.split_documents(data)
        return documents


class VectorStore:
    def __init__(self):
        self.vectordb = None

    def create_vector_store(self, embedding):
        if os.path.exists("./chroma_vectordb"):
            vectordb = Chroma(persist_directory="./chroma_vectordb", embedding_function=embedding)
        else:
            vectordb = Chroma(collection_name="test",
                              embedding_function=embedding,
                              persist_directory="./chroma_vectordb")
        self.vectordb = vectordb
        return self.vectordb

    def add_documents(self, documents):
        self.vectordb.add_documents(documents)

    def as_retriever(self):
        retriever = self.vectordb.as_retriever()
        return retriever
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# Google Drive: [https://python.langchain.com/v0.2/docs/integrations/document_loaders/google_drive]
# BibTeX: [https://python.langchain.com/v0.2/docs/integrations/document_loaders/bibtex]
# ArxivLoader, []


files/UNIT-2.pptx


## Sample test

In [None]:
url = "https://www.youtube.com/watch?v=0AW6tWTRLeU"
uploader = ScanDocuments()
data = uploader.upload_url(url)
documents = uploader.process_document(data)

# -----------------------------------------------------------------------
# -----------------------------------------------------------------------
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")
model = Model(llm, embedding)

# -----------------------------------------------------------------------
# -----------------------------------------------------------------------
vector_store = VectorStore().create_vector_store(model.embeddings)
vector_store.add_documents(documents)
retriever = vector_store.as_retriever()


In [None]:
query = "What does Shayne say about Garlic Naan"

qa_chain = RetrievalQA.from_chain_type(
    llm = model.llm,
    chain_type = "stuff",
    retriever = retriever,
    return_source_documents = True,
)
qa_chain.invoke(query)

In [None]:
from langchain_core.documents import Document

base_description_doc = documents[0]

title = base_description_doc.metadata.get("title")
description = base_description_doc.metadata.get("description")
author = base_description_doc.metadata.get("author")
date = base_description_doc.metadata.get("date")
view_count = base_description_doc.metadata.get("view_count")

summary_doc = Document(
    metadata = base_description_doc.metadata,
    page_content = f"""This is a Youtube video Titled: {title}.
    This video was created by the Channel {author} on {date}. 
    The video has {view_count} views. 
    The Description of the video is: {description}""",
)
summary_doc

new_documents = [summary_doc] + documents
new_documents

## Search

In [3]:
import requests
from langchain_core.documents import Document
jina_search_url = "https://s.jina.ai/"
search_query = 'What is the Temperature in Jaipur Today?'
response = requests.get(jina_search_url+search_query)

uploader = ScanDocuments()
text = Document(metadata={'source': '0AW6tWTRLeU'},page_content=str(response.text))
documents = uploader.process_document([text])
documents

 Document(metadata={'source': '0AW6tWTRLeU'}, page_content='*   [India Today](https://www.indiatoday.in/ "India Today")\n*   [Aaj Tak](https://www.aajtak.in/ "Aaj Tak")\n*   [GNTTV](https://www.gnttv.com/ "GNTTV")\n*   [Lallantop](https://www.thelallantop.com/ "Lallantop")\n*   [Business Today](https://www.businesstoday.in/ "Business Today")\n*   [Bangla](https://bangla.aajtak.in/ "Bangla")\n*   [Malayalam](https://malayalam.indiatoday.in/ "Malayalam")\n*   [Northeast](https://www.indiatodayne.in/ "Northeast")\n*   [BT Bazaar](https://bazaar.businesstoday.in/ "BT Bazaar")\n*   [Harper\'s Bazaar](https://www.harpersbazaar.in/ "Harper\'s Bazaar")\n*   [Sports Tak](https://thesportstak.com/ "Sports Tak")\n*   [Crime Tak](https://www.crimetak.in/ "Crime Tak")\n*   [Astro Tak](https://www.astrotak.com/ "Astro Tak")\n*   [Gaming](https://www.indiatodaygaming.com/ "Gaming")\n*   [Brides Today](https://www.bridestoday.in/ "Brides Today")\n*   [Cosmopolitan](https://www.cosmopolitan.in/ "Cosmop

In [None]:
# -----------------------------------------------------------------------
# -----------------------------------------------------------------------
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")
model = Model(llm, embedding)

# -----------------------------------------------------------------------
# -----------------------------------------------------------------------
vector_store = VectorStore().create_vector_store(model.embeddings)
vector_store.add_documents(documents)
retriever = vector_store.as_retriever()

In [4]:
qa_chain = RetrievalQA.from_chain_type(
    llm = model.llm,
    chain_type = "stuff",
    retriever = retriever,
    return_source_documents = True,
)
qa_chain.invoke(search_query)

NameError: name 'model' is not defined

In [None]:
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI 

# documents =
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


vectordb = Chroma.from_documents(documents, 
                                 embedding=embedding,
                                 persist_directory="chroma"
                                 )

retriever = vectordb.as_retriever(
    search_type = "similarity",
    search_kwargs = {'k':3})

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = vectordb.as_retriever(),
    # return_source_documents = True,
)

query = "What is video about?"
docs = retriever.get_relevant_documents(query)
qa_chain.invoke(query, documents = docs)

In [None]:
# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------
# from langchain.vectorstores import Chroma
# from langchain.chains import RetrievalQA

# vectordb  = FAISS.from_documents(documents, embedding=embedding)
# qa_chain = RetrievalQA.from_chain_type(
#     llm = llm,
#     chain_type = "stuff",
#     retriever = vectordb.as_retriever(),
# )

# query = "What is the main idea of the paper? WHat are the math formulas used in this paper"	
# results = qa_chain.invoke(query)
# vectordb.save_local("vectordb")


# vectordb.delete_collection()
# qa_chain.combine_documents_chain_.llm_chain.prompt.messages[0].prompt.template

In [None]:
from langchain_community.document_loaders import YoutubeLoader

youtube_url = "https://www.youtube.com/watch?v=QsYGlZkevEg"
loader = YoutubeLoader.from_youtube_url(youtube_url,add_video_info=True)
# transcript = loader.load()
# transcript


In [None]:
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')