In [None]:
!pip install google
!pip install beautifulsoup4

In [5]:
from googlesearch import search
import requests
import re
import os
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [2]:
async def scraper(url):
    #  Scrapy Selenium
    def removeElement(soup,element: str):
        elements = soup.find_all(element)
        if len(elements)==0:
            return soup
        for div in elements:
            div.decompose()
        return soup

    def removeElementsWithClass(soup,class_:str):
        class_regex = re.compile(rf"{class_}")
        elements = soup.find_all(class_=class_regex)
        if len(elements)>0:
            for tag in elements:
                tag.decompose()
        element = soup.find(id=class_regex)
        if element:
            element.decompose()
        return soup

    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to retrieve webpage")
        return

    soup = BeautifulSoup(response.content, 'html.parser').find('body')
    soup = removeElement(soup,'nav')
    soup = removeElement(soup,'footer')
    soup = removeElementsWithClass(soup,'menu')
    soup = removeElementsWithClass(soup,'nav')
    soup = removeElementsWithClass(soup,'footer')
    # for heading in soup.find_all(['h1','h2']):
    #     print(heading)
    # for para in soup.find_all('p'):
    #     print(para)
    return soup

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

In [4]:
def createChunking(website):
    splitter = RecursiveCharacterTextSplitter.from_language(chunk_size=1024, chunk_overlap=30,language=Language.PYTHON)
    chunks = splitter.split_documents(website)
    print(chunks)
    return chunks


In [5]:
top_k_websites = 2

In [6]:
query = input('Enter your query')

In [7]:

response = search(query=query,num=top_k_websites,stop=top_k_websites)
pages = []
for i, result in enumerate(response, start=1):
    pages.append(await scraper(result))

https://www.elastic.co/what-is/large-language-models
https://www.elastic.co/what-is/natural-language-processing


In [7]:
DATA_PATH = 'scraped.txt'

In [68]:
with open(DATA_PATH,'w') as file:
    for page in pages:
        file.write(page.text)

In [6]:
API_KEY = os.environ['OPENAI_API_KEY']

In [9]:
doc = TextLoader(DATA_PATH).load()

In [10]:
doc

[Document(page_content='Skip to main contentWhat is a large language model (LLM)?Explore popular open-source LLMsLarge language model definitionA large language model (LLM) is a deep learning algorithm that can perform a variety of natural language processing (NLP) tasks. Large language models use transformer models and are trained using massive datasets — hence, large. This enables them to recognize, translate, predict, or generate text or other content.Large language models are also referred to as neural networks (NNs), which are computing systems inspired by the human brain. These neural networks work using a network of nodes that are layered, much like neurons.In addition to teaching human languages to artificial intelligence (AI) applications, large language models can also be trained to perform a variety of tasks like understanding protein structures, writing software code, and more. Like the human brain, large language models must be pre-trained and then fine-tuned so that they 

In [11]:
embedder = OpenAIEmbeddings(api_key=API_KEY)

In [None]:
chunks = createChunking(doc)

In [None]:
chunks

In [14]:
db = Chroma.from_documents(chunks, embedder,persist_directory='./database')

In [15]:
db.persist()

In [None]:
db = Chroma(persist_directory=DATA_PATH,embedding_function=embedder)

In [16]:
context = db.similarity_search_with_relevance_scores(query,k=5)

In [17]:
print(context)

[(Document(page_content='Skip to main contentWhat is a large language model (LLM)?Explore popular open-source LLMsLarge language model definitionA large language model (LLM) is a deep learning algorithm that can perform a variety of natural language processing (NLP) tasks. Large language models use transformer models and are trained using massive datasets — hence, large. This enables them to recognize, translate, predict, or generate text or other content.Large language models are also referred to as neural networks (NNs), which are computing systems inspired by the human brain. These neural networks work using a network of nodes that are layered, much like neurons.In addition to teaching human languages to artificial intelligence (AI) applications, large language models can also be trained to perform a variety of tasks like understanding protein structures, writing software code, and more. Like the human brain, large language models must be pre-trained and then fine-tuned so that they

In [18]:
from langchain.prompts import ChatPromptTemplate
prompt_template = ChatPromptTemplate.from_template(
"""
Answer the question based only on the following context:
{context}

Answer the question based only on the above context: {query}
""")    

In [19]:
prompt = prompt_template.format(context=context,query=query)

In [None]:
print(prompt)

In [21]:
model = ChatOpenAI(api_key=API_KEY)

In [22]:
result = model.predict(prompt)

  warn_deprecated(


In [23]:
print(result)

A large language model (LLM) is a deep learning algorithm that can perform a variety of natural language processing (NLP) tasks. Large language models use transformer models and are trained using massive datasets, enabling them to recognize, translate, predict, or generate text or other content. They are also referred to as neural networks (NNs) and can be trained to perform tasks beyond language processing, such as understanding protein structures or writing software code.
