# idea

1. Google search
2. Fetch and extract each web page with trafilatura
3. Split the web page into chunks
4. Embed the web page chunks
5. Rank the chunks by similarity to the query
6. Q&A with an LLM



In [None]:
!pip install sentence-transformers trafilatura

In [2]:
import os


def load_env_file(path=".env"):
    """Load ``KEY=VALUE`` pairs from *path* into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without an ``=`` are
    skipped. Only the first ``=`` separates key from value, so values that
    themselves contain ``=`` (e.g. base64 tokens) are kept intact. Keys and
    values are stripped of surrounding whitespace.

    The variables are merged into the existing ``os.environ`` mapping rather
    than replacing it with a plain ``dict``, so later assignments to
    ``os.environ`` keep propagating to the process environment.

    Args:
        path: Location of the env file. A missing file is treated as
            "nothing to load" instead of raising.

    Returns:
        A dict with the variables that were read from the file.
    """
    loaded = {}
    if not os.path.isfile(path):
        return loaded

    with open(path, "r", encoding="utf-8") as env_file:
        for raw_line in env_file:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            key = key.strip()
            if not key:
                # An empty name is not a valid environment variable.
                continue
            loaded[key] = value.strip()

    os.environ.update(loaded)
    return loaded


load_env_file(".env")

In [79]:
from langchain.chains.base import Chain
from langchain.utilities import GoogleSearchAPIWrapper
from trafilatura import fetch_url, extract
from trafilatura.settings import ConfigParser, use_config
from typing import Dict, List
from langchain.text_splitter import CharacterTextSplitter
from transformers import AutoTokenizer
from langchain.schema import Document
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np 

from duckduckgo_search import ddg

def dgg_search(query, region='en-us', safesearch='On', time='m', num_results=3):
    """Run a DuckDuckGo search through ``ddg`` and return its result as-is.

    ``num_results`` is forwarded to ``ddg`` as ``max_results``; every other
    argument is passed through under its own name.
    """
    return ddg(
        query,
        region=region,
        safesearch=safesearch,
        time=time,
        max_results=num_results,
    )


# Process-wide caches keyed by model id, so the tokenizer and the cross-encoder
# are loaded once instead of on every property access (i.e. on every query).
_TOKENIZER_CACHE: Dict[str, object] = {}
_CROSS_ENCODER_CACHE: Dict[str, object] = {}


class SearchWebsiteChainWithEncoder(Chain):
    """Chain that searches the web and returns the passages best matching a query.

    Steps performed by ``_call``:
      1. search with ``dgg_search``,
      2. download each hit and extract its main text with trafilatura,
      3. split the text into chunks,
      4. score every (query, chunk) pair with a cross-encoder,
      5. return the ``top_k_documents`` highest-scoring chunks.

    Input key: ``"query"``. Output key: ``"documents"`` (a list of ``Document``
    objects whose metadata holds ``"source"`` and ``"score"``).
    """

    # NOTE(review): an earlier variant used GoogleSearchAPIWrapper, whose
    # results appear to expose the URL under "link" instead of "href" --
    # confirm before switching search engines.

    # Number of search hits to download.
    top_k_search_results: int = 3
    # Number of scored chunks to return.
    top_k_documents: int = 5
    # Written into trafilatura's DOWNLOAD_TIMEOUT setting.
    website_search_timeout: int = 5
    # Not read anywhere in this class at the moment.
    website_retries: int = 2
    # Passed to trafilatura's ``extract`` as ``target_language``.
    website_language: str = "en"
    # Hugging Face tokenizer used to measure chunk length.
    tokenizer_id: str = "bert-base-uncased"
    # sentence-transformers cross-encoder used for scoring.
    cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    # Chunk size handed to the text splitter.
    chunk_size: int = 200

    @property
    def input_keys(self) -> List[str]:
        """Keys this chain expects in its input dict."""
        return ["query"]

    @property
    def output_keys(self) -> List[str]:
        """Keys this chain produces in its output dict."""
        return ['documents']

    # Pydantic doesn't support post_init hooks, so the helpers below are
    # exposed as properties; the expensive ones are cached per model id.
    @property
    def tokenizer(self):
        """Tokenizer for ``tokenizer_id``, loaded once and then reused."""
        if self.tokenizer_id not in _TOKENIZER_CACHE:
            _TOKENIZER_CACHE[self.tokenizer_id] = AutoTokenizer.from_pretrained(self.tokenizer_id)
        return _TOKENIZER_CACHE[self.tokenizer_id]

    @property
    def text_splitter(self) -> CharacterTextSplitter:
        """Newline-separated splitter that measures length with ``tokenizer``."""
        return CharacterTextSplitter.from_huggingface_tokenizer(
            self.tokenizer,
            separator="\n",
            chunk_size=self.chunk_size,
            chunk_overlap=12,
        )

    @property
    def trafilatura_config(self) -> ConfigParser:
        """trafilatura config with this chain's timeout and a browser user agent."""
        c = use_config()
        c.set('DEFAULT', 'DOWNLOAD_TIMEOUT', str(self.website_search_timeout))
        c.set('DEFAULT', 'USER_AGENTS', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582")
        return c

    @property
    def cross_encoder(self) -> CrossEncoder:
        """Cross-encoder for ``cross_encoder_model``, loaded once and then reused."""
        if self.cross_encoder_model not in _CROSS_ENCODER_CACHE:
            _CROSS_ENCODER_CACHE[self.cross_encoder_model] = CrossEncoder(self.cross_encoder_model)
        return _CROSS_ENCODER_CACHE[self.cross_encoder_model]

    def get_html_data(self, url):
        """Download *url* and return its extracted main text.

        Returns ``None`` when the download yields nothing; otherwise returns
        whatever trafilatura's ``extract`` returns for the page.
        """
        downloaded = fetch_url(url, config=self.trafilatura_config)
        if downloaded is None:
            # Nothing was fetched, so there is nothing to extract.
            return None
        return extract(
            downloaded,
            target_language=self.website_language,
            include_comments=False,
            include_formatting=False,
            include_tables=False,
            with_metadata=False,
        )

    def split_content_into_documents(self, content: List[Dict[str, str]]) -> List[Document]:
        """Split each entry's ``"content"`` into chunks tagged with its ``"href"``.

        Args:
            content: Dicts carrying at least ``"href"`` and ``"content"``.

        Returns:
            The chunks produced by ``text_splitter.create_documents``, each
            with ``{"source": <href>}`` as metadata.
        """
        metadata = [{"source": s["href"]} for s in content]
        documents = [s["content"] for s in content]
        return self.text_splitter.create_documents(documents, metadatas=metadata)

    def score_documents(self, query: str, documents: List[Document]) -> List[Document]:
        """Return the ``top_k_documents`` chunks most relevant to *query*.

        Each returned document gets its cross-encoder score stored under
        ``metadata["score"]``. The result is ordered from highest to lowest
        score. An empty *documents* list yields an empty list without
        invoking the model.
        """
        if not documents:
            return []

        # One (query, passage) pair per chunk, as the cross-encoder expects.
        sentence_combinations = [[query, doc.page_content] for doc in documents]
        similarity_scores = self.cross_encoder.predict(sentence_combinations)

        # argsort is ascending; reverse it and keep the best top_k_documents.
        best_indices = np.argsort(similarity_scores)[::-1][:self.top_k_documents]

        top_results = []
        for idx in best_indices:
            doc = documents[idx]
            doc.metadata["score"] = similarity_scores[idx]
            top_results.append(doc)
        return top_results

    def _call(self, inputs: Dict[str, str]) -> Dict[str, List[Document]]:
        """Search, download, chunk and rank; return ``{"documents": [...]}``."""
        query = inputs["query"]

        # Treat a falsy search result (e.g. None) as "no hits".
        search_results = dgg_search(query, num_results=self.top_k_search_results) or []

        # Keep only the hits whose page text could be extracted.
        results_with_content = []
        for result in search_results:
            text = self.get_html_data(result["href"])
            if text:
                results_with_content.append({**result, "content": text})

        documents = self.split_content_into_documents(results_with_content)
        scored_documents = self.score_documents(query, documents)
        return {'documents': scored_documents}

In [80]:
# Build the retrieval chain with its default settings.
x = SearchWebsiteChainWithEncoder()

In [81]:
# Run retrieval only: the chain's single output is the list of scored documents.
x.run({"query": "What is the capital of Germany?"})

[{'title': 'Germany | Facts, Geography, Maps, & History | Britannica', 'href': 'https://www.britannica.com/place/Germany', 'body': "Germany, officially Federal Republic of Germany, German Deutschland or Bundesrepublik Deutschland, country of north-central Europe, traversing the continent's main physical divisions, from the outer ranges of the Alps northward across the varied landscape of the Central German Uplands and then across the North German Plain. ... is the capital ..."}, {'title': 'Berlin | History, Map, Population, Attractions, & Facts', 'href': 'https://www.britannica.com/place/Berlin', 'body': "Berlin, capital and chief urban centre of Germany. The city lies at the heart of the North German Plain, athwart an east-west commercial and geographic axis that helped make it the capital of the kingdom of Prussia and then, from 1871, of a unified Germany. Berlin's former glory ended in 1945, but the city survived the destruction of World War II."}, {'title': 'Germany | Culture, Fact

Created a chunk of size 248, which is longer than the specified 200
Created a chunk of size 232, which is longer than the specified 200
Created a chunk of size 221, which is longer than the specified 200
Created a chunk of size 244, which is longer than the specified 200


[Document(page_content='Berlin\nOur editors will review what you’ve submitted and determine whether to revise the article.\nRecent News\nWhere is Berlin located?\nWhy was the Berlin Wall built around West Berlin?\nWhat type of climate does Berlin have?\nBerlin is famous for what cultural institutions?\nBerlin is the capital of what country?\nBerlin, capital and chief urban centre of Germany. The city lies at the heart of the North German Plain, athwart an east-west commercial and geographic axis that helped make it the capital of the kingdom of Prussia and then, from 1871, of a unified Germany. Berlin’s former glory ended in 1945, but the city survived the destruction of World War II. It was rebuilt and came to show amazing economic and cultural growth.', metadata={'source': 'https://www.britannica.com/place/Berlin', 'score': 5.5265813}),
 Document(page_content='One of Europe’s largest countries, Germany encompasses a wide variety of landscapes: the tall, sheer mountains of the south; 

In [82]:
from langchain.llms import HuggingFaceHub
import os 

# LLM used for answering: an OpenAssistant model accessed via HuggingFaceHub.
# The API token is read from the HF_API_KEY environment variable, and the
# sampling settings are handed over as model_kwargs.
llm = HuggingFaceHub(
    repo_id="OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
    huggingfacehub_api_token=os.environ["HF_API_KEY"],
    model_kwargs={
        "do_sample": True,
        "max_new_tokens": 512,
        "top_p": 0.6,
        "temperature": 0.6,
    },
)

# llm = HuggingFaceHub(repo_id="google/flan-ul2", 
#                      huggingfacehub_api_token=os.environ["HF_API_KEY"], 
#                      model_kwargs={"do_sample": True, "max_new_tokens": 512, "top_p":0.9, "temperature":0.3 })



In [83]:
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from typing import Any, Dict, List
from langchain import PromptTemplate

# Question-answering prompt wrapped in the <|prompter|> ... <|endoftext|><|assistant|>
# markers; {context} receives the retrieved passages and {question} the user query.
oa_prompt = PromptTemplate(
    template="<|prompter|>Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:<|endoftext|><|assistant|>",
    input_variables=["context", "question"],
)


class LLMWebSearchChain(Chain):
    """Answer a question by retrieving web passages and prompting an LLM with them.

    Input key: ``"query"``. Output key: ``"answer"``.
    """

    # Callable LLM; invoked with the formatted prompt string.
    llm: Any
    # Retrieval chain that supplies the context passages.
    websearch = SearchWebsiteChainWithEncoder()
    # Template with ``context`` and ``question`` placeholders.
    prompt: PromptTemplate = oa_prompt

    @property
    def input_keys(self) -> List[str]:
        """Keys this chain expects in its input dict."""
        return ["query"]

    @property
    def output_keys(self) -> List[str]:
        """Keys this chain produces in its output dict."""
        return ['answer']

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        """Retrieve passages for the query, fill the prompt, and ask the LLM."""
        passages = self.websearch.run(inputs)

        # Retrieved passages are concatenated, one per line, as the context.
        context = "\n".join(doc.page_content for doc in passages)
        filled_prompt = self.prompt.format(context=context, question=inputs["query"])

        return {'answer': self.llm(filled_prompt)}

In [84]:

# Retrieval chain: download 3 search hits and keep the 5 best-scoring chunks.
x = SearchWebsiteChainWithEncoder(top_k_search_results=3,top_k_documents=5)
# Full pipeline: the retrieval chain above feeds context to the LLM.
search = LLMWebSearchChain(llm=llm, websearch=x)

In [85]:
from langchain import OpenAI
from langchain.llms import Anthropic


# llm = OpenAI(temperature=0)
# llm = Anthropic(temperature=0)

In [86]:
# Question reused by the comparison runs in the following cells.
query="How tall can brown bears become?"

# Full pipeline: web retrieval followed by LLM answering.
search.run(query)

[{'title': 'Grizzly bear | Weight, Habitat, & Facts | Britannica', 'href': 'https://www.britannica.com/animal/grizzly-bear', 'body': 'Large adult grizzlies may be about 2.5 metres (8 feet) long and weigh about 410 kg (900 pounds). Because of their bulk and their long straight claws, grizzly bears rarely climb trees. They are, however, surprisingly agile and can run as fast as 56 km (35 miles) per hour over short distances.'}, {'title': 'Grizzly bear - Wikipedia', 'href': 'https://en.wikipedia.org/wiki/Grizzly_bear', 'body': 'In preparation for winter, bears can gain approximately 180 kg (400 lb), during a period of hyperphagia, before going into hibernation. [43] The bear often waits for a substantial snowstorm before it enters its den: such behavior lessens the chances that predators will find the den.'}, {'title': 'Bear Animal Facts | Ursidae - AZ Animals', 'href': 'https://a-z-animals.com/animals/bear/', 'body': 'Bear Physical Characteristics Color Brown Black White Skin Type Fur To

Created a chunk of size 215, which is longer than the specified 200
Created a chunk of size 213, which is longer than the specified 200
Created a chunk of size 363, which is longer than the specified 200
Created a chunk of size 390, which is longer than the specified 200
Created a chunk of size 264, which is longer than the specified 200
Created a chunk of size 206, which is longer than the specified 200
Created a chunk of size 216, which is longer than the specified 200
Created a chunk of size 201, which is longer than the specified 200
Created a chunk of size 207, which is longer than the specified 200
Created a chunk of size 283, which is longer than the specified 200
Created a chunk of size 250, which is longer than the specified 200
Created a chunk of size 492, which is longer than the specified 200


'The average total length in this subspecies is between 198 cm (78 in) and 240 cm (94 in), with an average shoulder height of 102 cm (40 in) and hindfoot length of 28 cm (11 in). Newborn bears may weigh less than 500 g (18 oz).'

In [87]:
# Retrieval only: shows the scored passages the pipeline obtains for this query.
x.run(query)

[{'title': 'Grizzly bear | Weight, Habitat, & Facts | Britannica', 'href': 'https://www.britannica.com/animal/grizzly-bear', 'body': 'Large adult grizzlies may be about 2.5 metres (8 feet) long and weigh about 410 kg (900 pounds). Because of their bulk and their long straight claws, grizzly bears rarely climb trees. They are, however, surprisingly agile and can run as fast as 56 km (35 miles) per hour over short distances.'}, {'title': 'Grizzly bear - Wikipedia', 'href': 'https://en.wikipedia.org/wiki/Grizzly_bear', 'body': 'In preparation for winter, bears can gain approximately 180 kg (400 lb), during a period of hyperphagia, before going into hibernation. [43] The bear often waits for a substantial snowstorm before it enters its den: such behavior lessens the chances that predators will find the den.'}, {'title': 'American black bear - Wikipedia', 'href': 'https://en.wikipedia.org/wiki/American_black_bear', 'body': "The American black bear (Ursus americanus), also known as the black

Created a chunk of size 215, which is longer than the specified 200
Created a chunk of size 213, which is longer than the specified 200
Created a chunk of size 363, which is longer than the specified 200
Created a chunk of size 390, which is longer than the specified 200
Created a chunk of size 264, which is longer than the specified 200
Created a chunk of size 206, which is longer than the specified 200
Created a chunk of size 216, which is longer than the specified 200
Created a chunk of size 201, which is longer than the specified 200
Created a chunk of size 207, which is longer than the specified 200
Created a chunk of size 283, which is longer than the specified 200
Created a chunk of size 250, which is longer than the specified 200
Created a chunk of size 492, which is longer than the specified 200
Created a chunk of size 251, which is longer than the specified 200
Created a chunk of size 201, which is longer than the specified 200
Created a chunk of size 297, which is longer tha

[Document(page_content='The populations in northern interior Canada are much smaller, with males weighing 139 kilograms (306 lb) and females weighing 95 kilograms (209 lb).This is actually similar to the American black bear population of the area.[19]\nAverage total length in this subspecies is between 198 cm (78 in) and 240 cm (94 in),[20] with an average shoulder height of 102 cm (40 in) and hindfoot length of 28 cm (11 in).[21] Newborn bears may weigh less than 500 g (18 oz).\nCharacteristics\nAlthough variable in color from blond to nearly black, grizzly bear fur is typically brown with darker legs and commonly white or blond tipped fur on the flank and back.[22]\nGrizzly bears overlap with Black Bears in range, but there are numerous factors that can differentiate the two.', metadata={'source': 'https://en.wikipedia.org/wiki/Grizzly_bear', 'score': 0.39579847}),
 Document(page_content='Coastal grizzlies, often referred to by the popular but geographically redundant synonym of "bro

In [88]:

# Baseline: ask the LLM directly, without any retrieved context, for comparison.
llm(f"<|prompter|>{query}<|endoftext|><|assistant|>")

'Brown bears can grow up to 6 feet (2 meters) tall.'