# idea

1. google search
2. trafilatura web page
3. split web page in chunks 
4. embed web page
5. similarity web page
6. q&a llm 



In [None]:
!pip install sentence-transformers trafilatura

In [2]:
import os

# Load KEY=VALUE pairs from the local .env file into the process environment.
# os.environ is updated in place instead of being rebound to a plain dict, so
# the variables are really exported to the process (and to any subprocesses)
# and os.environ keeps its normal mapping type.
with open(".env", "r") as env_file:
    for raw_line in env_file:
        line = raw_line.strip()
        # Skip blank lines, comments and anything that is not KEY=VALUE.
        if not line or line.startswith("#") or "=" not in line:
            continue
        # Split only on the first "=" because values (e.g. API keys) may contain "=".
        key, value = line.split("=", 1)
        os.environ[key.strip()] = value.strip()

In [42]:
from functools import lru_cache
from typing import Dict, List

import numpy as np
from langchain.chains.base import Chain
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.utilities import GoogleSearchAPIWrapper
from sentence_transformers.cross_encoder import CrossEncoder
from trafilatura import fetch_url, extract
from trafilatura.settings import ConfigParser, use_config
from transformers import AutoTokenizer


@lru_cache(maxsize=None)
def _load_tokenizer(tokenizer_id: str):
    """Load the Hugging Face tokenizer for ``tokenizer_id`` once and reuse it."""
    return AutoTokenizer.from_pretrained(tokenizer_id)


@lru_cache(maxsize=None)
def _load_cross_encoder(model_name: str) -> CrossEncoder:
    """Load the cross-encoder ``model_name`` once and reuse it."""
    return CrossEncoder(model_name)


class SearchWebsiteChainWithEncoder(Chain):
    """Chain that searches the web and returns the text chunks most relevant to a query.

    Pipeline executed by ``_call``:
      1. query the search engine for ``top_k_search_results`` hits,
      2. download each hit and extract its main text with trafilatura,
      3. split the text into token-bounded chunks,
      4. score every ``(query, chunk)`` pair with a cross-encoder,
      5. return the ``top_k_documents`` best chunks as ``Document`` objects whose
         metadata holds the ``source`` URL and the ``score``.

    Input key: ``query``. Output key: ``documents``.
    """

    search_engine: GoogleSearchAPIWrapper = GoogleSearchAPIWrapper()
    top_k_search_results: int = 3
    top_k_documents: int = 5
    # Download timeout written to trafilatura's DOWNLOAD_TIMEOUT setting.
    website_search_timeout: int = 5
    # NOTE: declared for configuration but not referenced by any method of this class.
    website_retries: int = 2
    website_language: str = "en"
    tokenizer_id: str = "bert-base-uncased"
    cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    # Chunking parameters handed to the text splitter (measured with the tokenizer).
    chunk_size: int = 100
    chunk_overlap: int = 12

    @property
    def input_keys(self) -> List[str]:
        """Names of the inputs this chain expects."""
        return ["query"]

    @property
    def output_keys(self) -> List[str]:
        """Names of the outputs this chain produces."""
        return ["documents"]

    # Pydantic doesn't support post_init hooks, so heavy helpers are exposed as
    # properties. The expensive ones are backed by module-level caches so the
    # models are loaded once instead of on every property access.
    @property
    def tokenizer(self):
        """Tokenizer identified by ``tokenizer_id`` (cached per id)."""
        return _load_tokenizer(self.tokenizer_id)

    @property
    def text_splitter(self) -> CharacterTextSplitter:
        """Newline-based splitter whose chunk length is measured with ``tokenizer``."""
        return CharacterTextSplitter.from_huggingface_tokenizer(
            self.tokenizer,
            separator="\n",
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

    @property
    def trafilatura_config(self) -> ConfigParser:
        """trafilatura configuration with this chain's timeout and a browser user agent."""
        c = use_config()
        c.set('DEFAULT', 'DOWNLOAD_TIMEOUT', str(self.website_search_timeout))
        c.set('DEFAULT', 'USER_AGENTS', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582")
        return c

    @property
    def cross_encoder(self) -> CrossEncoder:
        """Cross-encoder identified by ``cross_encoder_model`` (cached per model name)."""
        return _load_cross_encoder(self.cross_encoder_model)

    def get_html_data(self, url):
        """Download ``url`` and return its extracted main text.

        Returns ``None`` when the download fails or when ``extract`` yields nothing.
        """
        downloaded = fetch_url(url, config=self.trafilatura_config)
        if downloaded is None:
            # Nothing was downloaded, so there is nothing to extract.
            return None
        return extract(
            downloaded,
            target_language=self.website_language,
            include_comments=False,
            include_formatting=False,
            include_tables=False,
            with_metadata=False,
        )

    def split_content_into_documents(self, content: List[Dict[str, str]]) -> List[Document]:
        """Split page texts into chunk documents.

        Args:
            content: One dict per page, each with a ``"link"`` and a ``"content"`` entry.

        Returns:
            The chunks produced by ``text_splitter``; each carries
            ``{"source": <link>}`` as metadata.
        """
        metadata = [{"source": s["link"]} for s in content]
        texts = [s["content"] for s in content]
        return self.text_splitter.create_documents(texts, metadatas=metadata)

    def score_documents(self, query: str, documents: List[Document]) -> List[Document]:
        """Rank ``documents`` against ``query`` and keep the best ``top_k_documents``.

        The cross-encoder score is stored in each returned document's
        ``metadata["score"]``. Results are ordered from highest to lowest score.
        An empty ``documents`` list yields an empty result without invoking the model.
        """
        if not documents:
            return []

        # One (query, chunk) pair per candidate chunk.
        sentence_combinations = [[query, doc.page_content] for doc in documents]
        similarity_scores = self.cross_encoder.predict(sentence_combinations)

        # Indices of the chunks sorted by decreasing score, truncated to the top k.
        best_indices = np.argsort(similarity_scores)[::-1][: self.top_k_documents]

        top_results = []
        for idx in best_indices:
            doc = documents[idx]
            doc.metadata["score"] = similarity_scores[idx]
            top_results.append(doc)
        return top_results

    def _call(self, inputs: Dict[str, str]) -> Dict[str, List[Document]]:
        """Run search, extraction, chunking and scoring for ``inputs["query"]``."""
        query = inputs["query"]

        # search using search engine
        search_results = self.search_engine.results(query, num_results=self.top_k_search_results)

        # extract text from every search result that has a link and downloadable content
        results_with_content = []
        for result in search_results:
            link = result.get("link")
            if not link:
                # Entries without a URL (e.g. a "no result found" placeholder) cannot be fetched.
                continue
            page_text = self.get_html_data(link)
            if page_text:
                results_with_content.append({**result, "content": page_text})

        # convert search results into chunk documents
        documents = self.split_content_into_documents(results_with_content)

        # score documents and keep the most relevant ones
        scored_documents = self.score_documents(query, documents)
        return {"documents": scored_documents}

In [43]:
# Instantiate the retrieval chain with its class-level default settings.
x = SearchWebsiteChainWithEncoder()

In [44]:
from langchain.llms import HuggingFaceHub
import os 

# Hosted OpenAssistant model used for answer generation; the API token is read
# from the HF_API_KEY environment variable.
llm = HuggingFaceHub(
    repo_id="OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
    huggingfacehub_api_token=os.environ["HF_API_KEY"],
    model_kwargs={
        "do_sample": True,
        "max_new_tokens": 512,
        "top_p": 0.6,
        "temperature": 0.6,
    },
)

# llm = HuggingFaceHub(repo_id="google/flan-ul2", 
#                      huggingfacehub_api_token=os.environ["HF_API_KEY"], 
#                      model_kwargs={"do_sample": True, "max_new_tokens": 512, "top_p":0.9, "temperature":0.3 })



In [45]:
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from typing import Any, Dict, List
from langchain import PromptTemplate

# Question-answering prompt wrapped in <|prompter|> ... <|endoftext|><|assistant|>
# markers; it is filled with the retrieved `context` and the user's `question`.
oa_prompt = PromptTemplate(
    template="<|prompter|>Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:<|endoftext|><|assistant|>",
    input_variables=["context", "question"],
)


class LLMWebSearchChain(Chain):
    """Chain that answers a query with an LLM, using web-search passages as context.

    Fields:
        llm: Callable language model; it is invoked with the formatted prompt string.
        websearch: Chain whose ``run`` result is iterated for items exposing ``page_content``.
        prompt: Template formatted with ``context`` and ``question``.

    Input key: ``query``. Output key: ``answer``.
    """

    llm: Any
    websearch = SearchWebsiteChainWithEncoder()
    prompt: PromptTemplate = oa_prompt

    @property
    def input_keys(self) -> List[str]:
        """Names of the inputs this chain expects."""
        return ["query"]

    @property
    def output_keys(self) -> List[str]:
        """Names of the outputs this chain produces."""
        return ["answer"]

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        """Retrieve passages for the query, build the prompt and ask the LLM."""
        retrieved = self.websearch.run(inputs)

        # Join the retrieved passages, one per line, to form the prompt context.
        passages = [doc.page_content for doc in retrieved]
        filled_prompt = self.prompt.format(
            context="\n".join(passages),
            question=inputs["query"],
        )

        return {"answer": self.llm(filled_prompt)}

In [57]:

# Retrieval chain limited to 2 search results and at most 5 returned chunks,
# wired into the LLM question-answering chain together with `llm`.
x = SearchWebsiteChainWithEncoder(top_k_search_results=2,top_k_documents=5)
search = LLMWebSearchChain(llm=llm, websearch=x)

In [58]:
from langchain import OpenAI
from langchain.llms import Anthropic


# llm = OpenAI(temperature=0)
# llm = Anthropic(temperature=0)

In [59]:
# Example question, answered by the full pipeline (web search + LLM).
query="What is the current stock market value of the dax?"

search.run(query)

Created a chunk of size 115, which is longer than the specified 100
Created a chunk of size 104, which is longer than the specified 100
Created a chunk of size 106, which is longer than the specified 100


'The current stock market value of the DAX is around 10,9,000 points.'

In [60]:
# Run only the retrieval chain to inspect the chunks and scores it returns for the same query.
x.run(query)

Created a chunk of size 115, which is longer than the specified 100
Created a chunk of size 104, which is longer than the specified 100
Created a chunk of size 106, which is longer than the specified 100


[Document(page_content='The performance of the insurance sector was significantly better. The STOXX Europe 600 Insurance traded almost unchanged (-1.0%). The Allianz share recorded a slight fall of 3.3% to 200.90 euros. Including the dividend of 10.80 euros, the increase in value was 2.0%. In a ten-year comparison, the average annual increase in value was 11.8%.', metadata={'source': 'https://www.allianz.com/en/investor_relations/share/share-price.html', 'score': -6.3316245}),
 Document(page_content='It’s useful to look at stock market levels compared to where they’ve been over the past few months. When the S&P 500 is above its moving or rolling average of the prior 125 trading days, that’s a sign of positive momentum. But if the index is below this average, it shows investors are getting skittish. The Fear & Greed Index uses slowing momentum as a signal for Fear and a growing momentum for Greed.', metadata={'source': 'https://www.cnn.com/markets/fear-and-greed', 'score': -10.157162}),

In [61]:

# Ask the LLM the same question directly, with no retrieved context in the prompt.
llm(f"<|prompter|>{query}<|endoftext|><|assistant|>")

'The current stock market value of the Dow Jones Industrial Average (DJIA) is around 26,000 points.'