### Install Dependencies

In [1]:
# Core libraries for this notebook: LangChain, its community integrations,
# and the DuckDuckGo search package.
# NOTE(review): these installs are unpinned, so a fresh run may pull
# different versions than the ones that produced the saved outputs.
%pip install --q langchain-community
%pip install --q langchain
%pip install --q duckduckgo-search

# TODO: add the following packages to poetry
# (requests and bs4 are imported by the page-scraping cell)
%pip install --q requests
%pip install --q beautifulsoup4

#################################
# Required for PaperSpace Gradient
# (exact version pins for typing-inspect, typing_extensions and pydantic)
%pip install --q typing-inspect==0.8.0 typing_extensions==4.5.0
%pip install --q pydantic==1.10.8

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytest 7.2.1 requires attrs>=19.2.0, but you have attrs 18.2.0 which is incompatible.
gradient 2.0.6 requires marshmallow<3.0, but you have marshmallow 3.21.0 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradient 2.0.6 requires marshmallow<3.0, but you have marshmallow 3.21.0 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated pack

### Setup Ollama

In [6]:
# One-time setup commands, kept as comments (this cell does not run them):
# %> curl -fsSL https://ollama.com/install.sh | sh
# %> ollama serve
# %> ollama pull nomic-embed-text     # TODO: confirm whether a retriever / embedding model is needed
# %> ollama pull mixtral:instruct     (gemma:7b-instruct | mistral:instruct)

# List the models the local Ollama install currently has.
!ollama list

NAME                   	ID          	SIZE  	MODIFIED      
mixtral:instruct       	7708c059a8bb	26 GB 	6 seconds ago	
nomic-embed-text:latest	0a109f422b47	274 MB	4 minutes ago	


### Accelerator Info

In [7]:
# Print GPU model, driver / CUDA versions and current memory usage.
!nvidia-smi

Tue Feb 27 22:04:19 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:00:05.0 Off |                  Off |
| 30%   46C    P8    21W / 300W |      1MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Prompt

In [8]:
# Prompt for answering a question from scraped page text.
# Placeholders: {text} is the source text, {question} is the question to answer.
# The closing instruction asks for a plain summary when the text cannot
# answer the question.
template = """{text}
-----------
Using the above text, answer in short the following question:
> {question}
-----------
if the question cannot be answered using the text, simply summarize the text. Include all
factual information, numbers, stats etc if available.
"""

In [9]:
from langchain_core.prompts import ChatPromptTemplate
# Turn the question-answering template string into a chat prompt object.
prompt = ChatPromptTemplate.from_template(template)

### Ingest Documents

In [10]:
import requests
from bs4 import BeautifulSoup

def scrape_url(url: str, timeout: float = 10.0) -> str:
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        On HTTP 200, the page text with markup removed (fragments joined by
        single spaces). On any other status code, the string
        ``"Failed to retrieve page: <status>"``. If the request or the
        parsing raises, the error is printed and ``""`` is returned.
    """
    try:
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            # Reported as a string rather than an exception so callers that
            # slice the result keep working.
            return f"Failed to retrieve page: {response.status_code}"

        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text(separator=" ", strip=True)
    except Exception as e:
        # Deliberately best-effort: report the failure (with the URL, so it
        # can be traced) and fall through to the empty-string result.
        print(f"scrape_url failed for {url}: {e}")

    return ""

In [11]:
# Sample page used to try out the scraper.
url = "https://blog.langchain.dev/announcing-langsmith/"

# Keep only the first 10,000 characters of the scraped text.
content = scrape_url(url)[:10000]

### Setup Local LLM & Embedding Models

In [12]:
# Ollama model tags for the chat model and the embedding model.
LLM_MODEL = "mixtral:instruct"  # ("gemma:7b-instruct" | "mistral:instruct")
EMBEDDING_MODEL = "nomic-embed-text"
# Sampling temperature handed to the Ollama test model.
TEMPERATURE = 0.9
# NOTE(review): the three settings below (and EMBEDDING_MODEL) are not
# referenced by the cells in this part of the notebook; presumably they are
# used further down - confirm.
ENABLE_TRACING = False
DOCUMENT_CHUNK_SIZE = 7500  # Gemma --> DOCUMENT_CHUNK_SIZE=5000
CHUNK_OVERLAP = 100

#### Load & Test Local LLM Model

#### Testing the Ollama model. No need to import separate llm model.

In [13]:
# Import from langchain_community; the `langchain.llms` path is a deprecated
# re-export that emits a deprecation warning.
from langchain_community.llms import Ollama

# Throwaway instance used only for a quick smoke test of the local model.
_test_llm_model_ = Ollama(
    model=LLM_MODEL,
    temperature=TEMPERATURE,
)

# `.invoke()` is the supported entry point; calling the model object directly
# is deprecated. The returned string is displayed as the cell output.
_test_llm_model_.invoke("Who are you?")

  warn_deprecated(


' I am a language model trained by the Mistral AI team. I generate text based on the input I receive, and my purpose is to provide useful, safe, and accurate information.'

### Web Search Agent

In [14]:
# Import from langchain_community; `langchain.utilities` is a deprecated
# re-export of the same wrapper.
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper

# Default number of result links fetched per search query.
NUM_RESULTS_PER_QUESTION = 3
ddg_search = DuckDuckGoSearchAPIWrapper()


async def web_search(query: str, num_results: int = NUM_RESULTS_PER_QUESTION) -> list:
    """Search DuckDuckGo and return the links of the results.

    Args:
        query: Search string.
        num_results: How many results to request from the search wrapper.

    Returns:
        A list with the ``"link"`` value of every result, in the order the
        wrapper returned them.

    Note:
        ``ddg_search.results`` is called synchronously, so this coroutine
        does not yield to the event loop while the search is in flight.
    """
    web_query_results = ddg_search.results(query, num_results)
    return [result["link"] for result in web_query_results]


In [17]:
# Quick manual check of the search helper; relies on the notebook's
# top-level `await` support.
await web_search("What is langsmith?")

['https://blog.langchain.dev/announcing-langsmith/',
 'https://blog.logrocket.com/langsmith-test-llms-ai-applications/',
 'https://dev.to/logankilpatrick/what-is-langsmith-and-why-should-i-care-as-a-developer-19k']

### Setup Chat Chain

In [85]:
import asyncio
from langchain_community.chat_models import ChatOllama
# These classes live in langchain_core (already used for the prompt classes);
# the `langchain.schema.*` paths are legacy re-exports.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

# Upper bound on the scraped characters sent to the model for each page.
MAX_PAGE_CHARS = 10000

# Input: {"question": ..., "url": ...}.
# Adds "text" (scraped page content, truncated), fills the prompt, runs the
# chat model and returns the reply as a plain string.
# NOTE(review): TEMPERATURE is not passed to ChatOllama here, so the chat
# model runs with its own default - confirm that is intended.
scrape_and_summarize_chain = (
    RunnablePassthrough.assign(text=lambda x: scrape_url(x["url"])[:MAX_PAGE_CHARS])
    | prompt
    | ChatOllama(model=LLM_MODEL)
    | StrOutputParser()
)

# Input: {"question": ...}.
# Looks up result URLs for the question, builds one {"question", "url"} dict
# per URL, and summarizes every page with the chain above.
# NOTE(review): asyncio.run() raises RuntimeError when the calling thread
# already has a running event loop; this step depends on the lambda being
# executed somewhere without one - confirm if the execution model changes.
web_search_chain = (
    RunnablePassthrough.assign(urls=lambda x: asyncio.run(web_search(x["question"])))
    | (lambda x: [{"question": x["question"], "url": u} for u in x["urls"]])
    | scrape_and_summarize_chain.map()
)


chat_response = web_search_chain.invoke({"question": "What is LangSmith?"})
chat_response

[' LangSmith is a platform that allows developers to create and evaluate language models. It provides an API for generating responses based on descriptions, and also offers tools like serpapi and llm-math. LangSmith uses the LangChain library to initialize a chat model, load specific tools, and create an agent that can generate responses based on descriptions. The platform also provides a development environment for setting up and running the agent, including the generation of API keys and secure storage. Additionally, LangSmith offers a waitlist for users to join and access these features.',
 " LangSmith is an open-source debugging, testing, and monitoring platform for Language Model applications built on top of LangChain's framework. It provides a unified hub for developers to manage their LLM applications throughout the development process, from prototyping to production. LangSmith offers tools for debugging, such as tracing and visualizing agent prompt chains, testing with custom d

#### Test Web Search

In [86]:
import ast

# Instruction asking the model for three search queries, formatted as a
# Python list literal so the reply can be parsed.
questions_generation = (
    "Write 3 google search queries to search online that form an objective "
    "opinion from the following question: {question}\n "
    "Return a python list in the following format: "
    "['query 1', 'query 2', 'query 3']\n "
    "Do not include anything else after the list."
)

SEARCH_PROMPT = ChatPromptTemplate.from_messages([("user", questions_generation)])


def _parse_query_list(llm_output: str) -> list:
    """Parse the model's reply as a Python list literal of query strings.

    SECURITY: this replaces a bare ``eval`` on the reply. The reply is
    model-generated text, and ``eval`` would execute any expression it
    contains; ``ast.literal_eval`` only accepts literals.

    Raises:
        ValueError: if the reply is not a list (or tuple) literal whose
            items are all strings.
    """
    try:
        parsed = ast.literal_eval(llm_output.strip())
    except (ValueError, SyntaxError, TypeError) as exc:
        raise ValueError(
            f"Expected a Python list of search queries, got: {llm_output!r}"
        ) from exc

    if not isinstance(parsed, (list, tuple)) or not all(
        isinstance(query, str) for query in parsed
    ):
        raise ValueError(
            f"Expected a Python list of search queries, got: {llm_output!r}"
        )
    return list(parsed)


# Question -> model reply -> list of queries -> one {"question": q} per query.
search_qanda_chain = (
    SEARCH_PROMPT
    | ChatOllama(model=LLM_MODEL)
    | StrOutputParser()
    | _parse_query_list
    | (lambda queries: [{"question": q} for q in queries])
)

# Run the web-search chain once for every generated query.
chain = search_qanda_chain | web_search_chain.map()

# Name kept as-is because a later cell reads `lol`.
lol = chain.invoke({"question": "what is json?"})

### Report Writer Chain

In [82]:
def collapse_list_of_lists(list_of_lists):
    """Flatten a list of string lists into one blank-line-separated string.

    The strings of each inner list are joined with two newlines, and the
    joined pieces are then joined with two newlines again. An empty inner
    list contributes an empty piece.
    """
    paragraph_break = "\n\n"
    return paragraph_break.join(
        paragraph_break.join(group) for group in list_of_lists
    )

In [87]:
# Merge the nested result lists held in `lol` into a single string.
collapse_list_of_lists(lol)

' JSON (JavaScript Object Notation) is a lightweight data interchange format that is easy for humans to read and write and easy for machines to parse and generate. It is a text format that is completely language independent but uses conventions that are familiar to programmers of the C-family of languages, including C, C++, C#, Java, JavaScript, Perl, Python, and many others. JSON is built on two structures:\n\n1. A collection of name/value pairs. In various languages, this is realized as an object, record, struct, dictionary, hash table, keyed list, or associative array.\n2. An ordered list of values. In most languages, this is realized as an array, vector, list, or sequence.\n\nJSON is a subset of the JavaScript Programming Language, Standard ECMA-262 3rd Edition - December 1999. JSON is a text format that is completely language independent but uses conventions that are familiar to programmers of the C-family of languages, including C, C++, C#, Java, JavaScript, Perl, Python, and man

In [81]:
# System message: fixes the model's role as a report-writing research assistant.
WRITER_SYSTEM_PROMPT = (
    "You are an AI critical thinker research assistant. Your sole purpose "
    "is to write well written, critically acclaimed, objective and structured "
    "reports on given text."
)

# User message for the report writer.
# Placeholders: {research_summary} is the collected research text and
# {question} is the question or topic the report must address.
RESEARCH_REPORT_TEMPLATE = """Information:
--------
{research_summary}
--------
Using the above information, answer the following question or topic: "{question}" 
in a detailed report -- The report should focus on the answer to the question, 
should be well structured, informative,
in depth, with facts and numbers if available and a minimum of 1,200 words.
You should strive to write the report as long as you can using all relevant 
and necessary information provided.
You must write the report with markdown syntax.
You MUST determine your own concrete and valid opinion based on the given 
information. Do NOT defer to general and meaningless conclusions.
Write all used source urls at the end of the report, and make sure to not 
add duplicated sources, but only one reference for each.
You must write the report in apa format.
Please do your best, this is very important to my career."""

# Two-message chat prompt: the system message sets the report-writer role and
# the user message carries the research summary and the question.
research_writer_prompt = ChatPromptTemplate.from_messages(
    [("system", WRITER_SYSTEM_PROMPT), ("user", RESEARCH_REPORT_TEMPLATE)]
)

