In [1]:
!pip -q install langchain==0.0.333 openai==1.2.2
!pip -q install duckduckgo-search
!pip install python-dotenv



Setting up some keys

In [2]:
!pip show langchain

Name: langchain
Version: 0.0.333
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, anyio, async-timeout, dataclasses-json, jsonpatch, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 



# Custom Tools & Agents 🤖

In [3]:
import os
from dotenv import load_dotenv

# Load environment variables from the local .env file
load_dotenv('.env')

# Read the OpenAI key; a missing or empty value is reported as "not found"
openai_api_key = os.environ.get('OPENAI_API_KEY')

print("API key retrieved successfully." if openai_api_key else "API key not found.")

API key retrieved successfully.


In [4]:
# from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

In [5]:
# Set up the turbo LLM
# This single chat-model instance is reused by the math chain and the agent defined in later cells.
turbo_llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106") # 16k tokens sent, 4k tokens received

## Standard Tool

In [6]:
from langchain.tools import DuckDuckGoSearchRun, BaseTool
from langchain.agents import initialize_agent, AgentType, Tool, AgentExecutor
from langchain.document_loaders import RecursiveUrlLoader
from langchain.chains import LLMMathChain
from bs4 import BeautifulSoup as bs
import re
import requests

# DuckDuckGo search tool; its `run` method is registered as the agent's "search" tool in a later cell.
search = DuckDuckGoSearchRun()
# Math chain driven by the same chat model; registered later as the "Calculator" tool.
llm_math_chain = LLMMathChain.from_llm(llm=turbo_llm, verbose=True)

In [7]:
# class WebPageTool(BaseTool):
#     name = "Get Webpage"
#     description = "Useful for when you need to get the content from a specific webpage"

#     def _run(self, webpage: str):
#         response = requests.get(webpage)
#         html_content = response.text

#         def strip_html_tags(html_content):
#             soup = bs(html_content, "html.parser")
#             stripped_text = soup.get_text()
#             return stripped_text

#         stripped_content = strip_html_tags(html_content)
#         if len(stripped_content) > 4000:
#             stripped_content = stripped_content[:4000]
#         return stripped_content

#     def _arun(self, webpage: str):
#         raise NotImplementedError("This tool does not support async")

# page_getter = WebPageTool()

def extract_text_and_limit_tokens(html, token_limit=4097):
    """Return the visible text of `html`, truncated to `token_limit` words.

    The markup is parsed with BeautifulSoup's "html.parser", every run of
    whitespace is collapsed to a single space, and the text is split on spaces
    as a rough stand-in for tokenization. Only the first `token_limit` pieces
    are kept and re-joined with single spaces.
    """
    soup = bs(html, "html.parser")
    # Strip the ends first, then squeeze internal whitespace runs
    collapsed = re.sub(r'\s+', ' ', soup.text.strip())
    # Space-separated words approximate tokens
    words = collapsed.split(' ')
    return ' '.join(words[:token_limit])

def crawl_site(url):
    """Crawl `url` with RecursiveUrlLoader (max_depth=2) and return the loaded documents.

    Each fetched page is passed through extract_text_and_limit_tokens with a
    limit of 6000 words before it is stored.
    """
    def _page_to_text(raw_html):
        # Extractor callback: reduce raw HTML to capped plain text
        return extract_text_and_limit_tokens(raw_html, 6000)

    return RecursiveUrlLoader(url=url, max_depth=2, extractor=_page_to_text).load()

In [8]:
# Tools the agent may call; each one wraps a helper created in an earlier cell.
tools = [
    # Web search through the DuckDuckGo tool
    Tool(
        name="search",
        description="Search the internet to find helpful websites.",
        func=search.run,
    ),
    # Math questions are delegated to the LLM math chain
    Tool(
        name="Calculator",
        description="Useful for when you need to answer questions about math",
        func=llm_math_chain.run,
    ),
    # Recursive crawl of a single site via crawl_site
    Tool(
        name="site_crawler",
        description="Crawl a website up to depth 2.",
        func=crawl_site,
    ),
]

# OpenAI-functions agent over the tools above, limited to three iterations.
mrkl = initialize_agent(
    tools=tools,
    llm=turbo_llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    max_iterations=3,
    early_stopping_method='generate',
    verbose=True,
)

In [9]:
# Overwrite the content of the first message in the agent's prompt
# (presumably the system message — TODO confirm) with a custom instruction.
fixed_prompt = '''You are a helpful AI assistant.''' # can change this later
mrkl.agent.prompt.messages[0].content = fixed_prompt
# Echo the value so the cell output confirms the assignment.
mrkl.agent.prompt.messages[0].content

'You are a helpful AI assistant.'

In [10]:
location = "Maple Street Guitars"
# result = mrkl.run(f"Search the internet about {location} and crawl at most 5 websites to find the necessary information about the price and what people usually do or order in there.")

In [11]:
# print(result)

In [12]:
# result

# Searching maps

In [13]:
from duckduckgo_search import DDGS

# DuckDuckGo client shared by the maps/text searches in the cells below.
ddgs = DDGS()

# result = list(ddgs.maps('good restaurants around Atlanta, GA', max_results=50))

## List of recommended places

In [14]:
import threading
from openai import OpenAI

def get_recommended_places(latitude="33.771030", longitude="-84.391090", radius=10):
    """Collect map results around a location, one query per user preference.

    For every preference category a maps search for "places related to
    <preference>" is issued through the module-level `ddgs` client (at most 10
    results each). Each hit is tagged with its preference and trimmed to the
    keys of interest.

    Args:
        latitude: Latitude of the search centre (converted with str()).
            The default coordinates point at the North Ave apartment.
        longitude: Longitude of the search centre (converted with str()).
        radius: Search radius forwarded to `ddgs.maps`.

    Returns:
        A list of dicts holding whichever of 'title', 'address', 'latitude',
        'longitude', 'phone' and 'preference' were present on the hit.
    """
    # Fields copied from each map hit into the result
    keys_to_keep = ['title', 'address', 'latitude', 'longitude', 'phone', 'preference']
    # Categories a user can be interested in
    preferences_list = ['sports', 'art and culture', 'museum and history', 'food and dining', 'nature and outdoors', 'music', 'technology', 'shopping', 'movies and entertainment']

    recommended = []
    for preference in preferences_list:
        hits = ddgs.maps(
            f"places related to {preference}",
            latitude=str(latitude),
            longitude=str(longitude),
            radius=radius,
            max_results=10,
        )
        for hit in hits:
            # Tag the hit itself so 'preference' survives the key filter below
            hit['preference'] = preference
            recommended.append({key: hit[key] for key in keys_to_keep if key in hit})
    return recommended

def generate_information(place_name, client):
    """Ask the chat model for a short description of `place_name`.

    Args:
        place_name: Name of the place, interpolated into the user prompt.
        client: Object exposing `chat.completions.create(...)` (an OpenAI client).

    Returns:
        The content of the first returned choice, stripped of surrounding whitespace.
    """
    user_prompt = f"""
    Based on what you know, generate about this place: {place_name}.
    Key information such as the environment and atmosphere of the place. If possible, estimate the range of the cost, and give some recommendations of what food people ordered or activities they did.
    Label them appropriately, and go to the next line for each detail.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        max_tokens=200,
        temperature=0,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def thread_worker(place, client):
    """Fill place['generated_info'] with a generated description of place['title'].

    Any exception raised while generating or storing the description is caught
    and reported with print(); in that case `place` gets no 'generated_info' key.
    """
    try:
        title = place['title']
        place['generated_info'] = generate_information(title, client)
    except Exception as err:
        print(f"Error generating information for {place['title']}: {err}")

# generate information concurrently with a bounded pool of worker threads
def process_places_concurrently(places, client, max_workers=8):
    """Run thread_worker on every place using a bounded thread pool.

    The previous version started one thread per place with no upper bound,
    so a full recommendation list fired all of its API requests at once. A
    pool caps the number of requests in flight while keeping the same result.

    Args:
        places: Iterable of place dicts; each is updated in place by thread_worker.
        client: OpenAI client forwarded to thread_worker.
        max_workers: Maximum number of concurrent workers. None lets
            ThreadPoolExecutor pick its default; values below 1 raise ValueError.

    Returns:
        The same `places` object that was passed in.
    """
    # Local import keeps this cell self-contained
    from concurrent.futures import ThreadPoolExecutor

    # Leaving the `with` block waits for every submitted task to finish
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(thread_worker, place, client) for place in places]

    # thread_worker reports its own errors; surface anything that escaped it
    # without aborting the whole batch (best-effort, as before).
    for future in futures:
        error = future.exception()
        if error is not None:
            print(f"Unexpected worker failure: {error}")

    return places

In [15]:
# Build the OpenAI client from the key loaded earlier in the notebook.
client = OpenAI(api_key=openai_api_key)

# Look up places around the given coordinates, then add a 'generated_info' entry to each one.
recommended_places_list = get_recommended_places(latitude="33.771030", longitude= "-84.391090", radius=10)
# process_places_concurrently returns the list it was given, so both names refer to the same enriched list.
updated_recommended_places_list = process_places_concurrently(recommended_places_list, client)
updated_recommended_places_list

[{'title': 'Rodney Cook Sr. Park',
  'address': '609 Thurmond St NW, Atlanta, GA  30314, United States',
  'latitude': 33.7617605739047,
  'longitude': -84.4076693058014,
  'phone': '+14048810900',
  'preference': 'sports',
  'generated_info': "Rodney Cook Sr. Park is a beautiful urban park located in the historic Vine City neighborhood of Atlanta, Georgia. The park offers a peaceful and serene environment, with lush green spaces, walking trails, and a stunning water feature.\n\nAtmosphere: The atmosphere at Rodney Cook Sr. Park is relaxed and inviting, making it a perfect place for a leisurely stroll, a picnic, or simply enjoying the natural surroundings.\n\nCost: The park is free to enter and enjoy, making it an affordable option for a day out in Atlanta.\n\nFood: Visitors to Rodney Cook Sr. Park often bring picnic lunches to enjoy in the park's designated picnic areas. Popular food choices include sandwiches, salads, and fresh fruit.\n\nActivities: Common activities at the park incl

## Generating information concurrently

## Website results for a place

In [16]:
# Materialize up to 10 text-search hits for the place into a list
detailed_result_list = list(ddgs.text('R. Thomas Deluxe Grill', max_results=10))
detailed_result_list

[{'title': 'R. Thomas Deluxe Grill',
  'href': 'https://www.rthomasdeluxegrill.net/',
  'body': 'R. Thomas Deluxe Grill R. Thomas Deluxe Grill in Atlanta, GA. Food for everyone! We use fresh vegetables, fruit and meat across our menu. We have something yummy for every type of tummy. From burgers to vegan we aim to feed you. Write a Review, Win $500! Help guests by leaving a review of your favorite dishes.'},
 {'title': 'R. THOMAS DELUXE GRILL - 1009 Photos & 1167 Reviews - Yelp',
  'href': 'https://www.yelp.com/biz/r-thomas-deluxe-grill-atlanta',
  'body': 'R. Thomas Deluxe Grill 3.9 (1,167 reviews) Claimed $$ Vegetarian, Vegan, Breakfast & Brunch Open 7:00 AM - 11:00 PM See hours Verified by the business 2 months ago See all 1.0k photos Write a review Add photo Menu Popular dishes View full menu French Toast 43 Photos 168 Reviews Thai Express 14 Photos 56 Reviews Spicy Fish Tacos 19 Photos 49 Reviews'},
 {'title': 'R. Thomas Deluxe Grill - American Restaurant in Atlanta',
  'href': 'h

In [17]:
detailed_result_list[0]

{'title': 'R. Thomas Deluxe Grill',
 'href': 'https://www.rthomasdeluxegrill.net/',
 'body': 'R. Thomas Deluxe Grill R. Thomas Deluxe Grill in Atlanta, GA. Food for everyone! We use fresh vegetables, fruit and meat across our menu. We have something yummy for every type of tummy. From burgers to vegan we aim to feed you. Write a Review, Win $500! Help guests by leaving a review of your favorite dishes.'}

In [18]:
crawl_site('https://www.rthomasdeluxegrill.net/') # not using because it does not crawl efficiently

[Document(page_content='Just a moment...Enable JavaScript and cookies to continue', metadata={'source': 'https://www.rthomasdeluxegrill.net/', 'title': 'Just a moment...', 'language': 'en-US'})]

In [19]:
!pip install beautifulsoup4==4.11.2 Flask==3.0.0

Collecting Flask==3.0.0
  Using cached flask-3.0.0-py3-none-any.whl (99 kB)
Collecting blinker>=1.6.2 (from Flask==3.0.0)
  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
Installing collected packages: blinker, Flask
  Attempting uninstall: blinker
    Found existing installation: blinker 1.4
[31mERROR: Cannot uninstall 'blinker'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.[0m[31m
[0m

In [20]:
!pip install crochet==2.1.1
!pip install Scrapy==2.11.0
!pip install readability-lxml==0.8.1



In [21]:
import crochet
crochet.setup()

import bs4
from duckduckgo_search import DDGS
from openai import OpenAI
import threading
import queue
from readability import Document
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings

In [36]:
# Chunking parameters used by extract_useful_information: each chunk holds at most
# CHUNK_SIZE characters of page text, and consecutive chunks share CHUNK_OVERLAP characters.
CHUNK_SIZE = 13500
CHUNK_OVERLAP = 100

# Module-level DuckDuckGo client used by ddgsearch (rebinds the `ddgs` name from an earlier cell).
ddgs = DDGS()

def extract_useful_information_from_single_chunk(client, url, title, text, ix, q=None):
    '''
    Ask OpenAI to extract only the useful information from one chunk of a webpage.

    client: object exposing chat.completions.create (an OpenAI client)
    url: address of the page the chunk came from (included in the prompt)
    title: title of the page (included in the prompt)
    text: the chunk of page text to analyse
    ix: position of the chunk, echoed back so callers can restore the order
    q: optional queue; when given, the same (ix, text) tuple that is returned
       is also put on it

    Returns a tuple (ix, cleaned_text). The cleaned text is stripped, and a
    leading line such as "Useful information extracted from the text:" is
    removed. An empty or missing response yields (ix, '').
    '''
    prompt = f"""
    You will be given information about a place. Your task is to extract and summarize the key information. If there is no information, simply return "No Important Information Found\n".
    Key information such as the environment and atmosphere of the place. If possible, estimate the range of the cost, and give some recommendations of what food people ordered or activities they did.
    Try not to rewrite the text, but instead extract only the useful information from the text.

    Here is a url: {url}
    Here is its title: {title}
    Here is some text extracted from the webpage:
    {text}
    """

    response = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        max_tokens=500,
        temperature=0.2,
        top_p=0.5,
        frequency_penalty=0.3,
        messages=
        [
            {
                "role": "system",
                "content": "You are a helpful assistant to help finding important information.",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],

    )

    # The message content may be None or empty; treat both as "nothing extracted"
    cleaned = (response.choices[0].message.content or '').strip()

    # sometimes the first line is something like "Useful information extracted from the text:", so we remove that
    lines = cleaned.splitlines()
    if lines and "useful information" in lines[0].lower():
        cleaned = '\n'.join(lines[1:])

    result = (ix, cleaned)

    # Publish the cleaned result (previously the raw response was queued, so
    # threaded callers never saw the clean-up above).
    if q is not None:
        q.put(result)

    return result

def extract_useful_information(client, url, title, text, max_chunks):
    '''
    This function takes the url, title, and text of a webpage.
    It returns the most useful information from the text.

    It does this by breaking the text into chunks of CHUNK_SIZE characters
    (consecutive chunks overlap by CHUNK_OVERLAP characters), keeping at most
    max_chunks of them, and calling extract_useful_information_from_single_chunk
    on each chunk (which in turn calls openai).
    It then joins the non-empty results from all the chunks, in chunk order,
    with one newline between them.

    It uses threading to do this in parallel, because openai is slow.
    '''
    # Create the chunks with the specified size and overlap
    step = CHUNK_SIZE - CHUNK_OVERLAP
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + CHUNK_SIZE])
        # Once a chunk reaches the end of the text, a further chunk would only
        # repeat the overlap region and cost an extra API call.
        if start + CHUNK_SIZE >= len(text):
            break
    chunks = chunks[:max_chunks]

    q = queue.Queue()

    threads = [
        threading.Thread(
            target=extract_useful_information_from_single_chunk,
            args=(client, url, title, chunk, ix, q),
        )
        for ix, chunk in enumerate(chunks)
    ]
    for t in threads:
        t.start()

    # Wait for all threads to complete
    for t in threads:
        t.join()

    # Get all the results from the queue
    results = []
    while not q.empty():
        results.append(q.get())

    # Threads finish in arbitrary order, so sort the results by chunk index
    results.sort(key=lambda x: x[0])

    # Join with a newline so neighbouring summaries do not run into each other
    return '\n'.join(x[1] for x in results if x[1])

def readability(input_text):
    '''
    Extract the main content of a page as plain text.

    The input is handed to Document from the readability library (roughly a
    python port of readability.js, the javascript library firefox uses to pull
    the useful part out of a webpage). Its summary() is html, so bs4 is used to
    reduce that summary to text.
    '''
    summary_html = Document(input_text).summary()
    return bs4.BeautifulSoup(summary_html, 'html.parser').get_text()

def remove_duplicate_empty_lines(text):
    '''
    Collapse repeated whitespace while keeping the line structure of the text.

    - Windows/Mac line endings are normalized to "\\n".
    - Runs of spaces (and other non-newline, non-tab whitespace) become one space.
    - Runs of tabs become one tab.
    - Runs of newlines, including blank lines that hold only spaces or tabs,
      become one newline.

    The previous version began with re.sub(r'\\s+', ' ', ...), which also
    replaced newlines and tabs, so the later substitutions never matched and
    the text was flattened onto a single line.

    Returns the cleaned string; an empty string is returned unchanged.
    '''
    # Normalize line endings so only "\n" has to be handled below
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Replace multiple spaces with a single space ([^\S\n\t] = whitespace except newline/tab)
    text = re.sub(r'[^\S\n\t]+', ' ', text)

    # Replace multiple tabs with a single tab
    text = re.sub(r'\t{2,}', '\t', text)

    # Replace multiple newlines (optionally separated by blank-looking lines) with a single newline
    text = re.sub(r'\n(?:[ \t]*\n)+', '\n', text)

    return text

class MySpider(scrapy.Spider):
    '''
    This is the spider that will be used to crawl the webpages. We give this to the scrapy crawler.

    It requests every URL in `start_urls` and records one dict per downloaded
    page with the keys 'url', 'title', 'text' (always '' here) and 'useful_text'.
    The dicts are appended to the class-level `results` list, which ddgsearch
    resets before a crawl and reads afterwards.
    '''
    name = 'myspider'
    start_urls = None
    clean_with_llm = False
    # Deliberately class-level: ddgsearch reads MySpider.results, so it must stay shared.
    results = []

    def __init__(self, start_urls, clean_with_llm, *args, **kwargs):
        '''
        start_urls: list of URLs to download
        clean_with_llm: if True, page text is cleaned with openai; otherwise with readability
        '''
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = start_urls
        self.clean_with_llm = clean_with_llm

    def start_requests(self):
        '''Yield one request per start URL, each handled by parse().'''
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        '''Extract title and useful text from one response and append them to `results`.'''
        # errors='replace' keeps pages that are not valid UTF-8 from raising
        # UnicodeDecodeError and losing the whole result.
        body_html = response.body.decode('utf-8', errors='replace')

        url = response.url

        soup = bs4.BeautifulSoup(body_html, 'html.parser')
        # soup.title.string can be None even when a <title> tag exists, so fall
        # back in that case too. str() stores a plain string rather than a bs4
        # object tied to the parse tree.
        title_string = soup.title.string if soup.title else None
        title = str(title_string) if title_string else "No Title Found"

        text = soup.get_text()
        text = remove_duplicate_empty_lines(text)

        if self.clean_with_llm:
            client = OpenAI(api_key=openai_api_key)
            # At most 50 chunks of the page text are sent to the model
            useful_text = extract_useful_information(client, url, title, text, 50)
        else:
            useful_text = readability(body_html)
        useful_text = remove_duplicate_empty_lines(useful_text)

        self.results.append({
            'url': url,
            'title': title,
            # 'text': text,
            'text': '',
            'useful_text': useful_text
        })



@crochet.run_in_reactor
def run_spider(url_list, clean_with_llm):
    '''
    Start a MySpider crawl over `url_list` and return the deferred from
    CrawlerRunner.crawl (wrapped by the crochet.run_in_reactor decorator).

    url_list: URLs handed to the spider as start_urls
    clean_with_llm: forwarded to the spider unchanged
    '''
    settings = Settings()
    settings.setdict({
        # disable logging.
        'LOG_ENABLED': False,
        # randomize the wait between requests
        # NOTE(review): presumably scales DOWNLOAD_DELAY by 0.5-1.5 — confirm it has an effect with the default delay
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    })

    # Runner configured with the custom settings above
    runner = CrawlerRunner(settings)
    return runner.crawl(MySpider, start_urls=url_list, clean_with_llm=clean_with_llm)

def ddgsearch(query, numresults=10, clean_with_llm=False, timeout=20.0):
    '''
    This function performs a search on duckduckgo and returns the results.
    It uses the scrapy library to download the pages and extract the useful information.
    It extracts useful information from the pages using either the readability library
    or openai, depending on the value of clean_with_llm.

    query: the query to search for
    numresults: the number of results to return
    clean_with_llm: if True, use openai to clean the text. If False, use readability.
    timeout: seconds to wait for the crawl to finish (default 20.0, the value
        that used to be hard-coded)

    Returns the list of result dicts collected in MySpider.results.
    Raises Exception (chained to crochet.TimeoutError) if the crawl does not
    finish within `timeout` seconds.
    '''

    # perform the search
    search_hits = list(ddgs.text(query, max_results=numresults))

    # get the urls, skipping hits that carry no link
    urls = [hit['href'] for hit in search_hits if hit.get('href')]
    urls = urls[:numresults]

    print(urls)

    # The spider appends to this class-level list, so start from a fresh one
    MySpider.results = []
    eventual_result = run_spider(urls, clean_with_llm)

    # Wait for the specified time or until the crawl is done. The pages are
    # collected in MySpider.results, so the wait() return value is not needed.
    try:
        eventual_result.wait(timeout=timeout)
    except crochet.TimeoutError as exc:
        raise Exception("The scraping operation timed out.") from exc

    return MySpider.results

def summarize_reviews(client, crawled_reviews, place_type):
    '''
    Summarize the crawled text about a place with one chat completion.

    client: object exposing chat.completions.create (an OpenAI client)
    crawled_reviews: iterable of dicts; their 'useful_text' values are
        concatenated, in order, with no separator
    place_type: 'food and dining' selects the restaurant prompt; any other
        value selects the generic place prompt

    Returns the content of the first returned choice, stripped.
    '''
    all_reviews = ''.join(review['useful_text'] for review in crawled_reviews)

    if place_type != 'food and dining':
        prompt = f"""
        "From the following paragraph about a place, please identify and summarize the key details regarding:
        1) the estimated cost of entry,
        2) the most popular activitiess,
        and 3) the environment and atmosphere of the place.
        {all_reviews}"
        """
    else:
        prompt = f"""
        "From the following paragraph about a restaurant, please identify and summarize the key details regarding:
        1) the estimated cost in the restaurant,
        2) the most popular dishes or what people commonly order,
        and 3) the environment and atmosphere of the restaurant.
        {all_reviews}"
        """

    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        max_tokens=500,
        temperature=0,
        messages=conversation,
    )
    summary = completion.choices[0].message.content
    return summary.strip()


In [37]:
# End-to-end example: search for the place, crawl the top hits, then summarize what was found.
place_name = "R. Thomas Deluxe Grill"
place_type = "food and dining"
# clean_with_llm=True sends each crawled page through the OpenAI extraction path instead of readability.
results = ddgsearch(place_name, numresults=5, clean_with_llm=True)
client = OpenAI(api_key=openai_api_key)
# 'food and dining' selects the restaurant-specific prompt in summarize_reviews.
summarize_reviews(client, results, place_type)

'1) The estimated cost in the restaurant is moderate ($$).\n2) The most popular dishes or what people commonly order include the spicy fish tacos, "The Champ" fresh-made juice, big breakfast special with French toast and strawberries, French toast, Thai Express, and Mojo Jojo smoothie.\n3) The environment and atmosphere of the restaurant is described as laid back with vibrant and whimsical decor, including caged parrots outside. It is known for its unique and fun atmosphere, with spot-on service and amazing food.'

In [33]:
results

[{'url': 'https://rthomasdeluxegrill.business.site/',
  'title': 'R. Thomas Deluxe Grill - American Restaurant in Atlanta',
  'text': '',
  'useful_text': 'The R. Thomas Deluxe Grill in Atlanta is known for its unique and fun atmosphere, with spot-on service and amazing food. The restaurant is open until 5:00 AM on Thursdays, Fridays, and Saturdays. Customers have recommended the spicy fish tacos, "The Champ" fresh-made juice, and the big breakfast special with French toast and strawberries. The food is described as delicious and reasonably priced. No cost range was provided.'},
 {'url': 'https://www.facebook.com/rthomasdeluxegrill/',
  'title': 'R. Thomas Deluxe Grill | Atlanta GA',
  'text': '',
  'useful_text': "I'm sorry, but I cannot access external websites or URLs. Therefore, I am unable to extract information from the provided URL. If you have specific information about a place that you would like me to summarize, please provide the details directly."},
 {'url': 'https://www.ye