In [1]:
from bs4 import BeautifulSoup
import foodBanks
from us import states
import pickle
import pyperclip
import requests

In [2]:
# Collection exposed by the project-local `foodBanks` module; a later cell uses
# it to filter out food banks whose name it already contains.
# NOTE(review): the parsing cell below rebinds the name `foodBanks` to a list,
# so re-running this cell afterwards fails until the kernel is restarted.
currentFoodBanks = foodBanks.currentFoodBanks

In [3]:
# Parse the saved directory page into a list of {"name", "url", "state"} dicts.
# BeautifulSoup reads the whole file when constructed, so the file only needs
# to stay open for that one call.
with open("FoodBankList.html") as file:
    soup = BeautifulSoup(file, "html.parser")

divs = soup.find_all("div", class_="results-box")

# NOTE(review): this rebinds `foodBanks`, shadowing the module imported in the
# first cell. Later cells rely on this name, so it is kept as-is here.
foodBanks = []
for div in divs:
    h3 = div.find("h3", class_="name")
    p = div.find("p")
    if not h3 or not p:
        # Entry without a heading or address paragraph: nothing to record.
        continue

    # The state abbreviation is taken as the second-to-last space-separated
    # token of the first paragraph's text.
    address_tokens = p.get_text().split(" ")
    if len(address_tokens) < 2:
        continue
    state_lookup = states.lookup(address_tokens[-2])
    if not state_lookup:
        continue

    url_paragraph = div.find("p", class_="url")
    anchor = url_paragraph.find("a") if url_paragraph else None
    href = anchor.get("href") if anchor else None
    if not href:
        # No usable homepage link for this entry.
        continue

    # Hrefs are prefixed with "https:" as before; an href that already carries
    # a scheme is used unchanged so it is not double-prefixed.
    url = href if href.startswith("http") else f"https:{href}"

    foodBanks.append({
        "name": h3.get_text(),
        "url": url,
        "state": state_lookup.name
    })

In [4]:
# Keep only the scraped entries whose name is not already in currentFoodBanks.
newFoodBanks = [
    candidate
    for candidate in foodBanks
    if candidate["name"] not in currentFoodBanks
]

In [5]:
from loguru import logger
import coloredlogs
import sys

# Colour name -> numeric code that `log` places inside an "\033[<code>m"
# escape sequence; 'reset' maps to '0'.
color_codes = dict(
    black='30',
    red='31',
    green='32',
    yellow='33',
    blue='34',
    magenta='35',
    cyan='36',
    white='37',
    reset='0',
)

def log(level, title, message, color):
    """Emit a two-line coloured record through loguru.

    Args:
        level: Log level name in any casing; it is upper-cased before use.
        title: Heading line, always wrapped in colour code 37.
        message: Body line, rendered with the code looked up for `color`.
        color: Key of `color_codes` (case-insensitive); unknown names fall
            back to code '37'.
    """
    body_code = color_codes.get(color.lower(), '37')
    heading = f"\033[37m{title}\n"
    body = f"\033[{body_code}m{message}\033[0m"
    logger.log(level.upper(), heading + body)

In [6]:
# In-memory cache used by `use_ollama`: maps a key built from the function
# name, system prompt and user prompts to the extracted output.
# Re-running this cell empties the cache.
ollama_cache = {}

In [7]:
import re
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate

# Number of ancestor levels `get_parent_html` climbs from the element that
# contains an email address; also interpolated into `summary_prompt`.
parent_count = 5

# System prompt template asking the model to explain why an email address
# appears on a page. Filled with str.format; placeholders: {url}, {parent_count}.
# The expected reply ends with a "Context: ..." line that the caller parses.
summary_prompt = """
Imagine you are a robot browsing the web, bu you can only view html. Now you need to complete a task. You are on {url}, and you have found an email address. Your job is to explain why the email address is on the page (i.e. its purpose). You will recieve an email address and a the html of its {parent_count}th parent element from the website. Using the parent element, you can explain why the email address is on the page (i.e. the context of the email address). Note that {parent_count} is chosen arbitrarily, so the html likely contains some irrelevant information to the context of the email address. Be careful about which html elemetns are children of eachother. Provide only the relevant information as the context. Do not hallucinate any context. 

The user will provide the following (in the following order):

1) The email address
2) The HTML of the {parent_count}th parent element

Then, your reply should strictly follow the format:

Thought: [Your thoughts (give any and all info that will help you choose the context and name)]
Context: [The context of the email address (what the email address is doing on the page)]            
"""

# System prompt asking the model to extract a full name for an email address
# from its context. It contains no format placeholders, so the
# `.format(food_bank=...)` call in `get_name` leaves the text unchanged.
# The expected reply ends with a "Name: ..." line that the caller parses.
name_prompt = """
You will recieve an email address and its context (i.e. text explaining the [possible] purpose of the email address). Your job is to extract the full name [first and last name] associated with the email address, using the context. It is likely that the name is not mentioned in the context, in which case you should return 'None'. If you can't find a full name [first and last], don't bother returning a partial name—just return 'None'.

The user will provide the following (in the following order):

1) The email address
2) The context

Then, your reply should strictly follow the format:

Thought: [Your thoughts (give any and all info that will help you extract the name)]
Name: [The full name associated with the email address or 'None']
"""

# System prompt template used when a page yielded no email addresses: the
# model picks the link most likely to lead to the main contact email.
# Filled with str.format; placeholder: {food_bank}.
# The expected reply ends with a "Link: ..." line that the caller parses.
link_prompt = """
You will recieve a list of links from a webpage of the food_bank {food_bank}. Your job is to select the link that is most likely to contain the main contact email address for the food bank.

The user will provide the following:

1) [link1], [link2], [link3], ... (a list of links on the page separated by commas)

Then, your reply should strictly follow the format:

Thought: [Your thoughts (give any and all info that will help you select the link)]
Link: [The link that most likely contains the main email address]
"""

# System prompt template used when a page yielded at least one email address:
# the model either names the main email or suggests the next link to visit.
# Filled with str.format; placeholders: {food_bank}, {url}, {email_count}.
# The expected reply contains an "Email: ..." line followed by a "Link: ..."
# line; both are parsed by the caller.
email_prompt = """
Your main task is to find the main email address for {food_bank}. On {url}, {email_count} email addresses were found. Each email address also has a context which describes why that email address is on the page, given its surrounding html. You will also be provided with a list of links on the page. 

If you believe that an email address is the main email address, return that email address; only do this if you are 100 percent confident it is the main email address. If you don't believe the main email address is in the list of emails, provide the link that you believe is most likely to contain the main email address.

The user will provide the following (in the following order):

1) [email1]: [context1]; 
[email2]: [context2]; 
[email3: [context3]; 
... (a list of emails and their contexts separated by semicolons)

2) [link1], [link2], [link3], ... (a list of links on the page separated by commas)

Then, your reply should strictly follow the format:

Thought: [Your thoughts (give any and all info that will help you select the main email address or the next link)]
Email: [The main email address or None]
Link: [The link that most likely contains the main email address or None]          
"""


def get_emails(url, timeout=30):
    """Return the distinct email addresses found in the visible text of a page.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before `requests` raises,
            instead of blocking indefinitely.

    Returns:
        A sorted list of unique email-like strings. Sorting keeps the order
        stable between runs, so prompts built from this list (and therefore
        the prompt cache keys) do not change with set iteration order.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    # A space separator keeps addresses in adjacent elements from fusing.
    text = soup.get_text(separator=' ')
    matches = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    return sorted(set(matches))


def get_links(url, timeout=30):
    """Return the set of absolute links on a page that respond with HTTP 200.

    Each candidate link is probed individually. The previous version probed
    the global `current_link` for every candidate, so the links themselves
    were never validated.

    Args:
        url: Page whose anchors are collected.
        timeout: Seconds to wait on each request before giving up.

    Returns:
        A set of href strings that start with 'http' and answered with
        status code 200.

    Raises:
        requests.RequestException: If `url` itself cannot be fetched.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    hrefs = {anchor.get('href') for anchor in soup.find_all('a')}
    # Anchors without an href yield None; relative links are skipped as before.
    candidates = {href for href in hrefs if href and href.startswith('http')}

    def is_reachable(link):
        # A link that times out or fails to connect counts as unreachable
        # rather than aborting the whole scan.
        try:
            return requests.get(link, timeout=timeout).status_code == 200
        except requests.RequestException:
            return False

    return {link for link in candidates if is_reachable(link)}


def get_parent_html(url, text, timeout=30):
    """Return prettified HTML of an ancestor of the first element containing `text`.

    The first tag whose `.string` contains `text` is located, then the tree is
    climbed `parent_count` levels (stopping early at the root). `class`, `id`
    and `style` attributes are stripped from the resulting subtree.

    Args:
        url: Page to download.
        text: Substring to look for (an email address in this notebook).
        timeout: Seconds to wait for the server before `requests` raises.

    Returns:
        The prettified HTML of the ancestor, or '' when no tag contains `text`.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # `find` returns None when nothing matches, which replaces indexing
    # `find_all(...)[0]` inside a bare `except`.
    element = soup.find(lambda tag: tag.string and text in tag.string)
    if element is None:
        return ''

    # Climb at most `parent_count` levels; stay put once the root is reached.
    ancestor = element
    for _ in range(parent_count):
        if ancestor.parent is None:
            break
        ancestor = ancestor.parent

    irrelevant_attrs = ['class', 'id', 'style']

    # Strip the listed attributes from the ancestor and all its descendants.
    for tag in [ancestor, *ancestor.find_all(True)]:
        for attr in irrelevant_attrs:
            if tag.has_attr(attr):
                del tag[attr]

    return ancestor.prettify()


def use_ollama(func_name, system_prompt, user_prompts, output_getter):
    """Run one chat completion against the local `llama3` model, with caching.

    Args:
        func_name: Label used in log titles and as part of the cache key.
        system_prompt: Content of the system message.
        user_prompts: List of strings; each becomes one human message.
        output_getter: Callable that receives the raw model reply (a string)
            and returns the extracted value. It may raise on malformed replies.

    Returns:
        The value produced by `output_getter`, or None when extraction fails.
        The result (including None) is stored in `ollama_cache` and returned
        directly on later calls with the same name and prompts.
    """
    key = f'{func_name} {system_prompt} {" ".join(user_prompts)}'
    if key in ollama_cache:
        log('DEBUG', f'{func_name}: Cached Output', ollama_cache[key], 'cyan')
        return ollama_cache[key]

    log('DEBUG', f'{func_name}: System Prompt', system_prompt, 'blue')
    for i, user_prompt in enumerate(user_prompts):
        log('DEBUG', f'{func_name}: User Prompt {i + 1}',
            user_prompt, 'blue')

    llm = ChatOllama(model="llama3", temperature=0)

    messages = [
        SystemMessage(content=system_prompt),
        *[HumanMessage(content=user_prompt) for user_prompt in user_prompts]
    ]

    prompt = ChatPromptTemplate.from_messages(messages)
    chain = prompt | llm | StrOutputParser()
    response = chain.invoke({})

    try:
        output = output_getter(response)
        log('DEBUG', f'{func_name}: Output', output, 'cyan')
    except Exception:
        # A reply that does not follow the requested format is tolerated.
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop a long scraping run.
        output = None
        log('Warning', f'{func_name}: Error',
            'Could not extract output', 'red')

    ollama_cache[key] = output

    return output


async def summarize_email_parent(email, html, food_bank, url):
    """Ask the model why `email` appears on the page at `url`.

    Args:
        email: The email address; sent as the first user message.
        html: HTML surrounding the address; sent as the second user message.
        food_bank: Passed to the prompt formatter (the template has no
            matching placeholder).
        url: Page address interpolated into the system prompt.

    Returns:
        The text following 'Context: ' in the model reply, stripped, or None
        when `use_ollama` could not extract it.
    """
    def extract_context(response):
        return response.split('Context: ')[1].strip()

    filled_prompt = summary_prompt.format(
        parent_count=parent_count, food_bank=food_bank, url=url)

    return use_ollama("Summarize Context", filled_prompt,
                      [email, html], extract_context)


def get_name(email, context, food_bank):
    """Return the person's name tied to `email`, falling back to `food_bank`.

    Args:
        email: The email address; sent as the first user message.
        context: Description of the address's purpose; second user message.
        food_bank: Value returned when the model gives no usable name.

    Returns:
        The text after 'Name: ' in the model reply, or `food_bank` when that
        text is empty, missing, or contains 'None'.
    """
    def extract_name(response):
        return response.split('Name: ')[1].strip()

    extracted = use_ollama('Get Name', name_prompt.format(food_bank=food_bank),
                           [email, context], extract_name)

    if not extracted or 'None' in extracted:
        return food_bank
    return extracted


_EMAIL_PATTERN = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
# NOTE(review): this character class has no '_', '?', '=', '&' or '%', so
# links containing them are cut at the first such character. Kept as in the
# original; widen it if truncated links show up in the logs.
_LINK_PATTERN = r"https?://[a-zA-Z0-9./-]+"


def _first_match(pattern, text):
    """Return the first substring of `text` matching `pattern`, or None."""
    found = re.search(pattern, text)
    return found.group(0) if found else None


def get_main_email(food_bank, url, emails_w_context, links):
    """Ask the model for the food bank's main email, or the next link to try.

    Args:
        food_bank: Food bank name interpolated into the prompt.
        url: Page address interpolated into the prompt (unused when
            `emails_w_context` is empty).
        emails_w_context: List of {'email', 'context'} dicts found on the page.
        links: Iterable of link strings offered to the model.

    Returns:
        A `(email_w_context, link)` tuple. `email_w_context` is the entry of
        `emails_w_context` whose address the model chose, or None. `link` is
        the first URL found after 'Link: ' in the reply, or None.
    """
    if not emails_w_context:
        # No candidates on this page: only ask which link to follow next.
        system_prompt = link_prompt.format(food_bank=food_bank)

        def output_getter(response):
            return _first_match(_LINK_PATTERN, response.split('Link: ')[1].strip())

        link = use_ollama('Get Link', system_prompt,
                          [', '.join(links)], output_getter)
        return None, link

    system_prompt = email_prompt.format(
        food_bank=food_bank, url=url, email_count=len(emails_w_context))
    emails_prompt = ';\n'.join(
        f'{entry["email"]}: {entry["context"]}' for entry in emails_w_context)
    links_prompt = ', '.join(links)

    def output_getter(response):
        # Both markers must be present; a missing one raises IndexError,
        # which `use_ollama` reports as a failed extraction.
        email_part = response.split('Email: ')[1].split('Link: ')[0].strip()
        link_part = response.split('Link: ')[1].strip()
        return {
            'email': _first_match(_EMAIL_PATTERN, email_part),
            'link': _first_match(_LINK_PATTERN, link_part),
        }

    output = use_ollama('Main Email', system_prompt,
                        [emails_prompt, links_prompt], output_getter)

    email = output['email'] if output else None
    link = output['link'] if output else None

    email_w_context = next(
        (entry for entry in emails_w_context if entry['email'] == email), None)

    # The model named an address that is not among the candidates. The
    # original tested `emails_w_context` here, which is never empty in this
    # branch, so the warning could not fire.
    if email and not email_w_context:
        log("Warning", "Email Not Found", email, "red")

    return email_w_context, link


new_rows = []

for food_bank in newFoodBanks:
    log('Info', 'New Food Bank',
        f'{food_bank["name"]} in {food_bank["state"]} w homepage {food_bank["url"]}', 'white')
    main_email_w_context = None
    current_link = food_bank['url']
    visited_links = set()
    all_links = set()
    i = 0

    while not main_email_w_context and current_link and i < 5:
        log('Debug', 'Current Link', current_link, 'blue')

        if requests.get(current_link).status_code != 200:
            log('Warning', 'Invalid Link', 'This link is invalid', 'red')
            break

        if current_link in visited_links:
            log('Warning', 'Already Visited',
                'This link has already been visited', 'red')
            break

        visited_links.add(current_link)

        emails = get_emails(current_link)
        emails_w_html = [{'email': email, 'html': get_parent_html(current_link, email)} for email in emails]
        emails_w_context = [{'email': email_w_html['email'], 'context': await summarize_email_parent(email_w_html['email'], email_w_html['html'], food_bank['name'], current_link)} for email_w_html in emails_w_html]

        new_links = get_links(current_link)
        all_links = all_links.union(new_links)
        links = list(all_links.difference(visited_links))

        main_email_w_context, current_link = get_main_email(
            food_bank['name'], food_bank['url'], emails_w_context, links)

        i += 1

    if i == 5:
        log('Warning', 'Max Iterations Reached ❌',
            'Max iterations reached', 'red')

    if main_email_w_context:
        name = get_name(
            main_email_w_context['email'], main_email_w_context['context'], food_bank['name'])
        new_rows.append({
            'food_bank': food_bank['name'],
            'state': food_bank['state'],
            'url': food_bank['url'],
            'email': main_email_w_context['email'],
            'name': name
        })
        log('Info', f'Main Email Found for {food_bank["name"]} ✅',
            main_email_w_context['email'], 'white')

[32m2024-06-10 01:01:56.423[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog[0m:[36m20[0m - [1m[37mNew Food Bank
[37mFood Bank of Alaska, Inc. in Alaska w homepage https://www.foodbankofalaska.org/[0m[0m
[32m2024-06-10 01:01:56.424[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mlog[0m:[36m20[0m - [34m[1m[37mCurrent Link
[34mhttps://www.foodbankofalaska.org/[0m[0m
[32m2024-06-10 01:01:57.752[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mlog[0m:[36m20[0m - [34m[1m[37mSummarize Context: System Prompt
[34m
Imagine you are a robot browsing the web, bu you can only view html. Now you need to complete a task. You are on https://www.foodbankofalaska.org/, and you have found an email address. Your job is to explain why the email address is on the page (i.e. its purpose). You will recieve an email address and a the html of its 5th parent element from the website. Using the parent element, you can explain why the email address is on the page (i.e. the co

In [53]:
# Persist the collected rows to rows.pkl.
# NOTE(review): `rows` is not defined in any visible cell above (the scraping
# loop builds `new_rows`), so on a fresh kernel this cell fails unless rows.pkl
# is loaded first or `rows` is derived from `new_rows` — confirm the intent.
with open('rows.pkl', 'wb') as file:
    pickle.dump(rows, file)

[{'food_bank': 'Food Bank of Alaska, Inc.', 'state': 'Alaska', 'url': 'https://www.foodbankofalaska.org/', 'email': 'lartola@foodbankofalaska.org', 'name': 'Food Bank of Alaska'}]


In [None]:
# Reload the previously saved rows from rows.pkl.
# pickle.load can execute arbitrary code; only load a rows.pkl that this
# notebook produced itself.
with open('rows.pkl', 'rb') as file:
    rows = pickle.load(file)

In [None]:
# Position in `states.STATES` of the state handled by the next cell.
index = 0

In [None]:
# Copy the rows for the state at `index` to the clipboard as tab-separated
# lines (food bank, contact name, email), or report that the state has none.
state_food_banks = list(
    filter(lambda row: row["state"] == states.STATES[index].name, rows))
current_state = states.STATES[index]

if not state_food_banks:
    print(f"No food banks found in {current_state}")
else:
    print(f"{index}  {current_state} has {len(state_food_banks)} food banks. Copying to clipboard...")
    clipboard_lines = []
    for entry in state_food_banks:
        clipboard_lines.append(
            f"{entry['food_bank']}\t{entry['name']}\t{entry['email']}")
    copyable_text = '\n'.join(clipboard_lines)
    pyperclip.copy(copyable_text)

In [None]:
# Advance to the next state. This cell is not idempotent: each run moves the
# index one step further, so it is meant to be alternated with the cell above.
index += 1