In [1]:
import os
import pickle
import sys
import time
from collections import defaultdict
from typing import Dict, List

import requests
from dotenv import load_dotenv


import os
from typing import List, Dict
from dotenv import load_dotenv
from youdotcom import You

load_dotenv()

True

In [2]:
from langchain_community.tools import BraveSearch
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_community.tools import DuckDuckGoSearchRun

from exa_py import Exa
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_community.tools import JinaSearch
from langchain_community.utilities import SearchApiAPIWrapper
from serpapi import GoogleSearch
from langchain_community.tools.tavily_search import TavilySearchResults
from duckduckgo_search import DDGS
from tavily import TavilyClient

import pandas as pd

In [3]:
def get_brave(q, ans_count=10):
    """Query Brave Search via the LangChain tool and via the raw REST endpoint.

    Args:
        q: Search query string.
        ans_count: Number of results requested from both the tool and the raw
            web-search endpoint.

    Returns:
        Dict with two keys: ``"tool_response"`` (the value returned by
        ``BraveSearch.run``) and ``"raw_response"`` (the decoded JSON body of
        the web-search endpoint).

    Raises:
        requests.RequestException: If the raw request fails or times out.
    """
    BRAVE_API_KEY = os.getenv("BRAVE_API_KEY")

    tool = BraveSearch.from_api_key(
        api_key=BRAVE_API_KEY, search_kwargs={"count": ans_count}
    )

    headers = {
        "X-Subscription-Token": BRAVE_API_KEY,
        "Accept": "application/json",
    }
    # ``count`` is sent on the raw request as well, so ``ans_count`` applies to
    # both responses instead of only to the tool.
    params = {"q": q, "count": ans_count, "extra_snippets": True}

    # Pause before issuing the two calls (presumably to stay under the API
    # rate limit — confirm against the subscription plan).
    time.sleep(1)

    tool_response = tool.run(q)
    # A timeout keeps one stalled connection from hanging the whole batch run.
    raw_response = requests.get(
        "https://api.search.brave.com/res/v1/web/search",
        headers=headers,
        params=params,
        timeout=30,
    )
    return {
        "tool_response": tool_response,
        "raw_response": raw_response.json(),
    }


def get_duckduckgo(q, ans_count=10):
    """Search DuckDuckGo through the LangChain results tool.

    Args:
        q: Search query string.
        ans_count: Value passed to the tool as ``num_results``.

    Returns:
        Whatever ``DuckDuckGoSearchResults.invoke`` returns (the tool is
        configured with ``output_format="list"``), or an empty list when the
        search raises.
    """
    search = DuckDuckGoSearchResults(
        output_format="list", num_results=ans_count, return_direct=True
    )

    try:
        return search.invoke(q)
    except Exception as e:
        # Report the failure and return an empty result, as get_you and
        # get_jina do, instead of handing the exception object back as data.
        print(f"Error in DuckDuckGo search: {e}")
        return []


# def search_and_contents(query: str):
#     """Search for webpages based on the query and retrieve their contents."""
#     # This combines two API endpoints: search and contents retrieval
#     return exa.search_and_contents(
#         query, use_autoprompt=True, num_results=5, text=True, highlights=True
#     )


# def find_similar_and_contents(url: str):
#     """Search for webpages similar to a given URL and retrieve their contents.
#     The url passed in should be a URL returned from `search_and_contents`.
#     """
#     # This combines two API endpoints: find similar and contents retrieval
#     return exa.find_similar_and_contents(url, num_results=5, text=True, highlights=True)


def get_exa(q, ans_count=10):
    """Search Exa for ``q`` and request page text for each hit.

    Args:
        q: Search query string.
        ans_count: Value passed to Exa as ``num_results``.

    Returns:
        Whatever ``Exa.search_and_contents`` returns for the query.
    """
    client = Exa(api_key=os.getenv("EXA_API_KEY"))
    return client.search_and_contents(q, text=True, num_results=ans_count)


def get_google(q, ans_count=10):
    """Search Google through ``GoogleSearchAPIWrapper`` wrapped in a ``Tool``.

    Args:
        q: Search query string.
        ans_count: Number of results requested from the wrapper.

    Returns:
        Whatever the tool invocation returns for the query.
    """
    wrapper = GoogleSearchAPIWrapper(
        google_api_key=os.getenv("GOOGLE_API_KEY"),
        google_cse_id=os.getenv("GOOGLE_CSE_ID"),
    )

    def _top_results(query, limit=ans_count):
        # The tool supplies only the query; the result limit comes from the
        # default bound at definition time.
        return wrapper.results(query, limit)

    search_tool = Tool(
        name="google_search",
        description="Search Google for recent results.",
        func=_top_results,
    )
    return search_tool.invoke({"query": q})


def get_google_serper(q, ans_count=10):
    """Search Google through the Serper wrapper.

    Args:
        q: Search query string.
        ans_count: Value passed to the wrapper as ``k``.

    Returns:
        Whatever ``GoogleSerperAPIWrapper.results`` returns for the query.
    """
    api_key = os.getenv("SERPER_API_KEY")
    return GoogleSerperAPIWrapper(serper_api_key=api_key, k=ans_count).results(q)


def get_searchapi(q, ans_count=10):
    """Search through the SearchApi wrapper.

    Args:
        q: Search query string.
        ans_count: Value passed to the wrapper constructor as ``num_results``.

    Returns:
        Whatever ``SearchApiAPIWrapper.results`` returns for the query.
    """
    # NOTE(review): confirm that SearchApiAPIWrapper actually honours a
    # ``num_results`` constructor argument.
    wrapper = SearchApiAPIWrapper(
        searchapi_api_key=os.getenv("SEARCHAPI_API_KEY"),
        num_results=ans_count,
    )
    return wrapper.results(q)


def get_serpapi(q, ans_count=10):
    """Search Google through SerpApi and return the parsed response.

    Args:
        q: Search query string.
        ans_count: Value sent as the ``num`` request parameter.

    Returns:
        Whatever ``GoogleSearch.get_dict`` returns for the request.
    """
    request_params = {
        "q": q,
        "api_key": os.getenv("SERP_API"),
        "output": "html",
        "num": ans_count,
    }
    return GoogleSearch(request_params).get_dict()


def get_tavily(q, ans_count=10):
    """Run an "advanced"-depth Tavily search and return its result list.

    Args:
        q: Search query string.
        ans_count: Value passed to Tavily as ``max_results``.

    Returns:
        The ``"results"`` entry of the Tavily response.
    """
    tavily = TavilyClient(os.getenv("TAVILY_API_KEY"))
    payload = tavily.search(
        query=q,
        search_depth="advanced",
        max_results=ans_count,
    )
    return payload["results"]


def get_you(q: str, ans_count: int = 10) -> List[Dict]:
    """
    Run a You.com unified search with live crawling enabled for all results.

    Args:
        q: Search query
        ans_count: Value passed to the API as ``count`` (default: 10)

    Returns:
        The output of ``results.results.model_dump()`` on success, or an
        empty list if anything raises.
    """
    api_key = os.getenv("YDC_API_KEY")

    try:
        with You(api_key) as client:
            response = client.search.unified(
                query=q,
                count=ans_count,
                livecrawl="all",
            )
            # Dump the model to plain Python data before the client closes.
            return response.results.model_dump()
    except Exception as err:
        print(f"Error in You.com search: {err}")
        return []


def get_jina(q: str, ans_count: int = 10) -> List[Dict[str, str]]:
    """
    Search using Jina AI API and return formatted results.

    Args:
        q: Search query
        ans_count: Maximum number of results to return (default: 10)

    Returns:
        List of dictionaries with keys: title, link, snippet, content.
        An empty list is returned if the request or parsing fails.
    """
    import urllib.parse

    JINA_API_KEY = os.getenv("JINA_API_KEY")

    # URL encode the query
    url = f"https://s.jina.ai/?q={urllib.parse.quote(q)}"

    headers = {
        "Accept": "application/json",
        "Authorization": f"Bearer {JINA_API_KEY}",
        "X-Engine": "direct",
    }

    try:
        # A timeout keeps one stalled request from hanging the whole batch run.
        response = requests.get(url, headers=headers, timeout=60)
        response.raise_for_status()  # Raise an exception for bad status codes

        # ``data`` may be absent or null in the payload; treat both as "no results".
        data = response.json().get("data") or []

        # Honour ``ans_count`` by capping the number of returned items; it was
        # previously accepted but never used.
        limit = max(ans_count, 0)
        return [
            {
                "title": item.get("title", ""),
                "link": item.get("url", ""),
                "snippet": item.get("description", ""),
                "content": item.get("content", ""),
            }
            for item in data[:limit]
        ]

    except Exception as e:
        print(f"Error in Jina search: {e}")
        return []


# results['answers']['brave'] = get_brave(q)
# results['answers']['duckduckgo'] = get_duckduckgo(q)
# results['answers']['exa'] = get_exa(q)
# results['answers']['google'] = get_google(q)
# results['answers']['google_serper'] = get_google_serper(q)
# results['answers']['jina'] = get_jina(q)
# results['answers']['searchapi'] = get_searchapi(q)
# results['answers']['serpapi'] = get_serpapi(q)
# results['answers']['tavily'] = get_tavily(q)
# results['answers']['you'] = get_you(q)

In [4]:
# Load the ground-truth table; its ``company`` column supplies the company
# names that start_searching iterates over.
ground_truth = pd.read_csv("ground_truth.csv")
companies = ground_truth.company

In [5]:
# import concurrent.futures

# def run_with_timeout(func, args=(), kwargs={}, timeout=10):
#     """
#     Run func with a timeout.
#     If func takes longer than `timeout` seconds to finish, return 'timeout'.

#     :param func: the function to call
#     :param args: positional arguments for func
#     :param kwargs: keyword arguments for func
#     :param timeout: maximum execution time in seconds
#     """
#     with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
#         future = executor.submit(func, *args, **kwargs)
#         try:
#             return future.result(timeout=timeout)
#         except concurrent.futures.TimeoutError:
#             return "timeout"
#         except Exception as e:
#             return f"error: {e}"


In [6]:
# quaries = ['CEO in 2024', 'revenue in 2024', 'gross margin in 2024']
# searchers = ['brave', 'duckduckgo', 'exa', 'google', 'google_serper', 'jina', #'searchapi',
#              'serpapi', 'tavily', 'you']
# # results = {}
# results = defaultdict(lambda: defaultdict(dict))

# module = sys.modules[__name__]

# for c in companies:
#     print(f'Searching for {c}...')
#     for q in quaries:
#         for s in searchers:
#             # x = f'{c} {q}'
#             # func = lambda x=x: getattr(module, f'get_{s}')(x)
#             func = getattr(module, f'get_{s}')
#             try:
#                 print(f'{c} {q}')
#                 results[c][q][s] = func(q=f'{c} {q}')
#             except:
#                 results[c][q][s] = 'fail'

#             # print(f'{s}: {c} {q}')
#             # results.setdefault(c, {}).setdefault(q, {})[s] = run_with_timeout(func, timeout=300)


In [7]:
# Question suffixes; start_searching appends each one to a company name.
queries = [
    "CEO in 2024",
    "revenue in 2024",
    "gross margin in 2024",
]

# Search engine names; each one matches a get_<name> function defined above.
searchers = [
    "brave",
    "duckduckgo",
    "exa",
    "google",
    "google_serper",
    "jina",
    # "searchapi",  # currently disabled
    "serpapi",
    "tavily",
    "you",
]


def ddict():
    """Return a new ``defaultdict`` whose missing keys default to an empty dict.

    start_searching uses this function as the ``default_factory`` of its
    outer results mapping.
    """
    inner = defaultdict(dict)
    return inner


def start_searching(s):
    """Run every company/query combination through one search engine.

    Reads the module-level ``companies`` and ``queries`` and calls the
    module-level function ``get_<s>`` with ``q="<company> <query>"``.

    Args:
        s: Engine name; selects the ``get_<s>`` function.

    Returns:
        Nested mapping ``results[company][query]`` holding the engine's return
        value, or the string ``"fail"`` when the call raised an exception.

    Raises:
        AttributeError: If the module has no ``get_<s>`` function.
    """
    # Resolve the engine function once, before the loops, so an unknown engine
    # name fails immediately.
    module = sys.modules[__name__]
    func = getattr(module, f"get_{s}")

    results = defaultdict(ddict)
    for c in companies:
        print(f"Searching for {c}...")
        for q in queries:
            try:
                search_res = func(q=f"{c} {q}")
                results[c][q] = search_res
                print(f"{c} {q} - {search_res}")
            except Exception as e:
                # Catch only ``Exception`` so Ctrl-C (KeyboardInterrupt) still
                # stops a long run, and report why the query failed.
                print(f"{c} {q} - failed: {e!r}")
                results[c][q] = "fail"
            # Pause between requests to go easy on the search API.
            time.sleep(1)
    return results

In [10]:
# Engine to run in this pass; also used in the output file name when the
# results are pickled.
search_engine = "tavily"

# NOTE(review): despite its name, ``results_google`` holds the results of
# whichever engine ``search_engine`` selects.
results_google = start_searching(search_engine)

Searching for Adyen...
Adyen CEO in 2024 - [{'url': 'https://en.wikipedia.org/wiki/Adyen', 'title': 'Adyen - Wikipedia', 'content': '| Company type | Naamloze vennootschap |\n| Traded as |  Euronext Amsterdam: ADYEN  AEX component |\n| ISIN | NL0012969182 US00783V1044 |\n| Industry | Payment processor, technology, e-commerce, point of sale |\n| Founded | 2006; 20 years ago (2006) |\n| Founders |  Arnout Schuijff  Pieter van der Does "Pieter van der Does (businessman)") |\n| Headquarters | Amsterdam , Netherlands |\n| Key people |  Pieter van der Does (CEO) |\n| Services | Payment service provider, gateway, risk management, local acquiring, point of sale, issuing |\n| Revenue | ‚Ç¨1.996 billion (2024) |\n| Net income | ‚Ç¨925 million (2024) |\n| Total assets | ‚Ç¨11.425 billion (2024) |\n| Total equity "Equity (finance)") | ‚Ç¨4.232 billion (2024) |\n| Number of employees | 4,345 (2024) |\n| Website | adyen.com |\n| Footnotes / references  | [...] Adyen is a Dutch payment company with t

In [11]:
# Display the collected results (company -> query -> engine output).
results_google

defaultdict(<function __main__.ddict()>,
            {'Adyen': defaultdict(dict,
                         {'CEO in 2024': [{'url': 'https://en.wikipedia.org/wiki/Adyen',
                            'title': 'Adyen - Wikipedia',
                            'content': '| Company type | Naamloze vennootschap |\n| Traded as |  Euronext Amsterdam: ADYEN  AEX component |\n| ISIN | NL0012969182 US00783V1044 |\n| Industry | Payment processor, technology, e-commerce, point of sale |\n| Founded | 2006; 20 years ago (2006) |\n| Founders |  Arnout Schuijff  Pieter van der Does "Pieter van der Does (businessman)") |\n| Headquarters | Amsterdam , Netherlands |\n| Key people |  Pieter van der Does (CEO) |\n| Services | Payment service provider, gateway, risk management, local acquiring, point of sale, issuing |\n| Revenue | ‚Ç¨1.996 billion (2024) |\n| Net income | ‚Ç¨925 million (2024) |\n| Total assets | ‚Ç¨11.425 billion (2024) |\n| Total equity "Equity (finance)") | ‚Ç¨4.232 billion (2024) |\n| N

In [12]:
# Number of companies that have an entry in the results.
len(results_google)

20

In [13]:
# Spot-check a single company/query entry.
results_google["Zalando"]["CEO in 2024"]

[{'url': 'https://corporate.zalando.com/sites/default/files/media-download/Zalando-SE_AGM-2025_1_Annual-report_2024.pdf',
  'title': 'Zalando-SE_AGM-2025_1_Annual-report_2024.pdf',
  'content': 'to employees of the Zalando group and to members of the management board. At its extraordinary meeting on 30 April 2024, the supervisory board discussed and approved the adoption of the responsibilities within the management board, and in particular the envisaged change of the co-CEO position from David Schneider to David Schr√∂der to ensure the effective execution of the updated group strategy. [...] Adapting responsibilities within the management board To ensure the effective execution of its updated strategy, the supervisory board adapted the responsibilities within Zalando‚Äôs management board. Co-founder David Schneider decided to move into a management board role fully dedicated to building partner relationships across 1 Company 2 Combined management report 3 Consolidated financial statem

In [14]:
# Create the output directory on first use so the dump does not fail with
# FileNotFoundError when data/ does not exist yet.
os.makedirs("data", exist_ok=True)

# NOTE: the pickled mapping references the module-level ``ddict`` factory by
# name, so ``ddict`` must be defined wherever this file is loaded.
with open(f"data/search_results_{search_engine}.pkl", "wb") as f:
    pickle.dump(results_google, f)