In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import json
from datetime import datetime
from markdown import markdown
from IPython.display import Markdown, display, HTML
import warnings
# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Show all DataFrame columns instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Load variables from a .env file into the process environment; the cell output is its return value.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

True

In [2]:
# Work from the parent of the directory the kernel started in, so relative paths
# resolve from there. The starting directory is remembered on first execution so
# that re-running this cell does not climb one more level each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(_NOTEBOOK_START_DIR))

In [31]:
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse, parse_qs
from readability import Document
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import openai
import ujson

# Chat model used by `perplexity_clone` to write the final answer.
COMPLETION_MODEL = "gpt-4-turbo-preview"
# Default number of search-result links returned by `get_google_search_links`.
SOURCE_COUNT = 10

def generate_search_query(text: str, model="gpt-4-turbo-preview") -> str:
    """
    Turn free-form text into a Google search query via OpenAI's ChatCompletions.

    Args:
        text (str): The question or statement to convert.
        model (str): Name of the chat model to call.

    Returns:
        str: The content of the model's first choice, which the system prompt asks to be
        the bare search query.

    ### Example:
    `What is the new Discord username system?` should yield something like `discord new username system`.
    """
    api_client = openai.OpenAI()
    conversation = [
        {"role": "system", "content": "Given a query, respond with the Google search query that would best help to answer the query. Don't use search operators. Respond with only the Google query and nothing else."},
        {"role": "user", "content": text},
    ]
    completion = api_client.chat.completions.create(model=model, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content


def get_google_search_links(query: str, source_count: int = SOURCE_COUNT, proxies: dict = None, timeout: float = 10.0) -> list[str]:
    """
    Scrapes the Google search results page using the `requests` module and returns the first `source_count` links.

    Only anchors whose `href` starts with `/url?q=` are considered; the real target is read from
    that `q` parameter. Results keep page order, contain at most one link per hostname, and skip
    hostnames containing any of: google, facebook, twitter, instagram, youtube, tiktok.

    Args:
        query (str): The search query.
        source_count (int): The number of links to return.
        proxies (dict, optional): Proxies to use for the request. Defaults to None.
        timeout (float, optional): Seconds to wait for Google before giving up. Defaults to 10.0.

    Returns:
        list[str]: A list of the first `source_count` links from the Google search.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # Passing the query through `params` lets requests URL-encode it, so characters such as
    # `&`, `#` or `+` inside the query can no longer truncate or corrupt the search term.
    response = requests.get(
        "https://www.google.com/search",
        params={"q": query},
        proxies=proxies or None,
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, "html.parser")

    excluded_sites = ("google", "facebook", "twitter", "instagram", "youtube", "tiktok")
    seen_hosts = set()
    filtered_links = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href or not href.startswith("/url?q="):
            continue
        # parse_qs drops blank values, so "/url?q=&..." yields no "q" key at all.
        targets = parse_qs(urlparse(href).query).get("q")
        if not targets:
            continue
        target = targets[0]
        host = urlparse(target).hostname
        # One link per hostname; this also removes exact duplicate links.
        if not host or host in seen_hosts:
            continue
        if any(site in host for site in excluded_sites):
            continue
        seen_hosts.add(host)
        filtered_links.append(target)

    return filtered_links[:source_count]

def scrape_text_from_links(links: list, proxies: dict = None) -> list[dict]:
    """
    Uses a `ThreadPoolExecutor` to run `scrape_text_from_link` on each link in `links` concurrently.

    Args:
        links (list): URLs to scrape. May be empty.
        proxies (dict, optional): Proxies forwarded to every request. Defaults to None.

    Returns:
        list[dict]: One result dictionary per link, in the same order as `links`, each with an
        added 1-based `result_number` key. An empty list is returned for empty input.
    """
    # ThreadPoolExecutor rejects max_workers=0 with ValueError, which is what an empty
    # search result would otherwise trigger.
    if not links:
        return []

    with ThreadPoolExecutor(max_workers=len(links)) as executor:
        # executor.map preserves input order, so results line up with `links`.
        results = list(executor.map(lambda link: scrape_text_from_link(link, proxies=proxies), links))

    for i, result in enumerate(results, start=1):
        result["result_number"] = i

    return results
    
def scrape_text_from_link(link: str, proxies: dict = None, timeout: float = 10.0) -> dict:
    """
    Uses the `requests` module to fetch a given link, extracts the main content with `readability-lxml`
    and `BeautifulSoup4`, and summarizes it with `summarize_text`. Every stage is wrapped so that a
    failure is reported in the returned dictionary instead of being raised.

    Args:
        link (str): The URL of the webpage to scrape.
        proxies (dict, optional): Proxies to use for the request. Defaults to None.
        timeout (float, optional): Seconds to wait for the server before giving up. Defaults to 10.0.

    Returns:
        dict: `{"url": link, "text": summary}` on success. If fetching, parsing or summarizing fails,
        `{"url": link, "error": message}` describing which stage failed.
    """
    try:
        # Without a timeout a single unresponsive server would block its worker thread forever.
        # A timeout surfaces as a RequestException and is reported like any other fetch failure.
        response = requests.get(link, proxies=proxies or None, timeout=timeout)
        response.raise_for_status()  # Raises a HTTPError if the response status code is 4XX/5XX
    except requests.exceptions.RequestException as e:
        return {"url": link, "error": f"Failed to retrieve content: {str(e)}"}

    try:
        doc = Document(response.text)
        parsed = doc.summary()
        soup = BeautifulSoup(parsed, "html.parser")
        source_text = soup.get_text()
    except Exception as e:
        return {"url": link, "error": f"Failed to parse content: {str(e)}"}

    try:
        # Only the first 50,000 characters are sent to the summarizer.
        summarized_text = summarize_text(source_text[:50000])
    except Exception as e:
        return {"url": link, "error": f"Failed to summarize content: {str(e)}"}

    return {"url": link, "text": summarized_text}

def summarize_text(text: str, model="gpt-4-turbo-preview") -> str:
    """
    Summarize `text` with OpenAI's ChatCompletions.

    Args:
        text (str): The text to condense.
        model (str): Name of the chat model to call.

    Returns:
        str: The content of the model's first choice; the system prompt asks for a summary
        of no more than 500 words.
    """
    prompt_messages = [
        {"role": "system", "content": "Given text, respond with the summarized text (no more than 500 words) and nothing else."},
        {"role": "user", "content": text},
    ]
    completion = openai.chat.completions.create(model=model, messages=prompt_messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

def search(query: str, proxies: dict = None) -> tuple[list[str], list[dict]]:
    """
    Look up `query` on Google and scrape every returned link.

    Args:
        query (str): The search query.
        proxies (dict, optional): Proxies forwarded to both the search and the scraping requests.

    Returns:
        tuple[list[str], list[dict]]: The result links, and the per-link dictionaries produced
        by `scrape_text_from_links` for those same links.
    """
    found_links = get_google_search_links(query, proxies=proxies)
    return found_links, scrape_text_from_links(found_links, proxies=proxies)

def perplexity_clone(query: str, proxies: dict = None, verbose=False) -> tuple[str, list[dict]]:
    """
    A clone of Perplexity AI's "Search" feature. Generates a search query from `query`, searches and
    scrapes the results, then asks the completion model for an answer with cited sources.

    Args:
        query (str): The user's question.
        proxies (dict, optional): Proxies forwarded to the search and scraping requests.
        verbose (bool): When True, print the generated search query and the links found.

    Returns:
        tuple[str, list[dict]]: Markdown text (the answer with `[n]` citations turned into
        superscript links, a horizontal rule, and a numbered list of source domains), and the
        list of scraped source dictionaries that was given to the model.
    """
    from datetime import timezone  # local import: only `datetime` itself is imported at the top

    client = openai.OpenAI()
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC time formats identically here.
    formatted_time = datetime.now(timezone.utc).strftime("%A, %B %d, %Y %H:%M:%S UTC")

    search_query = generate_search_query(query)
    if verbose:
        print(f"Searching \"{search_query}\"...")
    links, sources = search(search_query, proxies=proxies)
    if verbose:
        print(f"Links found: {links}")

    answer = client.chat.completions.create(
        model=COMPLETION_MODEL,
        messages=[
            {"role": "system", "content": "Generate a comprehensive and informative answer for a given question solely based on the provided web Search Results (URL and Summary). You must only use information from the provided search results. Use an unbiased and journalistic tone. Use this current date and time: " + formatted_time + ". Combine search results together into a coherent answer. Do not repeat text. Cite sources as [1] or [2] or [3] after each sentence (not just the very end) to back up your answer (Ex: Correct: [1], Correct: [2][3], Incorrect: [1, 2]). Only cite the most relevant results that answer the question accurately. If different results refer to different entities with the same name, write separate answers for each entity."},
            {"role": "user", "content": ujson.dumps(sources)},
            {"role": "user", "content": query}
        ]
    ).choices[0].message.content or ""  # content may be None; keep the string handling below safe

    footer_lines = []
    for i, link in enumerate(links, start=1):
        # Strip "www." only as a leading label, not wherever it occurs inside the host name.
        base_url = urlparse(link).netloc.removeprefix("www.")
        # Citation numbers match link order: sources are numbered in the same order as `links`.
        answer = answer.replace(f"[{i}]", f"<sup>[[{i}]]({link})</sup>")
        footer_lines.append(f"* **[[{i}]](http://{base_url}) {base_url}**\n")

    result = answer + "\n\n" + "___" + "\n\n" + "".join(footer_lines)

    return result, sources

In [32]:
# Run the whole pipeline on a sample question; `test` holds the Markdown answer and
# `sources` the per-link dictionaries that were given to the model.
test, sources = perplexity_clone(query="2024 solar eclipse austin tx", verbose=True)
# Render the answer as formatted Markdown rather than printing the raw string.
display(Markdown(test))

Searching "2024 solar eclipse path Austin TX"...
Links found: ['https://www.austintexas.org/events/eclipse-austin/', 'https://eclipse2024.org/eclipse_cities/blog-posts/what-will-the-2024-total-solar-eclipse-look-like-from-austin.html', 'https://www.austintexas.gov/readycentraltexas/eclipse', 'https://nationaleclipse.com/cities/austin_texas.html', 'https://www.reddit.com/r/Austin/comments/1937wg2/2024_eclipse_viewing_in_the_city_vs_more_rural/', 'https://www.tripadvisor.com/ShowTopic-g28964-i75-k14298478-Solar_eclipse_April_8_2024_where_to_stay_near_Austin-Texas.html', 'https://www.timeanddate.com/eclipse/in/usa/austin?iso=20240408', 'https://www.statesman.com/story/entertainment/events/2024/02/12/2024-total-solar-eclipse-path-times-austin-texas-what-you-need-to-know/72190130007/']


The 2024 Total Solar Eclipse, taking place on April 8, presents an exciting event for viewers, especially in Austin, Texas, which will be directly in the path of totality. This rare astronomical phenomenon, the first total solar eclipse visible in North America since 2017, is expected to draw large crowds to Central Texas, with Austin offering a unique viewing experience. Safety during the eclipse is crucial, with the emphasis on using certified eclipse glasses or viewers for direct observation of the sun, except during the brief phase of total eclipse where the moon completely covers the sun <sup>[[1]](https://www.austintexas.org/events/eclipse-austin/)</sup><sup>[[3]](https://www.austintexas.gov/readycentraltexas/eclipse)</sup>.

Austin, being situated at the southern limit of the totality path, requires careful planning for viewers aiming to experience the ultimate spectacle. While the city itself provides several viewing locations, such as the Austin Zoo, Lady Bird Johnson Wildflower Center, and Barton Creek Greenbelt, the duration of totality varies across different sites. For instance, the Austin Aquarium will experience a longer period of totality at approximately 2 minutes and 58 seconds compared to 1 minute and 24 seconds at the Boardwalk at Lady Bird Lake <sup>[[4]](https://nationaleclipse.com/cities/austin_texas.html)</sup>. Consequently, for an optimal eclipse experience, areas outside Austin including Burnet, Marble Falls, Killeen, and others have been recommended, with spots like Buchanan Lake signaling a prime viewing point with around 4 minutes and 20 seconds of totality <sup>[[2]](https://eclipse2024.org/eclipse_cities/blog-posts/what-will-the-2024-total-solar-eclipse-look-like-from-austin.html)</sup>.

Viewers are encouraged to prepare extensively for the event, considering the expected influx of visitors and potential traffic congestion. Making arrangements for early arrival at viewing spots, stashing supplies like food, water, and gas, and planning to stay put after the eclipse to avoid the rush are advisable strategies. Free eclipse glasses are being distributed at various locations in Central Texas to ensure public safety, and attendees are urged never to look directly at the sun without proper protection <sup>[[3]](https://www.austintexas.gov/readycentraltexas/eclipse)</sup>.

The 2024 eclipse marks a significant occasion, as it offers an opportunity not to be missed given its rarity; the last total solar eclipse observed in Austin occurred in 1397 and the next one is not expected until 2343. The event is anticipated to cast areas into total darkness between 1:35 p.m. and 1:40 p.m., highlighting the importance of securing appropriate viewing equipment, like eclipse glasses marked with the "ISO" icon, or constructing a pinhole camera for safe observation <sup>[[8]](https://www.statesman.com/story/entertainment/events/2024/02/12/2024-total-solar-eclipse-path-times-austin-texas-what-you-need-to-know/72190130007/)</sup>.

To benefit fully from this spectacular cosmic event, Austin residents and visitors are advised to heed safety guidelines and make necessary preparations, ensuring a memorable and secure eclipse viewing experience.

___

* **[[1]](http://austintexas.org) austintexas.org**
* **[[2]](http://eclipse2024.org) eclipse2024.org**
* **[[3]](http://austintexas.gov) austintexas.gov**
* **[[4]](http://nationaleclipse.com) nationaleclipse.com**
* **[[5]](http://reddit.com) reddit.com**
* **[[6]](http://tripadvisor.com) tripadvisor.com**
* **[[7]](http://timeanddate.com) timeanddate.com**
* **[[8]](http://statesman.com) statesman.com**


In [27]:
# Inspect the first scraped source (URL, summary text and result number).
# NOTE(review): this cell's execution count (27) is lower than the run above (32), so the
# displayed output may come from an earlier run — re-execute after a fresh Run All to confirm.
sources[0]

{'url': 'https://nationaleclipse.com/cities/austin_texas.html',
 'text': "This text provides a list of locations in Austin, Texas, with corresponding start times and durations for an eclipse totality event. It mentions various places such as the Austin Zoo, Lady Bird Johnson Wildflower Center, and the Austin Aquarium, among others, with totality start times ranging from 1:35:21 PM CDT to 1:36:20 PM CDT and durations from 1:24 to 2:58 minutes. The text also notes that these times and durations are approximate, particularly for large areas like parks and campuses, and advises using Xavier Jubier's interactive Google eclipse maps for precise timing and duration at specific locations. The duration mentioned refers to the total eclipse's duration, expressed in minutes and seconds. Additionally, there's a mention of events and festivals celebrating the eclipse in Austin.",
 'result_number': 1}