Import the HTTP and HTML-parsing libraries, then fetch and parse the search page

In [None]:
from bs4 import BeautifulSoup
import requests
# Search page to scrape; replace the placeholder with the real URL before running.
search_website = "URL HERE"
# Bound the request so an unresponsive server cannot hang the notebook indefinitely.
req = requests.get(search_website, timeout=30)
# Parse the response body into a navigable tree.
soup = BeautifulSoup(req.text, "html.parser")

Extract the search results from the parsed page and inspect them

In [None]:
# Collect every <div class="search-result"> element from the parsed page.
search_results = soup.find_all(name="div", class_="search-result")

In [None]:
# Turn the result set into a single string so it can be concatenated into a prompt,
# then display it for inspection.
search_results = str(search_results)
search_results

Use OpenAI to retrieve results that are relevant

In [None]:
# Import dependencies

import os  # access to operating-system functionality
from dotenv import load_dotenv, find_dotenv  # helpers to locate and load a .env file
# Locate the .env file and load its variables into the environment. The return
# value is not needed, so it is bound to the conventional throwaway name "_".
_ = load_dotenv(find_dotenv())

import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
#import langchain

from langchain.chat_models import ChatOpenAI

In [None]:
# Chat model used for the one-shot prompts below, created with temperature 0.0;
# the bare name on the last line displays the configured object.
llm = ChatOpenAI(temperature=0.0)
llm

In [None]:
# Load the base prompt text. The context manager closes the file even if read() raises.
with open("webscraping_prompt.txt", "r") as data:
    webscraping_prompt = data.read()

Convert HTML file into text file to improve subsequent prompt engineering

In [None]:
# Ask the model to flatten the scraped HTML into a numbered plain-text listing per article.
html_to_text_prompt = (
    "Convert the HTML file into a text output with the following format: \n 1. Article Title \n 2. Article URL \n 3. Article Published Date: \n 4. Article Description \n HTML file:"
    + "\n"
    + search_results
)
cleaned_articles = llm.predict(html_to_text_prompt)

In [None]:
# Persist the cleaned article listing so it can be reloaded from disk.
with open("articles.txt", mode="w") as articles_file:
    articles_file.write(cleaned_articles)

In [None]:
def cutoffDate(days=30):
    """Return the date ``days`` days before today as text, e.g. "05 March 2024".

    Args:
        days: How many days before today the cutoff lies. Defaults to 30,
            the window this helper previously hard-coded.

    Returns:
        The cutoff date formatted with "%d %B %Y": zero-padded day of month,
        full month name, four-digit year.
    """
    # Function-scope import, as in the original helper.
    import datetime

    # A fixed day offset needs only the standard library's timedelta.
    cutoff = datetime.date.today() - datetime.timedelta(days=days)
    return cutoff.strftime("%d %B %Y")

In [None]:
# Reload the cleaned article listing. The context manager closes the file even if read() raises.
with open("articles.txt", "r") as data:
    search_results = data.read()

In [None]:
# Assemble the final prompt: base instructions, the date-cutoff rule, then the
# article listing as the user input. The last line displays it for inspection.
scrape_processed = "".join([
    webscraping_prompt,
    "\n 2. Only articles generated after ",
    cutoffDate(),
    " should be included in the response.",
    "\n",
    "user:",
    "\n",
    search_results,
])
scrape_processed

In [None]:
# Send the assembled prompt to the model and display its raw reply.
websites = llm.predict(scrape_processed)
websites

In [None]:
# Save the model's reply to websites.json.
with open("websites.json", mode="w") as websites_file:
    websites_file.write(websites)

Read JSON file as object

Begin prompt chaining to remove non-startups

In [None]:
#Begin langchain step-wise checks

from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory

# Conversation chain with buffer memory, so a follow-up prompt can refer back to
# the previous reply. verbose=True is passed through to the chain.
memory = ConversationBufferMemory()
lang_llm = ChatOpenAI(temperature=0.0)
conversation = ConversationChain(llm=lang_llm, memory=memory, verbose=True)

In [None]:
# First chain step: have the model tag each article with whether the named
# company itself raised the funding. The reply is displayed, not stored.
funding_tag_prompt = (
    "For each article in the JSON input, if the company mentioned in the article raised funding directly, append 'Funding raised: Yes' to the JSON input. If it was not the company that raised funding directly (i.e. it was a participant, partner, or investee), append 'Funding raised: No' to the input. Your response should be strictly a JSON file."
    + "\n"
    + websites
)
conversation.predict(input=funding_tag_prompt)

In [None]:
# Second chain step: ask the model to drop the articles tagged 'Funding raised: No'.
drop_non_raisers_prompt = "If 'Funding raised' is 'No' for the article, remove it from the JSON input. Return the revised JSON input as your response."
startups_clean = conversation.predict(input=drop_non_raisers_prompt)

Save output as JSON file

In [None]:
# Display the filtered reply for inspection.
startups_clean

In [None]:
# Write the filtered reply to websites.json, replacing any existing contents.
with open("websites.json", mode="w") as websites_file:
    websites_file.write(startups_clean)

Load JSON file as Dict

In [None]:
import json

# Parse the saved reply into a dict. The context manager closes the file even
# if json.load() raises on malformed content.
with open("websites.json", "r") as data:
    startups_clean_json = dict(json.load(data))

In [None]:
# Display the parsed dict for inspection.
startups_clean_json

Iterate through dict and retrieve articles related to each startup in the dict

In [None]:
# The "company" value of every entry under "articles", in order.
articles_search_cleaned = [
    article_details["company"]
    for article_details in startups_clean_json['articles']
]

In [None]:
# Display the list of company names for inspection.
articles_search_cleaned

In [None]:
# Run one search per company and keep the matching result elements next to the
# company name as [name, results] pairs.
output_articles = []

for search_terms in articles_search_cleaned:

    # Insert the search URL here.
    # NOTE(review): the placeholder does not use search_terms; the real URL has to
    # incorporate it, otherwise every company receives the same results.
    # The timeout keeps one unresponsive request from stalling the whole loop.
    req = requests.get("SEARCH URL HERE", timeout=30)
    soup = BeautifulSoup(req.text, "html.parser")

    search_results = soup.find_all("div", class_="search-result")

    output_articles.append([str(search_terms), search_results])

In [None]:
# Display the collected [company, results] pairs for inspection.
output_articles

In [None]:
# Create the output folder up front so open() below does not fail when it is missing.
os.makedirs("Scraped Articles", exist_ok=True)

# Not used in this cell; still assigned so the name stays defined as before.
current_dir = os.getcwd()

for article in output_articles:
    # article is a [company name, search results] pair; ask the model for up to
    # three article URLs as comma-separated text.
    temp_export = llm.predict("Using information in the HTML file below, extract all article URLs and respond in the format of a comma separated value file of format: Article 1 URL, Article 2 URL, .... \n Include a maximum of the 3 latest articles in your response. \n HTML file:" + "\n" + str(article[1]))

    # NOTE(review): the file gets a .json extension although the prompt requests
    # comma-separated text, not JSON.
    file_name = article[0] + ".json"
    file_path = os.path.join("Scraped Articles", file_name)

    with open(file_path, "w") as f:
        f.write(temp_export)

Extract and save all articles to the same list. Export the list as the final text file.

In [None]:
#Function to obtain articles in a given list

def extract_article(input_list, timeout=30):
    """Fetch each article URL and return the model's plain-text rendering of all of them.

    For every usable URL the page is downloaded, its ``<p class="byline">`` and
    ``<div class="post">`` elements are extracted, and the notebook-level ``llm``
    is asked to convert that HTML into a "Date / Content" text block.

    Args:
        input_list: Iterable of article URLs. Each entry is converted with
            ``str()`` and stripped of surrounding whitespace. Entries that are
            empty after stripping are skipped, because splitting a
            comma-separated string commonly yields padded or blank items.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A single string holding, for each fetched URL, an "Article N" header,
        the model's reply, and three trailing newlines. Returns "" when there
        is nothing to fetch.
    """
    sections = []
    counter = 0

    for raw_url in input_list:

        url = str(raw_url).strip()
        if not url:
            # A blank entry would only make requests.get raise.
            continue

        counter += 1

        temp_request = requests.get(url, timeout=timeout)
        temp_article = BeautifulSoup(temp_request.text, "html.parser")

        temp_byline = temp_article.find_all("p", class_="byline")
        temp_text = temp_article.find_all("div", class_="post")

        # str() of a result list is wrapped in "[...]"; [1:-1] drops the brackets.
        article_html = str(temp_byline)[1:-1] + "\n" + str(temp_text)[1:-1]

        prompt_article_extraction = "system: Convert the HTML content in the user input into plain text. Your response must be in the format of: \n Date: Published Date \n Content: Article content body \n You should not provide any additional information beyond the above stipulated format. \n user: \n" + article_html

        article_text = llm.predict(prompt_article_extraction)

        sections.append("Article " + str(counter) + "\n" + article_text + "\n" + "\n" + "\n")

    # Join once at the end instead of growing a string inside the loop.
    return "".join(sections)

In [None]:
import os

# Specify the folder path
folder_path = "Scraped Articles"

# Get a list of files in the folder
files_in_folder = os.listdir(folder_path)

# Iterate through the files in the folder
for file_name in files_in_folder:

    file_path = os.path.join(folder_path, file_name)

    # Each file is read as comma-separated text and split into candidate URLs.
    # The context manager closes the file even if read() raises.
    with open(file_path, "r") as data:
        temp_list = data.read().split(",")

    temp_extracted_article = extract_article(temp_list)

    # NOTE: this overwrites the file that was just read, so re-running the cell
    # would pass the extracted article text back to extract_article as if it
    # were a list of URLs.
    with open(file_path, "w") as f:
        f.write(temp_extracted_article)