In [1]:
from download_websites import download_websites
from generate_scraping_tasks import generate_scraping_tasks
from generate_solution_code import generate_solution_code
from extract_html_info import extract_relevant_information

import pandas as pd
import os
import json

In [3]:
# Configuration for the generation run.
websites_csv = 'websites_mi.csv'
save_folder = 'downloaded_pages'
ROWS = (0, 10000)  # (first, last) CSV row index to process, both ends inclusive
NUM_TASKS_PER_WEBSITE = 10

# Download websites
# download_websites(websites_csv, save_folder)

# Generate scraping tasks and solutions
websites = pd.read_csv(websites_csv)

# This cell writes into extracted_info/ itself, so create the folder up front
# instead of failing with FileNotFoundError on a fresh checkout.
os.makedirs('extracted_info', exist_ok=True)

for i, row in websites.iterrows():
    if i < ROWS[0] or i > ROWS[1]:
        continue

    print("--------------------")

    # Each CSV row is expected to hold exactly these three columns, in this order.
    category, website, link = row
    HTML_file = f'{save_folder}/{website}.html'

    # NOTE(review): the mere existence of this folder is treated as "done".
    # If generate_solution_code creates it before all tasks are finished, a run
    # that was interrupted (e.g. by an API timeout) leaves a partial folder that
    # is skipped here on the next run -- confirm and clean up such folders.
    if os.path.exists(f'solution_code/{website}'):
        print(f'{website} already has data')
        continue

    if not os.path.exists(HTML_file):
        print(f'{website}.html does not exist')
        continue

    print(f"Generating data for {website}")

    # Step 1: extract the HTML elements and keep a copy on disk.
    HTML_elements = extract_relevant_information(HTML_file)
    with open(f'extracted_info/{website}.txt', 'w') as f:
        f.write(HTML_elements)

    print(f'Extracted HTML elements for {website}')

    # Step 2: generate the tasks. The task list is read back from
    # scraping_tasks/{website}.txt (presumably written by
    # generate_scraping_tasks -- verify), one task per line.
    generate_scraping_tasks(link, website, category, HTML_elements, NUM_TASKS_PER_WEBSITE)
    with open(f'scraping_tasks/{website}.txt', 'r') as f:
        scraping_tasks = f.readlines()

    print(f'Generated scraping tasks for {website}')

    # Step 3: one solution per task line. The line is passed through exactly as
    # read (including its trailing newline) and j is its 0-based position.
    for j, task_name in enumerate(scraping_tasks):
        generate_solution_code(website, HTML_file, category, HTML_elements, task_name, j)

    print(f'Generated solutions for {website}')

print("--------------------")

--------------------
google play.html does not exist
--------------------
zeromike.html does not exist
--------------------
Generating data for woman
Extracted HTML elements for woman
Generated scraping tasks for woman
Generated solutions for woman
--------------------
Generating data for wordpress
Extracted HTML elements for wordpress
Generated scraping tasks for wordpress
Generated solutions for wordpress
--------------------
Generating data for bloggersroad
Extracted HTML elements for bloggersroad
Generated scraping tasks for bloggersroad
Generated solutions for bloggersroad
--------------------
Generating data for bog & ide
Extracted HTML elements for bog & ide
Generated scraping tasks for bog & ide
Generated solutions for bog & ide
--------------------
Generating data for globestudios
Extracted HTML elements for globestudios
Generated scraping tasks for globestudios
Generated solutions for globestudios
--------------------
Generating data for h&m
Extracted HTML elements for h&m
Ge

Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)

In [30]:
# Combine all scraping tasks and solutions into one file


def _strip_extension(filename, prefix=''):
    """Return ``filename`` without its extension, keeping dots that belong to the name.

    Website names may contain dots themselves (e.g. ``yale.edu``), so cutting at
    the first dot would truncate the key and it would no longer match the
    website name used for lookups.

    filename: bare file name (no directory part).
    prefix: optional known leading part of the name (the website) whose dots
        must be preserved; everything from the first dot after it is dropped.
    """
    if prefix and filename.startswith(prefix):
        return prefix + filename[len(prefix):].split('.')[0]
    return os.path.splitext(filename)[0]


# Load prompts
with open('prompts/generate_solution_code_system_prompt.txt', 'r') as f:
    system_prompt = f.read()
with open('prompts/generate_solution_code_user_prompt.txt', 'r') as f:
    user_prompt = f.read()

# Load extracted info: website -> extracted HTML elements text
extracted_info = {}
for filename in os.listdir('extracted_info'):
    path = f'extracted_info/{filename}'
    # Skip hidden entries and sub-directories that are not data files.
    if filename.startswith('.') or not os.path.isfile(path):
        continue
    with open(path, 'r') as f:
        extracted_info[_strip_extension(filename)] = f.read()

# Load scraping tasks: website -> list of task lines (newlines kept)
scraping_tasks = {}
for filename in os.listdir('scraping_tasks'):
    path = f'scraping_tasks/{filename}'
    if filename.startswith('.') or not os.path.isfile(path):
        continue
    with open(path, 'r') as f:
        scraping_tasks[_strip_extension(filename)] = f.readlines()

# Load solutions: website folder -> {solution file name without extension -> code}
solutions = {}
for folder in os.listdir('solution_code'):
    folder_path = f'solution_code/{folder}'
    if folder.startswith('.') or not os.path.isdir(folder_path):
        continue
    solutions[folder] = {}
    for filename in os.listdir(folder_path):
        path = f'{folder_path}/{filename}'
        if filename.startswith('.') or not os.path.isfile(path):
            continue
        with open(path, 'r') as f:
            # The folder is named after the website; pass it as prefix so dots
            # inside the website name survive in the key.
            solutions[folder][_strip_extension(filename, prefix=folder)] = f.read()

# Load website infos
websites = pd.read_csv('websites.csv')

# Generate prompts
prompts = {}  # website -> {'<website>_<task index>' -> formatted user prompt}
categories = {}  # website -> category
links = {}  # website -> link
# Each CSV row is expected to hold exactly (category, website, link).
for category, website, link in websites.itertuples(index=False, name=None):
    categories[website] = category
    links[website] = link
    if website not in scraping_tasks:
        print(f'{website} does not have generated data')
        continue
    HTML_file = f'downloaded_pages/{website}.html'
    # A website can have tasks but no (or only some) solutions when generation
    # was interrupted; use an empty mapping instead of raising KeyError.
    website_solutions = solutions.get(website, {})
    prompts[website] = {}
    for j, scraping_task in enumerate(scraping_tasks[website]):
        task_id = f'{website}_{j}'
        if task_id not in website_solutions:
            # Without a solution the task cannot become a training sample.
            print(f'{task_id} does not have a generated solution')
            continue
        scraping_task = scraping_task.strip()
        HTML_elements = extracted_info[website]
        prompts[website][task_id] = user_prompt.format(website=website, HTML_file=HTML_file, category=category, HTML_string=HTML_elements, task=scraping_task)

# Generate training samples
# Layout of one sample: system prompt, user prompt, then the solution code
# wrapped in a fenced block.
sample_template = """### System:
{system_prompt}

### User:
{user_prompt}

### Response:
```
{response}
```
"""
# system_prompt is already read from its file at the top of this cell, so it is
# reused here rather than re-read through a second, never-closed file handle.
# Result: website -> {task name -> complete training sample text}
training_samples = {
    website: {
        task_name: sample_template.format(system_prompt=system_prompt, user_prompt=prompt_text, response=solutions[website][task_name])
        for task_name, prompt_text in website_prompts.items()
    }
    for website, website_prompts in prompts.items()
}

# Flatten the nested website -> task dictionaries into seven parallel lists,
# one entry per (website, task) pair, in the iteration order of `prompts`.
websites_list, task_names_list, categories_list, links_list = [], [], [], []
prompts_list, solutions_list, training_samples_list = [], [], []
for website, website_prompts in prompts.items():
    for task_name, prompt_text in website_prompts.items():
        websites_list.append(website)
        task_names_list.append(task_name)
        categories_list.append(categories[website])
        links_list.append(links[website])
        prompts_list.append(prompt_text)
        solutions_list.append(solutions[website][task_name])
        training_samples_list.append(training_samples[website][task_name])

# Zip the parallel lists into one dictionary per sample.
record_keys = ('website', 'task', 'category', 'link', 'prompt', 'solution', 'training_sample')
record_columns = (
    websites_list,
    task_names_list,
    categories_list,
    links_list,
    prompts_list,
    solutions_list,
    training_samples_list,
)
data = []
for i, record_values in enumerate(zip(*record_columns)):
    data.append(dict(zip(record_keys, record_values)))
print("Dataset length:", len(data))

# Persist the dataset as JSON: a list in which every element is the
# prompt, solution and training sample of one website/task pair.
with open('dataset.json', 'w') as dataset_file:
    json.dump(data, dataset_file)

bbc_weather does not have generated data
edx does not have generated data
etsy does not have generated data
avsforum does not have generated data
aliexpress does not have generated data
accuweather does not have generated data
ebay does not have generated data
coursera does not have generated data
cnn does not have generated data
nytimes does not have generated data
nfl does not have generated data
wunderground does not have generated data
udemy does not have generated data
target does not have generated data
foreca does not have generated data
theguardian does not have generated data
mitocw.mit.edu does not have generated data
snagajob does not have generated data
yale.edu does not have generated data
finance.yahoo does not have generated data
khanacademy does not have generated data
superpages does not have generated data
weather does not have generated data
snapchat does not have generated data
goal does not have generated data
reuters does not have generated data
jstor does not hav

In [35]:
# Reload the saved dataset and preview the training sample of its first record.
# The context manager closes the file handle, which json.load(open(...)) did not.
with open('dataset.json', 'r') as f:
    loaded_data = json.load(f)
# Optional previews of the other fields of the first record:
# print("Loaded data length:", len(loaded_data))
# print("Website:", loaded_data[0]['website'])
# print("Task:", loaded_data[0]['task'])
# print("Category:", loaded_data[0]['category'])
# print("Link:", loaded_data[0]['link'])
# print("Prompt:\n" + loaded_data[0]['prompt'])
# print("Solution:", loaded_data[0]['solution'])
print("Training sample:", loaded_data[0]['training_sample'])

Training sample: ### System:
When asked to write a script, then write just the code, and nothing else. Don't write any explanation, comments, or disclaimers.

### User:
You are given a web page, the category of the page, randomly selected html elements on that page, the local path to the HTML file that should be scraped and a web-scraping task that you should solve.

Here are some randomly selected HTML elements (containing text), and their corresponding XPaths from the target page:
<title>(83) Update on $50k NVDA Puts : wallstreetbets</title>
/html/head/title
----------------
<span>Flipping at the Grand Exchange</span>
/html/body/div[1]/div/div[2]/div[2]/div/div/div/div[2]/div[3]/div[1]/div[3]/div[5]/div/div/div/div[2]/div/div/div/div[2]/div[2]/div[1]/span[2]/div/span
----------------
<span class="_1RIl585IYPW6cmNXwgRz0J">User account menu</span>
/html/body/div[1]/div/div[2]/div[1]/header/div/div[2]/div[2]/div/div[2]/button/span[2]
----------------
<a class="_3t5uN8xUmg0TOwRCOGQEcU">r

In [28]:
# Display the complete first record of the reloaded dataset (all fields).
loaded_data[0]

{'prompt': 'You are given a web page, the category of the page, randomly selected html elements on that page, the local path to the HTML file that should be scraped and a web-scraping task that you should solve.\n\nHere are some randomly selected HTML elements (containing text), and their corresponding XPaths from the target page:\n<title>(83) Update on $50k NVDA Puts : wallstreetbets</title>\n/html/head/title\n----------------\n<span>Flipping at the Grand Exchange</span>\n/html/body/div[1]/div/div[2]/div[2]/div/div/div/div[2]/div[3]/div[1]/div[3]/div[5]/div/div/div/div[2]/div/div/div/div[2]/div[2]/div[1]/span[2]/div/span\n----------------\n<span class="_1RIl585IYPW6cmNXwgRz0J">User account menu</span>\n/html/body/div[1]/div/div[2]/div[1]/header/div/div[2]/div[2]/div/div[2]/button/span[2]\n----------------\n<a class="_3t5uN8xUmg0TOwRCOGQEcU">r/wallstreetbets</a> sir, not \n/html/body/div[1]/div/div[2]/div[2]/div/div/div/div[2]/div[3]/div[1]/div[3]/div[5]/div/div/div/div[47]/div/div/div