In [None]:
from download_websites import download_websites
from generate_scraping_tasks import generate_scraping_tasks
from generate_solution_code import generate_solution_code
from extract_html_info import extract_relevant_information

import pandas as pd
import os
import json

In [None]:
# Notebook cell: for every selected website, extract the relevant HTML
# information, generate scraping tasks, and generate one solution per task.

websites_csv = 'websites.csv'
save_folder = 'downloaded_pages'
ROWS = (0, 44)  # inclusive (first, last) row indices of the CSV to process
NUM_TASKS_PER_WEBSITE = 10  # forwarded to generate_scraping_tasks

# Download websites (disabled; uncomment to (re)download the pages first)
# download_websites(websites_csv, save_folder)

# Generate scraping tasks and solutions
websites = pd.read_csv(websites_csv)

# The extracted HTML info is written with open(..., 'w') below, which fails
# with FileNotFoundError when the target folder is missing, so create it once.
os.makedirs('extracted_info', exist_ok=True)

first_row, last_row = ROWS

for i, row in websites.iterrows():
    # Only process rows inside the configured inclusive range.
    if not first_row <= i <= last_row:
        continue

    print("--------------------")

    # Each CSV row is expected to hold exactly these three columns, in order.
    category, website, link = row
    HTML_file = f'{save_folder}/{website}.html'

    # Skip websites that were already processed in an earlier run.
    # NOTE(review): this checks 'solution_code/', while the combining cell
    # reads solutions from 'solutions/' -- confirm which folder
    # generate_solution_code actually writes to.
    if os.path.exists(f'solution_code/{website}'):
        print(f'{website} already has data')
        continue

    # Nothing can be generated without the downloaded page.
    if not os.path.exists(HTML_file):
        print(f'{website}.html does not exist')
        continue

    print(f"Generating data for {website}")

    # Extract the relevant HTML elements and persist them for later use.
    HTML_elements = extract_relevant_information(HTML_file)
    with open(f'extracted_info/{website}.txt', 'w') as f:
        f.write(HTML_elements)

    print(f'Extracted HTML elements for {website}')

    # generate_scraping_tasks is expected to write scraping_tasks/{website}.txt
    # (one task per line); that file is read back right after the call.
    generate_scraping_tasks(link, website, category, HTML_elements, NUM_TASKS_PER_WEBSITE)
    with open(f'scraping_tasks/{website}.txt', 'r') as f:
        # Lines are kept verbatim (including the trailing newline) so that the
        # task text and its index match what the combining cell reads later.
        scraping_tasks = f.readlines()

    print(f'Generated scraping tasks for {website}')

    # One solution per task; j is the zero-based position of the task line.
    for j, task in enumerate(scraping_tasks):
        generate_solution_code(website, HTML_file, category, HTML_elements, task, j)

    print(f'Generated solutions for {website}')

print("--------------------")

In [None]:
# Combine all scraping tasks and solutions into one file
#
# Notebook cell: reads the prompt templates, the extracted HTML info, the
# scraping tasks and the generated solutions from disk, builds one prompt and
# one training sample per (website, task) pair, and writes them to data.json.
# Relies on `websites_csv` and `save_folder` defined in the previous cell.


def _read_text(path):
    """Return the full contents of the text file at *path*."""
    with open(path, 'r') as f:
        return f.read()


# Load prompts
# The templates are plain text that is later filled in with str.format, so
# they must be read as strings (a pandas DataFrame has no .format method).
system_prompt = _read_text('prompts/generate_solution_code_system_prompt.txt')
user_prompt = _read_text('prompts/generate_solution_code_user_prompt.txt')

# Load extracted info, keyed by website name
# os.path.splitext removes only the final extension, so website names that
# themselves contain a dot still match the name used in the CSV.
extracted_info = {}
for filename in os.listdir('extracted_info'):
    extracted_info[os.path.splitext(filename)[0]] = _read_text(f'extracted_info/{filename}')

# Load scraping tasks, keyed by website name (one task per line, kept verbatim)
scraping_tasks = {}
for filename in os.listdir('scraping_tasks'):
    with open(f'scraping_tasks/{filename}', 'r') as f:
        scraping_tasks[os.path.splitext(filename)[0]] = f.readlines()

# Load solutions: solutions[folder][task_index] -> solution source text
# The part of the file name before the first dot is used as the task index;
# it is looked up below as str(j).
solutions = {}
for folder in os.listdir('solutions'):
    solutions[folder] = {}
    for filename in os.listdir(f'solutions/{folder}'):
        solutions[folder][filename.split('.')[0]] = _read_text(f'solutions/{folder}/{filename}')

# Load website infos
websites = pd.read_csv(websites_csv)

# Generate prompts
prompts = {}  # Dictionary of dictionaries: prompts[website][task_index]
for _, row in websites.iterrows():
    category, website, _link = row

    # Websites without generated data (for example rows that were never
    # processed) are skipped instead of aborting the export with a KeyError.
    if (website not in scraping_tasks
            or website not in extracted_info
            or website not in solutions):
        print(f'Skipping {website}: missing tasks, extracted info or solutions')
        continue

    HTML_file = f'{save_folder}/{website}.html'
    HTML_elements = extracted_info[website]
    prompts[website] = {}
    for j, scraping_task in enumerate(scraping_tasks[website]):
        task_id = str(j)
        # A prompt without a matching solution cannot become a training sample.
        if task_id not in solutions[website]:
            print(f'Skipping {website} task {task_id}: no solution found')
            continue
        # scraping_task = scraping_task.strip()
        prompts[website][task_id] = user_prompt.format(
            website=website,
            HTML_file=HTML_file,
            category=category,
            HTML_string=HTML_elements,
            task=scraping_task,
        )

# Generate training samples
sample_template = """### System:
{system_prompt}

### User:
{user_prompt}

### Response:
```
{response}
```
"""
training_samples = {}
for website, website_prompts in prompts.items():
    training_samples[website] = {
        task: sample_template.format(
            system_prompt=system_prompt,
            user_prompt=prompt,
            response=solutions[website][task],
        )
        for task, prompt in website_prompts.items()
    }

# Convert prompts, solutions and training samples to lists
# All three lists follow the same (website, task) order, so equal positions
# belong to the same sample.
prompts_list = []
solutions_list = []
training_samples_list = []
for website, website_prompts in prompts.items():
    for task, prompt in website_prompts.items():
        prompts_list.append(prompt)
        solutions_list.append(solutions[website][task])
        training_samples_list.append(training_samples[website][task])

# Combine to a list of dictionaries
data = [
    {'prompt': prompt, 'solution': solution, 'training_sample': training_sample}
    for prompt, solution, training_sample
    in zip(prompts_list, solutions_list, training_samples_list)
]

# Save prompts, solutions and training samples as a json file
# where each 'row' is a prompt, solution and training sample
# for a particular website and task
with open('data.json', 'w') as f:
    json.dump(data, f)