In [None]:
import io
import os
import re
import uuid
from subprocess import Popen, PIPE

import docx2txt
import PyPDF2
import requests
from bs4 import BeautifulSoup

# Fetch the policy-guidance index page and parse it; later cells read `soup`.
main_page = "https://sites.ed.gov/idea/policy-guidance/"
response = requests.get(main_page)
# Fail loudly on an HTTP error instead of silently parsing an error page,
# which would leave the rest of the notebook working on zero listing items.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Every <div class="idea-file-item"> on the index page; each one is handed to
# process_file_item() by the scraping loop.
file_items = soup.find_all("div", class_="idea-file-item")

def get_pdf_text(pdf_link):
    """Download a PDF and return the concatenated text of all its pages.

    Args:
        pdf_link: URL of the PDF to download.

    Returns:
        The extracted text, or one of two sentinel strings:
        "Could not download PDF" when the response status is not 200, and
        "Could not extract text from PDF" when parsing or extraction fails.
    """
    response = requests.get(pdf_link)
    if response.status_code != 200:
        print(f"Could not download PDF from {pdf_link}")
        return "Could not download PDF"
    try:
        # Parse straight from memory: no temporary file is written to the
        # working directory, so nothing can be left behind on failure.
        pdf = PyPDF2.PdfReader(io.BytesIO(response.content))
        # `or ""` guards against a page whose extract_text() yields None, so
        # one text-less page does not discard the whole document.
        return "".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:
        # Deliberately best-effort, but `Exception` (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop a long-running scrape.
        return "Could not extract text from PDF"

def get_doc_text(doc_link):
    """Download a Word document (.doc or .docx) and return its plain text.

    `.doc` files are converted by running the external `antiword` program;
    `.docx` files are read with `docx2txt`. The download is written to a
    uniquely named temporary file in the working directory, which is always
    removed again, whether extraction succeeds or fails.

    Args:
        doc_link: URL of the document; its extension selects the extractor.

    Returns:
        The extracted text, or one of two sentinel strings:
        "Could not download document" when the response status is not 200, and
        "Could not extract text from document" when extraction fails or the
        link has neither a .doc nor a .docx extension.
    """
    response = requests.get(doc_link)
    if response.status_code != 200:
        print(f"Could not download document from {doc_link}")
        return "Could not download document"

    lowered_link = doc_link.lower()
    if lowered_link.endswith(".docx"):
        extension = ".docx"
    elif lowered_link.endswith(".doc"):
        extension = ".doc"
    else:
        print(f"Could not extract text from document {doc_link}")
        return "Could not extract text from document"

    temp_path = str(uuid.uuid4()) + extension
    try:
        with open(temp_path, "wb") as f:
            f.write(response.content)
        if extension == ".doc":
            process = Popen(["antiword", temp_path], stdout=PIPE)
            raw_output, _ = process.communicate()
            text = raw_output.decode("utf-8", "ignore")
            # Drop everything up to and including the "filter : Text" line and
            # the BOM character that follows it, when that marker is present.
            marker = "filter : Text\n\ufeff"
            marker_index = text.find(marker)
            if marker_index != -1:
                text = text[marker_index + len(marker):]
        else:
            text = docx2txt.process(temp_path)
        return text
    except Exception:
        print(f"Could not extract text from document {doc_link}")
        return "Could not extract text from document"
    finally:
        # Remove the file that was actually written. The previous code always
        # removed "<name>.doc", which does not exist for .docx downloads, so
        # every .docx ended in the failure branch and left its file behind.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    

    
def process_file_item(file_item):
    """Turn one listing element from the index page into a record with its documents.

    Args:
        file_item: Parsed "idea-file-item" element. It must contain an <h3>
            with a link, a "topic-area-list" div and a "description" div.

    Returns:
        A dict with keys "title", "link", "topic_area", "description" and
        "docs". "docs" is a list of (link, title, text) tuples: one per linked
        PDF, followed by one per linked .doc/.docx. When the item links to no
        such files, the linked page is fetched instead and "docs" holds a
        single tuple with that page's text; "title" is then the page's <h1>.
    """
    heading = file_item.find("h3")
    title = heading.text
    link = heading.find("a")["href"]
    topic_area = file_item.find("div", class_="topic-area-list").text
    topic_area = re.sub(r"Topic Areas: ", "", topic_area)
    description = file_item.find("div", class_="description").text
    description = re.sub(r"Read More", "", description)

    # Map href -> link text, keeping first-seen order and collapsing repeats.
    pdf_links_titles = {}
    doc_docx_links_titles = {}
    for anchor in file_item.find_all("a"):
        # Anchors without an href (e.g. named jump targets) are skipped;
        # subscripting them would raise KeyError and abort the whole item.
        href = anchor.get("href")
        if not href:
            continue
        lowered_href = href.lower()
        if lowered_href.endswith(".pdf"):
            pdf_links_titles[href] = anchor.text
        elif lowered_href.endswith((".doc", ".docx")):
            doc_docx_links_titles[href] = anchor.text

    if pdf_links_titles or doc_docx_links_titles:
        pdfs = [(href, text, get_pdf_text(href)) for href, text in pdf_links_titles.items()]
        doc_docxs = [(href, text, get_doc_text(href)) for href, text in doc_docx_links_titles.items()]
        return {"title": title, "link": link, "topic_area": topic_area, "description": description, "docs": pdfs + doc_docxs}

    # No attached files: use the text of the linked page itself as the document.
    link_soup = BeautifulSoup(requests.get(link).text, "html.parser")
    main = link_soup.find("main", class_="site-main")
    title = main.find("h1").text
    main_content = main.find('div', class_="idea-file-item")
    # Strip the topic-area list from the page text, if the page has one.
    topic_list = main_content.find('div', class_="topic-area-list")
    if topic_list is not None:
        topic_list.decompose()
    main_text = main_content.text
    return {"title": title, "link": link, "topic_area": topic_area, "description": description, "docs": [(link, title, main_text)]}


In [None]:
from tqdm import tqdm
import json

# Scrape every listing item, showing a progress bar while doing so.
data = []
with tqdm(file_items, total=len(file_items)) as progress:
    for item in progress:
        data.append(process_file_item(item))

# Persist the raw scrape so later cells can start from disk.
with open("data.json", "w") as out_file:
    json.dump(data, out_file)

In [None]:
import json

with open("data.json", "r") as in_file:
    data = json.load(in_file)

# Cleanup: tidy each entry's docs, dropping duplicates and near-empty texts.
#   1. Drop any document whose text is shorter than 100 characters.
#   2. When a title ending in "PDF" has a twin ending in "MS WORD" (same title
#      otherwise), keep only the MS WORD one.
# Not done here: looking for documents with a fuzz score over 90.
for entry in data:
    all_docs = entry["docs"]
    # Normalised titles of every MS WORD document in this entry.
    word_stems = {
        doc[1].lower().replace("ms word", "").replace("word", "").strip()
        for doc in all_docs
        if doc[1].lower().endswith("ms word")
    }
    kept_docs = []
    for doc in all_docs:
        if len(doc[2]) < 100:
            continue
        lowered_title = doc[1].lower()
        if lowered_title.endswith("pdf") and lowered_title.replace("pdf", "").strip() in word_stems:
            continue
        kept_docs.append(doc)
    entry["docs"] = kept_docs

with open("data_cleaned.json", "w") as out_file:
    json.dump(data, out_file)

In [None]:
# Reload the cleaned records from data_cleaned.json. `json` must already have
# been imported by an earlier cell.
with open("data_cleaned.json", "r") as f:
    data = json.load(f)

from llama_index.core import Document
def get_documents(entry):
    """Build one `Document` per (link, title, text) item in `entry['docs']`.

    Metadata attached to every document:
        parent_title    - embedder and LLM
        document_title  - embedder and LLM
        parent_link     - excluded from both
        document_link   - excluded from both
        topic_area      - embedder only
        description     - embedder only
    The document text itself goes in the `text` field.

    Args:
        entry: Record with "title", "link", "topic_area", "description" and
            "docs" keys.

    Returns:
        List of `Document` objects, in the order of `entry['docs']`.
    """
    hidden_from_embedder = ("parent_link", "document_link")
    hidden_from_llm = ("parent_link", "document_link", "topic_area", "description")

    built = []
    for item in entry['docs']:
        item_link, item_title, item_text = item[0], item[1], item[2]
        metadata = {
            "parent_title": entry['title'],
            "document_title": item_title,
            "parent_link": entry['link'],
            "document_link": item_link,
            "topic_area": entry['topic_area'],
            "description": entry['description'],
        }
        document = Document(text=item_text, metadata=metadata)
        # Fresh lists per document so no two documents share one list object.
        document.excluded_embed_metadata_keys = list(hidden_from_embedder)
        document.excluded_llm_metadata_keys = list(hidden_from_llm)
        built.append(document)
    return built

# Flatten the per-entry lists into one list of documents. A nested
# comprehension is linear, whereas sum(..., []) re-copies the accumulated
# list once per entry.
documents = [document for entry in data for document in get_documents(entry)]

In [None]:
# Save the documents to disk as a pickle.
# NOTE: pickle files can execute arbitrary code when loaded; only load
# documents.pkl from a trusted source.
import pickle
with open("documents.pkl", "wb") as f:  
    pickle.dump(documents, f)