# Workplace Search Web Scraper Test Notebook

This notebook demonstrates how to use `scrapy` and `requests` to scrape web pages (in this example, from `gov.uk`), summarise them with an LLM using LangChain, and then index them into Elasticsearch for querying.


## Import Libraries


In [36]:
import requests
import tiktoken
import json
import time
from getpass import getpass

from scrapy.http import HtmlResponse
from tqdm.notebook import tqdm

from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredHTMLLoader, UnstructuredURLLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from unstructured.cleaners.core import (
    clean,
    clean_extra_whitespace,
    group_broken_paragraphs,
    remove_punctuation,
)
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_dict

## Define Functions


In [37]:
def scrapy_get_html(url: str, timeout: float = 30.0) -> "dict | None":
    """
    Make an HTTP request to the given URL and extract specific data from the HTML response using scrapy.

    Args:
        url (str): The URL to scrape.
        timeout (float, optional): Seconds to wait for the server before giving up;
            passed straight to ``requests.get``. Defaults to 30.0.

    Returns:
        dict | None: ``None`` (after printing a message) when the response status
        code is not 200; otherwise a dictionary containing the extracted data.
        A value is ``None`` when its selector matches nothing.
            - main_content_html (str): HTML content of the main section.
            - page_title (str): Title of the page.
            - updated (str): Updated timestamp of the page.
            - created (str): First published timestamp of the page.

    Raises:
        requests.exceptions.RequestException: If the request itself fails,
            including when the timeout elapses.
    """
    # Without an explicit timeout requests will wait indefinitely on a stalled server
    res = requests.get(url, timeout=timeout)

    # Bail out early on anything other than a plain success
    if res.status_code != 200:
        print(f"Failed to fetch {url}. Status code: {res.status_code}")
        return None

    # Wrap the body in an HtmlResponse so scrapy's CSS selectors can be used on it
    html_response = HtmlResponse(url, body=res.text, encoding="utf-8")

    return {
        "main_content_html": html_response.css(
            'main#content div[data-module="govspeak"]'
        ).get(),
        "page_title": html_response.css("main#content h1::text").get(),
        "updated": html_response.css(
            'head meta[name="govuk:updated-at"][content]::attr(content)'
        ).get(),
        "created": html_response.css(
            'head meta[name="govuk:first-published-at"][content]::attr(content)'
        ).get(),
    }

In [38]:
def num_tokens_from_string(string: str, model_name: str) -> int:
    """
    Count how many tokens a string occupies under a model's tiktoken encoding.

    Args:
        string: The text to tokenise.
        model_name: Model name handed to ``tiktoken.encoding_for_model`` to pick the encoding.

    Returns:
        The length of the token sequence produced by encoding ``string``.
    """
    # Look up the model's encoding and tokenise in one step
    tokens = tiktoken.encoding_for_model(model_name).encode(string)
    return len(tokens)

In [39]:
def generate_document(url: str) -> Document:
    """
    Generates a langchain document object from a given URL.

    Args:
        url (str): The URL of the webpage.

    Returns:
        Document: The generated document object. ``page_content`` is the cleaned
        text of the page's NarrativeText, ListItem and Table elements; ``metadata``
        holds ``url``, ``name``, ``created_on`` and ``updated_at``.

    Raises:
        ValueError: If the page could not be fetched, or no main content section
            was found in it.
    """
    # Get the HTML content of the webpage
    response = scrapy_get_html(url)

    # scrapy_get_html returns None on a non-200 response; fail with a clear
    # message instead of a TypeError from subscripting None
    if response is None:
        raise ValueError(f"Could not fetch page content from {url}")

    # The main-content selector may match nothing on pages with a different layout
    main_content_html = response["main_content_html"]
    if not main_content_html:
        raise ValueError(f"No main content section found at {url}")

    # Extract the page title and remove extra whitespace (empty if no title matched)
    page_title = clean_extra_whitespace(response["page_title"] or "")

    # Partition the HTML into elements and convert them to a list of dictionaries
    elements_dict = convert_to_dict(partition_html(text=main_content_html))

    # Keep only the element types that carry body text
    wanted_types = {"NarrativeText", "ListItem", "Table"}

    # Combine the text from the wanted elements and remove extra whitespace
    full_clean = clean_extra_whitespace(
        " ".join(e["text"] for e in elements_dict if e["type"] in wanted_types)
    )

    # Create a Document object with the page content and metadata
    return Document(
        page_content=full_clean,
        metadata={
            "url": url,
            "name": page_title,
            "created_on": response["created"],
            "updated_at": response["updated"],
        },
    )

In [40]:
def summarize_document(
    doc: Document,
    model_name: str,
    model_max_tokens: int,
    openai_key: str,
    verbose: bool = False,
) -> str:
    """
    Summarizes a document using the ChatOpenAI model.

    Documents whose token count is below ``model_max_tokens`` are summarized in a
    single call ('stuff' chain). Larger documents are split into token-bounded
    chunks that are summarized separately and then combined ('map_reduce' chain).

    Args:
        doc (Document): The document to summarize.
        model_name (str): The name of the ChatOpenAI model.
        model_max_tokens (int): The maximum number of tokens allowed by the model.
        openai_key (str): The API key for OpenAI.
        verbose (bool, optional): Whether to print verbose output. Defaults to False.

    Returns:
        str: The summarized document, with extra whitespace removed.
    """

    llm = ChatOpenAI(model_name=model_name, temperature=0, openai_api_key=openai_key)

    prompt_template = """Write a concise summary of the following: {text}"""
    prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

    num_tokens = num_tokens_from_string(doc.page_content, model_name)

    if num_tokens < model_max_tokens:
        # Use the 'stuff' chain for smaller documents
        chain = load_summarize_chain(
            llm, chain_type="stuff", prompt=prompt, verbose=verbose
        )
        docs_to_summarize = [doc]

    else:
        # Use the 'map_reduce' chain for larger documents
        chain = load_summarize_chain(
            llm,
            chain_type="map_reduce",
            map_prompt=prompt,
            combine_prompt=prompt,
            verbose=verbose,
        )

        # map_reduce summarizes each document it receives separately but does not
        # split them itself, so an oversized document must be chunked first.
        # Split on single spaces: generate_document collapses whitespace, so the
        # splitter's default paragraph separator would never match. Half the
        # model limit per chunk leaves headroom for the prompt and the reply.
        splitter = CharacterTextSplitter.from_tiktoken_encoder(
            model_name=model_name,
            separator=" ",
            chunk_size=max(1, model_max_tokens // 2),
            chunk_overlap=0,
        )
        docs_to_summarize = splitter.split_documents([doc])

    summary = chain.run(docs_to_summarize)

    return clean_extra_whitespace(summary)

In [41]:
def doc_to_dict(doc: Document, summary: str) -> dict:
    """
    Flatten a Document and its summary into a single dictionary record.

    Args:
        doc (Document): The Document whose content and metadata are copied.
        summary (str): The summary text to store alongside the content.

    Returns:
        dict: A record with the document content, the summary, the ``name``,
        ``url``, ``created_on`` and ``updated_at`` metadata values, and three
        fixed fields (``category``, ``_run_ml_inference``, ``rolePermissions``).
    """
    # Start with the text fields
    record = {"content": doc.page_content, "summary": summary}

    # Copy the metadata fields across under the same names
    for field in ("name", "url", "created_on", "updated_at"):
        record[field] = doc.metadata[field]

    # Constant fields attached to every record
    record["category"] = "hmrc"
    record["_run_ml_inference"] = True
    record["rolePermissions"] = ["demo", "manager"]

    return record

In [42]:
def convert_to_serializable(obj):
    """
    Converts the given object to a JSON-serializable format.

    Suitable as the ``default`` hook of ``json.dump`` / ``json.dumps``: that hook
    must either return something json can encode or raise ``TypeError``. Handing
    an unsupported object back unchanged makes json report a circular reference.

    Args:
        obj: The object to be converted.

    Returns:
        The JSON-serializable version of the object:
            - str: the string with non-ASCII and control characters backslash-escaped.
            - None, bool, int, float, list, tuple, dict: returned unchanged.
            - bytes / bytearray: decoded as UTF-8 (undecodable bytes replaced).
            - set / frozenset: converted to a list.
            - objects with an ``isoformat()`` method (e.g. dates): its result.

    Raises:
        TypeError: If the object is of none of the types above.
    """
    if isinstance(obj, str):
        # Kept as-is for existing direct callers; json never passes str to a default hook
        return obj.encode("unicode_escape").decode("utf-8")

    # Values json can already encode need no conversion
    if obj is None or isinstance(obj, (bool, int, float, list, tuple, dict)):
        return obj

    if isinstance(obj, (bytes, bytearray)):
        return obj.decode("utf-8", errors="replace")

    if isinstance(obj, (set, frozenset)):
        return list(obj)

    # Duck-type date/datetime/time objects rather than importing datetime here
    isoformat = getattr(obj, "isoformat", None)
    if callable(isoformat):
        return isoformat()

    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

## Define some target URLs


In [43]:
# gov.uk Capital Gains Tax pages, one URL per line (split on whitespace into a list)
capital_gains_urls = """
    https://www.gov.uk/capital-gains-tax
    https://www.gov.uk/capital-gains-tax/what-you-pay-it-on
    https://www.gov.uk/capital-gains-tax/allowances
    https://www.gov.uk/capital-gains-tax/gifts
    https://www.gov.uk/capital-gains-tax/work-out-need-to-pay
    https://www.gov.uk/capital-gains-tax/reporting-and-paying-capital-gains-tax
    https://www.gov.uk/capital-gains-tax/rates
    https://www.gov.uk/capital-gains-tax/losses
    https://www.gov.uk/capital-gains-tax/records
    https://www.gov.uk/capital-gains-tax/market-value
""".split()

# gov.uk Inheritance Tax pages. Every entry needs its trailing comma: adjacent
# string literals without one are silently concatenated into a single bad URL.
inheritance_tax_urls = [
    "https://www.gov.uk/inheritance-tax",
    "https://www.gov.uk/inheritance-tax/passing-on-home",
    "https://www.gov.uk/inheritance-tax/gifts",
    "https://www.gov.uk/inheritance-tax/when-someone-living-outside-the-uk-dies",
    "https://www.gov.uk/paying-inheritance-tax",
    "https://www.gov.uk/paying-inheritance-tax/get-a-reference-number",
    "https://www.gov.uk/paying-inheritance-tax/approve-a-payment",
    "https://www.gov.uk/paying-inheritance-tax/bank-details-for-online-or-telephone-banking-chaps-bacs",
    "https://www.gov.uk/paying-inheritance-tax/bank-or-building-society",
    "https://www.gov.uk/paying-inheritance-tax/by-post",
    "https://www.gov.uk/paying-inheritance-tax/deceaseds-bank-account",
    "https://www.gov.uk/paying-inheritance-tax/british-government-stock",
    "https://www.gov.uk/paying-inheritance-tax/by-transferring-national-heritage-property",
    "https://www.gov.uk/paying-inheritance-tax/yearly-instalments",
    "https://www.gov.uk/paying-inheritance-tax/trusts",
    "https://www.gov.uk/paying-inheritance-tax/pay-early",
    "https://www.gov.uk/tax-property-money-shares-you-inherit",
    "https://www.gov.uk/tax-property-money-shares-you-inherit/money-and-shares",
    "https://www.gov.uk/tax-property-money-shares-you-inherit/property",
    "https://www.gov.uk/tax-property-money-shares-you-inherit/joint-property-shares-bank-accounts",
    "https://www.gov.uk/valuing-estate-of-someone-who-died",
    "https://www.gov.uk/valuing-estate-of-someone-who-died/identify-assets-debts",
    "https://www.gov.uk/valuing-estate-of-someone-who-died/estimate-estate-value",
    "https://www.gov.uk/valuing-estate-of-someone-who-died/check-type-of-estate",
    "https://www.gov.uk/valuing-estate-of-someone-who-died/inheritance-tax-to-pay",
    "https://www.gov.uk/valuing-estate-of-someone-who-died/records",
    "https://www.gov.uk/guidance/work-out-what-part-of-your-estate-pays-inheritance-tax",
    "https://www.gov.uk/guidance/check-if-you-can-get-an-additional-inheritance-tax-threshold",
    "https://www.gov.uk/guidance/inheritance-tax-residence-nil-rate-band",
    "https://www.gov.uk/guidance/inheritance-tax-transfer-of-threshold",
    "https://www.gov.uk/business-relief-inheritance-tax",
    "https://www.gov.uk/business-relief-inheritance-tax/what-qualifies-for-business-relief",
    "https://www.gov.uk/business-relief-inheritance-tax/give-away-business-property-or-assets",
    "https://www.gov.uk/guidance/agricultural-relief-on-inheritance-tax",
    "https://www.gov.uk/guidance/inheritance-tax-double-taxation-relief",
]

# gov.uk "Applying for probate" pages, one URL per line (split on whitespace into a list)
probate_urls = """
    https://www.gov.uk/applying-for-probate
    https://www.gov.uk/applying-for-probate/if-theres-a-will
    https://www.gov.uk/applying-for-probate/if-theres-not-a-will
    https://www.gov.uk/applying-for-probate/before-you-apply
    https://www.gov.uk/applying-for-probate/fees
    https://www.gov.uk/applying-for-probate/apply-for-probate
    https://www.gov.uk/applying-for-probate/after-youve-applied
""".split()

# gov.uk pages on income tax, self assessment, dividends, savings interest,
# shares, company benefits and employee share schemes; one URL per line
# (split on whitespace into a list)
tax_urls = """
    https://www.gov.uk/income-tax
    https://www.gov.uk/income-tax/how-you-pay-income-tax
    https://www.gov.uk/income-tax/taxfree-and-taxable-state-benefits
    https://www.gov.uk/income-tax/find-out-if-you-need-to-pay-income-tax
    https://www.gov.uk/income-tax/check-youre-paying-the-right-amount
    https://www.gov.uk/income-tax-rates
    https://www.gov.uk/income-tax-rates/previous-tax-years
    https://www.gov.uk/income-tax-rates/income-over-100000
    https://www.gov.uk/self-assessment-tax-returns
    https://www.gov.uk/self-assessment-tax-returns/who-must-send-a-tax-return
    https://www.gov.uk/self-assessment-tax-returns/sending-return
    https://www.gov.uk/self-assessment-tax-returns/no-longer-need-to-send-a-tax-return
    https://www.gov.uk/self-assessment-tax-returns/deadlines
    https://www.gov.uk/self-assessment-tax-returns/penalties
    https://www.gov.uk/self-assessment-tax-returns/corrections
    https://www.gov.uk/self-assessment-tax-returns/get-help
    https://www.gov.uk/self-assessment-tax-returns/returns-for-someone-who-has-died
    https://www.gov.uk/simple-assessment
    https://www.gov.uk/check-income-tax-current-year
    https://www.gov.uk/check-income-tax-last-year
    https://www.gov.uk/check-simple-assessment
    https://www.gov.uk/check-additional-income-tax
    https://www.gov.uk/tax-on-dividends
    https://www.gov.uk/apply-tax-free-interest-on-savings
    https://www.gov.uk/apply-tax-free-interest-on-savings/previous-tax-years
    https://www.gov.uk/tax-on-dividends/previous-tax-years
    https://www.gov.uk/tax-buy-shares
    https://www.gov.uk/tax-buy-shares/buy-shares-electronically
    https://www.gov.uk/tax-buy-shares/use-a-stock-transfer-form
    https://www.gov.uk/tax-buy-shares/special-share-arrangements
    https://www.gov.uk/tax-company-benefits
    https://www.gov.uk/tax-company-benefits/tax-on-company-cars
    https://www.gov.uk/tax-company-benefits/other-company-benefits-youll-pay-tax-on
    https://www.gov.uk/tax-company-benefits/taxfree-company-benefits
    https://www.gov.uk/tax-company-benefits/national-insurance-on-company-benefits
    https://www.gov.uk/tax-company-benefits/keeping-records-and-reporting-changes
    https://www.gov.uk/tax-employee-share-schemes/share-incentive-plans-sips
    https://www.gov.uk/tax-employee-share-schemes/save-as-you-earn-saye
    https://www.gov.uk/tax-employee-share-schemes/company-share-option-plan
    https://www.gov.uk/tax-employee-share-schemes/enterprise-management-incentives-emis
    https://www.gov.uk/tax-employee-share-schemes/employee-shareholder-shares
    https://www.gov.uk/tax-employee-share-schemes/transferring-your-shares-to-an-isa
""".split()

# gov.uk pages on PAYE forms (P45, P60, P11D) and payslips, one URL per line
# (split on whitespace into a list)
tax_forms_urls = """
    https://www.gov.uk/paye-forms-p45-p60-p11d
    https://www.gov.uk/paye-forms-p45-p60-p11d/p60
    https://www.gov.uk/paye-forms-p45-p60-p11d/p11d
    https://www.gov.uk/paye-forms-p45-p60-p11d/lost-paye-forms
    https://www.gov.uk/payslips
""".split()

# gov.uk pages on tax, State Pension and National Insurance when living or
# retiring abroad; one URL per line (split on whitespace into a list)
abroad_urls = """
    https://www.gov.uk/tax-right-retire-abroad-return-to-uk
    https://www.gov.uk/state-pension-if-you-retire-abroad
    https://www.gov.uk/state-pension-if-you-retire-abroad/rates-of-state-pension
    https://www.gov.uk/state-pension-if-you-retire-abroad/tax-on-your-state-pension
    https://www.gov.uk/state-pension-if-you-retire-abroad/report-a-change-in-your-circumstances
    https://www.gov.uk/national-insurance-if-you-go-abroad
    https://www.gov.uk/guidance/national-insurance-for-workers-from-the-uk-working-in-the-eea-or-switzerland
    https://www.gov.uk/moving-or-retiring-abroad
    https://www.gov.uk/tax-uk-income-live-abroad
    https://www.gov.uk/tax-uk-income-live-abroad/rent
    https://www.gov.uk/tax-uk-income-live-abroad/selling-or-inheriting-assets
    https://www.gov.uk/tax-uk-income-live-abroad/personal-allowance
    https://www.gov.uk/tax-uk-income-live-abroad/taxed-twice
    https://www.gov.uk/tax-uk-income-live-abroad/uk-resident
""".split()

# gov.uk "Tax on foreign income" pages, one URL per line (split on whitespace into a list)
foreign_income_urls = """
    https://www.gov.uk/tax-foreign-income
    https://www.gov.uk/tax-foreign-income/residence
    https://www.gov.uk/tax-foreign-income/non-domiciled-residents
    https://www.gov.uk/tax-foreign-income/paying-tax
    https://www.gov.uk/tax-foreign-income/foreign-income-thats-taxed-differently
    https://www.gov.uk/tax-foreign-income/taxed-twice
    https://www.gov.uk/tax-foreign-income/study-in-the-uk
""".split()

# gov.uk National Insurance pages (including voluntary contributions), one URL
# per line (split on whitespace into a list)
national_insurance_urls = """
    https://www.gov.uk/voluntary-national-insurance-contributions
    https://www.gov.uk/voluntary-national-insurance-contributions/who-can-pay-voluntary-contributions
    https://www.gov.uk/voluntary-national-insurance-contributions/rates
    https://www.gov.uk/voluntary-national-insurance-contributions/deadlines
    https://www.gov.uk/national-insurance
    https://www.gov.uk/national-insurance/your-national-insurance-number
    https://www.gov.uk/national-insurance/national-insurance-classes
    https://www.gov.uk/national-insurance/how-much-you-pay
    https://www.gov.uk/national-insurance/what-national-insurance-is-for
    https://www.gov.uk/national-insurance/help-if-youre-not-working
    https://www.gov.uk/national-insurance/change-of-circumstance
""".split()

# gov.uk "Tax relief for employees" pages, one URL per line (split on whitespace into a list)
tax_relief_urls = """
    https://www.gov.uk/tax-relief-for-employees
    https://www.gov.uk/tax-relief-for-employees/working-at-home
    https://www.gov.uk/tax-relief-for-employees/uniforms-work-clothing-and-tools
    https://www.gov.uk/tax-relief-for-employees/vehicles-you-use-for-work
    https://www.gov.uk/tax-relief-for-employees/professional-fees-and-subscriptions
    https://www.gov.uk/tax-relief-for-employees/travel-and-overnight-expenses
    https://www.gov.uk/tax-relief-for-employees/buying-other-equipment
""".split()

# gov.uk "Repaying your student loan" pages, one URL per line (split on whitespace into a list)
student_loans_urls = """
    https://www.gov.uk/repaying-your-student-loan
    https://www.gov.uk/repaying-your-student-loan/which-repayment-plan-you-are-on
    https://www.gov.uk/repaying-your-student-loan/when-you-start-repaying
    https://www.gov.uk/repaying-your-student-loan/what-you-pay
    https://www.gov.uk/repaying-your-student-loan/how-you-repay
    https://www.gov.uk/repaying-your-student-loan/make-extra-repayments
    https://www.gov.uk/repaying-your-student-loan/getting-a-refund
    https://www.gov.uk/repaying-your-student-loan/when-your-student-loan-gets-written-off-or-cancelled
    https://www.gov.uk/repaying-your-student-loan/update-your-employment-details
""".split()

## Bring it all together and output some JSON documents


In [44]:
# model_name = "gpt-3.5-turbo"
model_name = "gpt-4"
# Token count at or above which summarize_document switches from the 'stuff'
# chain to the 'map_reduce' chain
model_max_tokens = 4097

# Prompt for the key without echoing it. It must be bound to the same name that
# is passed to summarize_document below (it was previously stored as `password`,
# which left `openai_key` undefined).
openai_key = getpass("Enter your OpenAI API key: ")

docs = []
# Swap in this block to process the full set of URL lists instead of the single test URL
# urls = (
#     capital_gains_urls
#     + inheritance_tax_urls
#     + probate_urls
#     + tax_urls
#     + tax_forms_urls
#     + abroad_urls
#     + foreign_income_urls
# )

urls = ["https://www.gov.uk/capital-gains-tax"]

for url in tqdm(urls):
    # initialise a new document from the url
    doc = generate_document(url)

    # generate a summary from the page_content
    summary = summarize_document(
        doc=doc,
        model_name=model_name,
        model_max_tokens=model_max_tokens,
        openai_key=openai_key,
    )

    # generate a dict from the doc and summary, append to list
    docs.append(doc_to_dict(doc, summary))
    time.sleep(5)  # enable for live running

# Specify the file path where you want to save the JSON
file_path = "data.json"

# Write the list of dictionaries to the file. ensure_ascii=False emits non-ASCII
# characters verbatim, so pin the file encoding to UTF-8 rather than relying on
# the platform default.
with open(file_path, "w", encoding="utf-8") as json_file:
    json.dump(
        docs,
        json_file,
        default=convert_to_serializable,
        ensure_ascii=False,
        indent=4,
    )

  0%|          | 0/1 [00:00<?, ?it/s]