In [None]:
# Install/upgrade the third-party packages this notebook uses (%pip targets the running kernel's environment).
# NOTE(review): versions are unpinned, and pandas (imported further down) is not in this list — confirm it is preinstalled.
%pip install langchain langchain_openai nest_asyncio lxml beautifulsoup4 --upgrade

In [33]:
# Get the OpenAI secret key without hardcoding it in the notebook.
# The OPENAI_API_KEY environment variable is used when it is set, so that
# "Restart & Run All" can run unattended; otherwise the user is prompted
# (getpass hides the typed key so it never appears in the cell output).
import getpass
import os

secret_key = os.environ.get('OPENAI_API_KEY') or getpass.getpass('Please enter your openai key:')

In [4]:
# Works around an asyncio/Jupyter incompatibility by calling nest_asyncio.apply()
# before the later cells run.
# NOTE(review): presumably required because the kernel already has a running
# event loop when the sitemap pages are fetched — confirm.
import nest_asyncio

nest_asyncio.apply()

In [25]:
from langchain.document_loaders.sitemap import SitemapLoader
from langchain_openai.chat_models import ChatOpenAI
from langchain.chains import openai_functions
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.output_parsers import StructuredOutputParser

import pandas as pd

In [18]:
# Load every page listed in the Gmail sitemap into `docs`.
GMAIL_SITEMAP_URL = "https://www.google.com/gmail/sitemap.xml"

sitemap_loader = SitemapLoader(web_path=GMAIL_SITEMAP_URL)
# Throttle the crawl by setting the loader's request rate before loading.
sitemap_loader.requests_per_second = 5
docs = sitemap_loader.load()

Fetching pages: 100%|##########| 166/166 [00:09<00:00, 17.20it/s]


In [34]:
# Schema
def _field(kind, description):
    """Build one property entry (type + description) of the tagging schema."""
    return {"type": kind, "description": description}


# Properties the tagging chain is asked to fill in for each page.
_properties = {
    "url": _field("string", "The URL of the page."),
    "lastmod": _field("string", "The last modification date of the page."),
    "changefreq": _field("string", "How frequently the page is likely to change."),
    "priority": _field("number", "The priority of the page relative to other pages on the site."),
}

# Every declared property is required, listed in declaration order.
schema = {"properties": _properties, "required": list(_properties)}


# LLM: chat model at temperature 0, authenticated with the key collected earlier.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    api_key=secret_key,
)
# Tagging chain that fills in `schema` for a given input; the result is stored under the "output" key.
chain = openai_functions.create_tagging_chain(schema=schema, llm=llm, output_key="output")



In [None]:
# Run the tagging chain over a sample of the loaded pages and collect one
# output dict per page, in the same order as `docs`.
results = []

# Remove the 0:10 to run on all documents:
for doc in docs[0:10]:
    # Log only the page URL as progress; printing the whole Document would dump
    # its full page text into the notebook output on every iteration.
    print(doc.metadata['source'])
    chain_result = chain.invoke({'input': doc.page_content})
    results.append(chain_result['output'])

In [36]:
# Display the tagging outputs collected by the loop above (one dict per processed page).
results

[{'url': 'https://www.google.com/gmail/',
  'lastmod': '2023-10-01',
  'changefreq': 'daily',
  'priority': 1},
 {'url': 'https://workspace.google.com/products/gmail/',
  'lastmod': '2023-10-01',
  'changefreq': 'monthly',
  'priority': 1},
 {'url': 'https://www.google.com/gmail/about/',
  'lastmod': '2023-10-01',
  'changefreq': 'monthly',
  'priority': 1},
 {'url': 'https://www.google.com/gmail/',
  'lastmod': '2023-10-01',
  'changefreq': 'daily',
  'priority': 1},
 {'url': 'https://workspace.google.com/products/gmail/',
  'lastmod': '2023-10-01',
  'changefreq': 'monthly',
  'priority': 1},
 {'url': 'https://www.google.com/gmail/about/',
  'lastmod': '2023-10-01',
  'changefreq': 'monthly',
  'priority': 1},
 {'url': 'https://www.google.com/gmail/',
  'lastmod': '2023-10-01',
  'changefreq': 'daily',
  'priority': 1},
 {'url': 'https://workspace.google.com/products/gmail/',
  'lastmod': '2023-10-01',
  'changefreq': 'monthly',
  'priority': 1},
 {'url': 'https://www.google.com/gmai

In [None]:
# Build a pandas DataFrame from the collected tagging results.
df = pd.DataFrame(data=results)

In [39]:
# Combine the URLs with the results: overwrite the model-generated `url` column
# with each page's real source URL taken from the document metadata.
# Rows were appended in `docs` order, so the first len(df) documents line up
# with the rows; slicing by len(df) keeps this cell correct when the sample
# size used by the tagging loop is changed (a hard-coded 0:10 would raise a
# length-mismatch error as soon as the loop runs on a different number of docs).
df['url'] = [doc.metadata['source'] for doc in docs[:len(df)]]

In [40]:
# Display the final table (tagging results with the source URLs filled in).
df

Unnamed: 0,url,lastmod,changefreq,priority
0,https://www.google.com/intl/am/gmail/about/,2023-10-01,daily,1
1,https://www.google.com/intl/am/gmail/about/for...,2023-10-01,monthly,1
2,https://www.google.com/intl/am/gmail/about/pol...,2023-10-01,monthly,1
3,https://www.google.com/intl/ar/gmail/about/,2023-10-01,daily,1
4,https://www.google.com/intl/ar/gmail/about/for...,2023-10-01,monthly,1
5,https://www.google.com/intl/ar/gmail/about/pol...,2023-10-01,monthly,1
6,https://www.google.com/intl/bg/gmail/about/,2023-10-01,daily,1
7,https://www.google.com/intl/bg/gmail/about/for...,2023-10-01,monthly,1
8,https://www.google.com/intl/bg/gmail/about/pol...,2023-10-01,monthly,1
9,https://www.google.com/intl/bn/gmail/about/,2023-10-01,daily,1
