# System dependencies

In [1]:
# Run these in terminal, or do it here (may require sudo password).
# The list covers every third-party package this notebook imports
# (langchain-community, python-dotenv and nest_asyncio included).
# ! pip install -q langchain-openai langchain langchain-community langchain_text_splitters tiktoken playwright beautifulsoup4 python-dotenv nest_asyncio
# ! playwright install-deps
# ! playwright install

# Set env var OPENAI_API_KEY or load from a .env file.
# load_dotenv returns False when it did not set any variable (for example
# when the file does not exist at this relative path); in that case make
# sure OPENAI_API_KEY is already exported in the environment.
import dotenv
dotenv.load_dotenv('../../.env')

False

# Load HTML
The next code block applies `nest_asyncio` so that asyncio event loops can be nested inside the notebook, which works around the async errors described in https://github.com/langchain-ai/langchain/issues/9014

In [2]:
import nest_asyncio
# Patch asyncio to allow nested event loops; must run before any async
# loader is used in this notebook.
nest_asyncio.apply()

In [3]:
from langchain_openai import ChatOpenAI

# Shared chat model used by extract() below; temperature=0 keeps sampling
# as deterministic as the API allows.
# NOTE(review): "gpt-3.5-turbo-0613" is a dated snapshot -- confirm the API
# still serves it, or switch to a currently supported model.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

In [4]:
from langchain.chains import create_extraction_chain

# Extraction schema for the substance details of a report: every property
# is a free-form string and only "Substance" is mandatory.
category_schema = {
    "properties": {
        "Dose": {"type": "string"},
        "Method": {"type": "string"},
        "Substance": {"type": "string"},
    },
    "required": ["Substance"],
}

# Extraction schema for the report itself: every property is a free-form
# string and only "Full Text Body" is mandatory.
# NOTE(review): "Full Text Body" asks the model to echo the whole report,
# which looks slow and output-token heavy -- confirm it is really needed.
text_schema = {
    "properties": {
        "Author": {"type": "string"},
        "Body Weight": {"type": "string"},
        "Full Text Body": {"type": "string"},
    },
    "required": ["Full Text Body"],
}


def extract(content: str, schema: dict):
    """Run a schema-driven extraction chain over ``content``.

    Args:
        content: Raw text handed to the chain as its input.
        schema: Extraction schema (``properties`` / ``required``) that
            tells the chain which fields to pull out.

    Returns:
        Whatever the chain's ``invoke`` returns, unmodified.
    """
    # A fresh chain is assembled for each call, bound to the notebook-level llm.
    extraction_chain = create_extraction_chain(schema=schema, llm=llm)
    result = extraction_chain.invoke(content)
    return result

In [15]:
import pprint # pretty print

from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter


def scrape_with_playwright(urls, text_schema, category_schema):
    """Scrape pages with headless Chromium and run LLM extraction on every chunk.

    Args:
        urls: List of page URLs handed to ``AsyncChromiumLoader``.
        text_schema: Extraction schema passed to ``extract`` for the
            report-level fields.
        category_schema: Extraction schema passed to ``extract`` for the
            substance fields.

    Returns:
        A list with one dict per processed chunk, in the order the chunks
        were produced. Each dict has the keys ``"text"`` and
        ``"categories"``, holding the ``"text"`` entry of the respective
        extraction result. The list is empty when no chunk was produced.
    """
    loader = AsyncChromiumLoader(urls)
    docs = loader.load()
    bs_transformer = BeautifulSoupTransformer()
    docs_transformed = bs_transformer.transform_documents(
        docs,
        unwanted_tags=["script", "style", "exclude"],
        tags_to_extract=["div", "table"],
    )

    print("Extracting content with LLM")

    # Split the transformed pages into chunks of at most 2000 tokens
    # (measured with tiktoken), without overlap between chunks.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=2000, chunk_overlap=0
    )
    splits = splitter.split_documents(docs_transformed)

    # Process every chunk: two LLM calls per chunk, one per schema.
    # Results are printed as they arrive and also collected, so the caller
    # receives them instead of None.
    results = []
    for split in splits:
        extracted_text = extract(schema=text_schema, content=split.page_content)
        extracted_categories = extract(schema=category_schema, content=split.page_content)
        pprint.pprint(extracted_text["text"])
        pprint.pprint(extracted_categories["text"])
        results.append(
            {
                "text": extracted_text["text"],
                "categories": extracted_categories["text"],
            }
        )
    return results


# Erowid experience reports to push through the scrape-and-extract pipeline.
urls = [
    "https://www.erowid.org/experiences/exp.php?ID=117352",
    "https://www.erowid.org/experiences/exp.php?ID=89042",
]
extracted_content = scrape_with_playwright(
    urls,
    text_schema=text_schema,
    category_schema=category_schema,
)

Extracting content with LLM
[{'Author': 'Sally_Queenz',
  'Body Weight': '67 kg',
  'Full Text Body': 'Yesterday I decided to take the first proper plunge into '
                    'combining dissociatives with psychedelics. Ive combined '
                    'O-PCE with 5-MeO-MiPT twice but I dont think the Moxy had '
                    'enough influence to call it a proper exploration of this '
                    'space. I will try my best to use timestamps but all of '
                    'this is coming from memory the next day so it might not '
                    'be the most accurate.  The doses here are what works for '
                    'me personally, 15mg of 3-MeO-PCE is somewhat weak and '
                    'boring on its own, maybe a medium dose, because my '
                    'tolerance to dissos is pretty high. I also seem to not be '
                    'very sensitive to oral 2C-B, but maybe its just the food '
                    'in my stomach inhibiting the

KeyboardInterrupt: 