# System dependencies

In [1]:
# Run these in a terminal, or uncomment them here (may require the sudo password).
# NOTE(review): later cells also import `dotenv`, `nest_asyncio` and
# `langchain_community`, none of which appear in the install line below --
# install them as well if they are missing.
# ! pip install langchain-openai langchain langchain_text_splitters tiktoken playwright beautifulsoup4
# ! playwright install-deps
# ! playwright install

# Set the env var OPENAI_API_KEY, or load it from a .env file.
# NOTE(review): the recorded output of this cell is `False`, which suggests the
# .env file at this relative path was not found or set no variables -- confirm
# the path before relying on it.
import dotenv
dotenv.load_dotenv('../../.env')

False

# Load HTML
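The next code block applies `nest_asyncio` so that the asynchronous page loader can run inside the notebook's already-running event loop. This works around the problem described in https://github.com/langchain-ai/langchain/issues/9014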

In [2]:
import nest_asyncio
# Patch asyncio once for the whole kernel; this must run before any cell that
# loads pages (see the GitHub issue linked in the markdown above).
nest_asyncio.apply()

In [3]:
from langchain_openai import ChatOpenAI

# Single chat model shared by the extraction chain and by both the map and the
# reduce LLM chains defined in later cells.
# temperature=0 is presumably chosen to keep the output as repeatable as possible.
# NOTE(review): "gpt-3.5-turbo-0613" is a pinned, dated model snapshot -- confirm
# it is still served before re-running this notebook.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

In [4]:
from langchain.chains import create_extraction_chain

# Extraction schema for the substance entries of a report.
# A page can contain several of these; they are expected to appear only in the
# first text split. All three fields are strings and all three are required.
category_schema = {
    "properties": {
        "Dose": {"type": "string"},
        "Method": {"type": "string"},
        "Substance": {"type": "string"},
    },
    "required": ["Substance", "Dose", "Method"],
}

# Extraction schema for the report-level properties.
# There is one set per page, expected only in the first text split.
# No "required" list is given here, so neither field is declared mandatory.
prop_schema = {
    "properties": {
        "Author": {"type": "string"},
        "Body Weight": {"type": "string"},
    }
}


def extract(content: str, schema: dict):
    """Run a schema-guided extraction over a piece of text.

    A new extraction chain is built for ``schema`` on top of the notebook-level
    ``llm`` on every call and is then invoked on ``content``.

    Args:
        content: The text to extract structured data from.
        schema: Dict with a "properties" mapping (and optionally "required")
            describing the fields to extract.

    Returns:
        The value returned by the chain's ``invoke``; callers in this notebook
        read its ``"text"`` entry.
    """
    extraction_chain = create_extraction_chain(llm=llm, schema=schema)
    return extraction_chain.invoke(content)

In [5]:
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain

# Map stage: applied to each document chunk on its own.
# The prompt asks what the author is experiencing and tells the model to leave
# the substance name out of its answer. {docs} is the placeholder that receives
# the chunk text.
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify what the author is experiencing. Do not mention the substance name in your response. 
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
# Bind the map prompt to the shared llm.
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce stage: merges the per-chunk summaries into one short retelling.
# The prompt hides the substance use -- presumably so that any later clustering
# of the results is not driven by the substance name (confirm with the author).
# {docs} is the placeholder that receives the summaries.
reduce_template = """The following is set of summaries:
{docs}
Imagine you are retelling the story to someone in an elevator and want to hide the fact that the author is using psychedelics, you only have 30 seconds. 
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
# Bind the reduce prompt to the shared llm (this only builds the chain; nothing
# is executed here).
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this
# to reduce_chain. document_variable_name="docs" matches the {docs} placeholder
# in reduce_template.
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents.
# The same StuffDocumentsChain is used both for the final combine step and for
# collapsing intermediate groups.
reduce_documents_chain = ReduceDocumentsChain(
    # This is the final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # Used if the documents exceed the context for `StuffDocumentsChain`.
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    # NOTE(review): 4000 may leave little headroom for the prompt text and the
    # answer if the configured model has a ~4k-token context window -- confirm.
    token_max=4000,
)   

# Full pipeline: map_chain is run over every document, then the mapped results
# are handed to reduce_documents_chain to be combined.
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    # (matches the {docs} placeholder in map_template).
    document_variable_name="docs",
    # Whether to return the results of the map steps in the output;
    # False here, so the per-document map results are not included.
    return_intermediate_steps=False,
)

In [6]:
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter


def scrape_erowid(sub_id, prop_schema, category_schema):
    """Fetch one Erowid experience report and turn it into structured data.

    The page for ``sub_id`` is loaded with ``AsyncChromiumLoader``, reduced to
    the content of its ``div``/``table`` tags (dropping ``script``, ``style``
    and ``exclude`` tags), and split into chunks of size 2000 measured with the
    tiktoken encoder. The first chunk is run through both extraction schemas;
    all chunks are summarised with the notebook-level ``map_reduce_chain``.

    Args:
        sub_id: Report ID appended to the experience URL.
        prop_schema: Extraction schema for the report-level properties.
        category_schema: Extraction schema for the substance entries.

    Returns:
        ``None`` when the page yields no chunks; otherwise a dict with the keys
        ``"substances"``, ``"meta"`` and ``"experience"``.
    """
    page_url = "https://www.erowid.org/experiences/exp.php?ID=" + str(sub_id)
    raw_docs = AsyncChromiumLoader([page_url]).load()

    cleaned_docs = BeautifulSoupTransformer().transform_documents(
        raw_docs,
        unwanted_tags=["script", "style", "exclude"],
        tags_to_extract=["div", "table"],
    )

    print("Extracting ID: " + str(sub_id))

    # Split into chunks of 2000
    chunker = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=2000, chunk_overlap=0
    )
    splits = chunker.split_documents(cleaned_docs)

    # Nothing to process when the page produced no text.
    if not splits:
        return None

    # Categories and props are both read from the first split, which is the
    # top of the page.
    first_chunk_text = splits[0].page_content
    substances = extract(content=first_chunk_text, schema=category_schema)["text"]
    meta = extract(content=first_chunk_text, schema=prop_schema)["text"]

    # Map-reduce over all splits so the token count isn't exceeded.
    experience = map_reduce_chain.invoke(splits)["output_text"]

    return {
        "substances": substances,
        "meta": meta,
        "experience": experience,
    }


In [9]:
import json
import os

# Directory that receives one JSON file per successfully scraped report.
OUTPUT_DIR = "data/erowid/"

# Create the output directory up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Scrape report IDs 300..399 and store each result as <id>.json.
for sub_id in range(300, 400):
    extracted_data = scrape_erowid(sub_id=sub_id, prop_schema=prop_schema, category_schema=category_schema)

    # scrape_erowid returns None when a page produced no text chunks
    # (some reports were removed), so there is nothing to save.
    if not extracted_data:
        continue

    # The context manager flushes and closes the file even if json.dump raises;
    # the original left every handle open.
    with open(OUTPUT_DIR + str(sub_id) + ".json", "w") as out_file:
        json.dump(extracted_data, out_file, indent=6)

Extracting ID: 300
Extracting ID: 301
Extracting ID: 302
Extracting ID: 303
Extracting ID: 304
Extracting ID: 305
Extracting ID: 306
Extracting ID: 307
Extracting ID: 308
Extracting ID: 309
Extracting ID: 310
Extracting ID: 311
Extracting ID: 312
Extracting ID: 313
Extracting ID: 314
Extracting ID: 315
Extracting ID: 316
Extracting ID: 317
Extracting ID: 318
Extracting ID: 319
Extracting ID: 320
Extracting ID: 321
Extracting ID: 322
Extracting ID: 323
Extracting ID: 324
Extracting ID: 325
Extracting ID: 326
Extracting ID: 327
Extracting ID: 328
Extracting ID: 329
Extracting ID: 330
Extracting ID: 331
Extracting ID: 332
Extracting ID: 333
Extracting ID: 334
Extracting ID: 335
Extracting ID: 336
Extracting ID: 337
Extracting ID: 338
Extracting ID: 339
Extracting ID: 340
Extracting ID: 341
Extracting ID: 342
Extracting ID: 343
Extracting ID: 344
Extracting ID: 345
Extracting ID: 346
Extracting ID: 347
Extracting ID: 348
Extracting ID: 349
Extracting ID: 350
Extracting ID: 351
Extracting I