# System dependencies

In [1]:
# Run these in terminal, or do it here (may require sudo password)
# ! pip install langchain-openai langchain langchain_text_splitters tiktoken playwright beautifulsoup4 
# ! playwright install-deps
# ! playwright install

# Set env var OPENAI_API_KEY or load from a .env file:
# The path below is relative, so it depends on the directory the notebook runs from.
# NOTE(review): the recorded output of this cell is False — presumably no variables
# were loaded from this path; confirm OPENAI_API_KEY is provided some other way.
import dotenv
dotenv.load_dotenv('../../.env')

False

# Load HTML
The next code block applies `nest_asyncio` to resolve some async issues that occur when loading pages from inside a notebook, as described in https://github.com/langchain-ai/langchain/issues/9014

In [2]:
# Patch asyncio before any page loading happens.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and the async page loader needs to start its own — confirm.
import nest_asyncio
nest_asyncio.apply()

In [3]:
from langchain_openai import ChatOpenAI

# Single chat model instance shared by the extraction chain and by both the map
# and reduce LLM chains defined in later cells.
# NOTE(review): the model name is pinned to a dated snapshot — confirm it is
# still available before re-running the notebook.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

In [4]:
from langchain.chains import create_extraction_chain

# Substance entries: there can be several per page, and they should only
# appear in the first split.
category_schema = {
    "properties": {
        field: {"type": "string"} for field in ("Dose", "Method", "Substance")
    },
    "required": ["Substance"],
}

# Page-level metadata: one per page, found only in the first split.
prop_schema = {
    "properties": {
        field: {"type": "string"} for field in ("Author", "Body Weight")
    },
}

# Narrative text: can occur multiple times per page.
text_schema = {
    "properties": {
        field: {"type": "string"} for field in ("Full Text Body",)
    },
    "required": ["Full Text Body"],
}


def extract(content: str, schema: dict):
    """Run an extraction chain for ``schema`` over ``content``.

    A new chain is created on every call from ``schema`` and the module-level
    ``llm``, then invoked with ``content``.

    Args:
        content: Text passed to the chain's ``invoke``.
        schema: Schema dict handed to ``create_extraction_chain``.

    Returns:
        Whatever the chain's ``invoke`` returns; callers in this notebook read
        its ``"text"`` entry.
    """
    extraction_chain = create_extraction_chain(schema=schema, llm=llm)
    return extraction_chain.invoke(content)

In [5]:
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain

# Map step: prompt that receives documents as {docs} and asks the model to
# identify the experiences being described.
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the experiences being described. 
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce step: prompt that receives summaries as {docs}, asks for one
# consolidated summary, and tells the model not to name the substance.
reduce_template = """The following is set of summaries:
{docs}
Take these and distill it into a final, consolidated summary of the experience being described.
Do not mention the name of the substance being used. 
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
# Wrap the reduce prompt in an LLMChain; nothing is executed at this point.
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
# (here the reduce chain, with the combined text bound to the "docs" variable).
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is the final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # Used if documents exceed context for `StuffDocumentsChain`; note that the
    # same chain object is passed for both the final combine and the collapse.
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)   

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # False: the results of the individual map steps are not returned in the output
    return_intermediate_steps=False,
)

In [6]:
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter


def scrape_erowid(sub_id, prop_schema, category_schema):
    """Scrape one Erowid experience report and extract structured data from it.

    The report page for ``sub_id`` is loaded, reduced to the content of its
    ``div`` and ``table`` tags, split with a tiktoken-based splitter, and then
    passed through the extraction helper and the map-reduce chain.

    Args:
        sub_id: Report identifier; its string form is appended to the
            experience URL as the ``ID`` query value.
        prop_schema: Extraction schema applied to the first chunk to produce
            the ``"meta"`` entry.
        category_schema: Extraction schema applied to the first chunk to
            produce the ``"substances"`` entry.

    Returns:
        dict with the keys ``"substances"``, ``"meta"`` and ``"experience"``
        (in that order).

    Raises:
        IndexError: if splitting yields no chunks, because the first chunk is
            indexed unconditionally.
    """
    id_text = str(sub_id)
    page_url = "https://www.erowid.org/experiences/exp.php?ID=" + id_text

    # Fetch the page and strip it down to the tags of interest.
    raw_docs = AsyncChromiumLoader([page_url]).load()
    cleaned_docs = BeautifulSoupTransformer().transform_documents(
        raw_docs,
        unwanted_tags=["script", "style", "exclude"],
        tags_to_extract=["div", "table"],
    )

    print("Extracting ID: " + id_text)

    # Chunk the cleaned text (chunk_size=2000, no overlap).
    chunker = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=2000, chunk_overlap=0
    )
    chunks = chunker.split_documents(cleaned_docs)

    # Categories and props are both read from the first chunk, which sits at
    # the top of the page.
    first_chunk_text = chunks[0].page_content
    substances = extract(schema=category_schema, content=first_chunk_text)["text"]
    meta = extract(schema=prop_schema, content=first_chunk_text)["text"]

    # Map-reduce over every chunk so the token limit is not exceeded.
    experience = map_reduce_chain.invoke(chunks)["output_text"]

    return {"substances": substances, "meta": meta, "experience": experience}


In [7]:
import json
from pathlib import Path

# Directory that receives one JSON file per scraped report; created up front so
# the first write does not fail when the directory is missing.
output_dir = Path("data/erowid")
output_dir.mkdir(parents=True, exist_ok=True)

for sub_id in range(1, 100):
    # Scrape before opening the output file so that a failed scrape does not
    # leave an empty or truncated JSON file behind.
    report = scrape_erowid(sub_id=sub_id, prop_schema=prop_schema, category_schema=category_schema)

    # The context manager flushes and closes the file on every iteration;
    # previously each handle was left open for the rest of the session.
    with open(output_dir / (str(sub_id) + ".json"), "w") as out_file:
        json.dump(report, out_file, indent=6)

Extracting ID: 1
Extracting ID: 2
Extracting ID: 3
Extracting ID: 4
Extracting ID: 5
Extracting ID: 6
Extracting ID: 7
