In [261]:
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer
import re

In [311]:
# Root of the wiki; site-relative "/wiki/..." paths are appended to it in a later cell.
base_url = "https://genshin-impact.fandom.com"

# Character list page used as the starting point of the crawl.
urls = [
    "https://genshin-impact.fandom.com/wiki/Character/List",
]

# Download the raw HTML of the listed page(s).
loader = AsyncHtmlLoader(urls)
html = loader.load()

Fetching pages: 100%|############################################################################################################| 1/1 [00:00<00:00, 15.14it/s]


In [312]:
# Keep only the content of <table> elements from the fetched page(s).
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["table"])

In [318]:
# Collect every whitespace-separated token that mentions "wiki", dropping its
# first and last character (presumably the parentheses the transformer wraps
# link targets in -- TODO confirm). De-duplicate and sort so the result does not
# depend on set iteration order.
wiki = sorted(
    {item[1:-1] for item in docs_transformed[0].page_content.split() if "wiki" in item}
)

# Character whose lore page should be fetched (matched case-insensitively).
character = "furina"
matches = [path for path in wiki if character in path.lower()]

# Fail loudly instead of silently falling back to the bare base URL, which
# would make the following cells scrape the wiki home page.
if not matches:
    raise ValueError(f"No wiki path containing {character!r} found in the character list")

# Use exactly one match; concatenating several paths would yield a malformed URL.
search_url = base_url + matches[0] + "/Lore"
search_url

'https://genshin-impact.fandom.com/wiki/Furina/Lore'

In [353]:
# Fetch the page at the URL assembled in the previous cell.
loader = AsyncHtmlLoader(search_url)
html = loader.load()

Fetching pages: 100%|############################################################################################################| 1/1 [00:00<00:00, 12.69it/s]


In [354]:
# Reduce the fetched page to the content of its <p> elements.
docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["p"])

# Character count of the extracted text, displayed as a quick sanity check.
len(docs_transformed[0].page_content)

26225

### Convert to Vector Store

In [362]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import AsyncHtmlLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS

In [363]:
# Fetch the lore page and keep only its paragraph text, so the chunks that get
# split and embedded below contain prose rather than raw HTML markup.
loader = AsyncHtmlLoader(search_url)
raw_documents = loader.load()
documents = bs_transformer.transform_documents(raw_documents, tags_to_extract=["p"])

Fetching pages: 100%|############################################################################################################| 1/1 [00:00<00:00, 14.29it/s]


In [364]:
# Split the documents into chunks of at most 500 characters with a
# 50-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents=documents)

In [366]:
# Sentence-transformers MiniLM embedding model, configured to run on the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

  from .autonotebook import tqdm as notebook_tqdm


In [368]:
# Location (relative to the working directory) where the FAISS index is saved.
DB_FAISS_PATH = "vectorstores/db_faiss"

# Build a FAISS index from the chunks using the embedding model, then save it.
db = FAISS.from_documents(texts, embeddings)
db.save_local(DB_FAISS_PATH)