In [11]:
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer
import re
import torch

In [13]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [74]:
# Root of the wiki; relative "/wiki/..." paths are appended to it further down.
base_url = "https://genshin-impact.fandom.com"
# Page that lists every character; used to discover the per-character paths.
urls = ["https://genshin-impact.fandom.com/wiki/Character/List"]
loader = AsyncHtmlLoader(urls)
# Performs the network fetch; `html` holds the loaded documents (one per URL).
html = loader.load()

Fetching pages: 100%|#############################################################################################################################| 1/1 [00:00<00:00, 12.85it/s]


In [75]:
# Reduce the fetched character list page to the content of its <table> tags
# (tags_to_extract=["table"]); the transformer instance is reused for the
# lore page later on.
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(
    html, tags_to_extract=["table"]
)

In [76]:
# Candidate wiki paths from the character table. Every whitespace-separated
# token containing "wiki" has its first and last character dropped
# (presumably the parentheses the transformer puts around hrefs — TODO confirm).
# The set removes duplicates and sorting makes the order reproducible across runs.
wiki = sorted(
    {item[1:-1] for item in docs_transformed[0].page_content.split() if "wiki" in item}
)

# Build the lore URL from exactly one matching path. Appending every matching
# path would yield a malformed URL when several tokens match, and no match at
# all would silently leave the bare base_url, so both cases are handled here.
character = "furina"
matches = [path for path in wiki if character in path.lower()]
if not matches:
    raise ValueError(
        f"No wiki path containing {character!r} found in the character table"
    )
# Shortest match wins (ties broken alphabetically): the character's own page is
# expected to be the shortest path that contains the name.
search_url = base_url + min(matches, key=lambda path: (len(path), path)) + "/Lore"
search_url

'https://genshin-impact.fandom.com/wiki/Furina/Lore'

In [77]:
# Fetch the character's lore page. `loader` and `html` are rebound here, so the
# character list results from the earlier fetch are no longer reachable
# through these names.
loader = AsyncHtmlLoader(search_url)
html = loader.load()

Fetching pages: 100%|#############################################################################################################################| 1/1 [00:00<00:00, 22.57it/s]


In [78]:
# Transform the lore page with the transformer's default settings (no
# tags_to_extract given this time). This overwrites the table-only
# `docs_transformed` produced from the character list.
docs_transformed = bs_transformer.transform_documents(
    html
)
# Size of the extracted page content, as a quick sanity check.
len(docs_transformed[0].page_content)

102612

### Convert to Vector Store

In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
from langchain.document_loaders import AsyncHtmlLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS

In [37]:
# Fetch the lore page again, this time keeping the result in `documents`,
# which feeds both text splitters below.
# NOTE(review): this requests the same `search_url` that was already loaded
# into `html` in an earlier cell — confirm whether the second fetch is needed.
loader = AsyncHtmlLoader(search_url)
documents = loader.load()

Fetching pages: 100%|#############################################################################################################################| 1/1 [00:00<00:00, 11.29it/s]


In [38]:
# Sentence-transformer embedding model, placed on the device selected at the
# top of the notebook ("cuda" when available, otherwise "cpu").
embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2', model_kwargs= {'device': device})

### Using RecursiveCharacterTextSplitter

In [39]:
# Split the fetched documents into chunks of at most 500 units with an overlap
# of 50 between neighbouring chunks. `texts` is what gets indexed in the FAISS
# cell at the end of the notebook.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 50)
texts = text_splitter.split_documents(documents=documents)

In [41]:
# Number of chunks produced by the recursive character splitter.
len(texts)

960

### Using HTMLHeaderTextSplitter

In [117]:
# (HTML tag, label) pairs handed to HTMLHeaderTextSplitter below.
# Presumably the label becomes the metadata key of each resulting section —
# verify against the splitter's documentation.
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

In [118]:
# Split the page content of the first fetched document along the h1/h2/h3
# headers configured above.
text_splitter_html = HTMLHeaderTextSplitter(headers_to_split_on)
texts_html = text_splitter_html.split_text(documents[0].page_content)

In [119]:
# Number of header-delimited sections produced by the HTML splitter.
len(texts_html)

36

In [156]:
def isInfo(metadata):
    """Return True when `metadata` holds at least one entry, False when it is empty."""
    entry_count = len(metadata)
    return entry_count > 0

In [163]:
# Keep only sections that carry header metadata, looking at the first 24
# sections of the HTML split.
# NOTE(review): the cutoff of 24 is hard-coded; it presumably excludes trailing
# page boilerplate — confirm it still fits if the page layout changes.
refined_documents = [
    section for section in texts_html[:24] if isInfo(section.metadata)
]
len(refined_documents)

22

In [151]:
# Inspect one of the sections beyond the 24-section cutoff used for
# `refined_documents`.
print(texts_html[34].page_content)

Media Kit Contact


### Save into vectorstores/db_faiss

In [368]:
# Relative location where the FAISS index is written.
DB_FAISS_PATH = "vectorstores/db_faiss"

# Embed the chunks from the recursive character splitter and persist the index.
# NOTE(review): the index is built from `texts`, not from the header-based
# `refined_documents` prepared above — confirm this is the intended input.
db = FAISS.from_documents(texts, embeddings)
db.save_local(DB_FAISS_PATH)