In [1]:
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer
import re
import torch

In [2]:
# Run the embedding model on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [76]:
# Root of the wiki; later cells prepend it to the relative "/wiki/..." paths they extract.
base_url = "https://genshin-impact.fandom.com"
# Build the character-list URL from `base_url` so the host is spelled out only once.
urls = [base_url + "/wiki/Character/List"]
# Download the page; `html` holds the loaded documents consumed by the transformer cell.
loader = AsyncHtmlLoader(urls)
html = loader.load()

Fetching pages: 100%|########################################| 1/1 [00:00<00:00, 31.98it/s]


In [68]:
# One transformer instance is reused by the later cells.
bs_transformer = BeautifulSoupTransformer()
# Restrict extraction to <table> elements of the downloaded page.
docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["table"])

In [69]:
# Collect every whitespace-separated token that mentions "wiki" and drop its first and last
# character (presumably the parentheses wrapped around each link target -- TODO confirm).
# dict.fromkeys de-duplicates while keeping document order, so the result is reproducible
# across runs (iterating a plain set is not).
tokens = docs_transformed[0].page_content.split()
wiki = list(dict.fromkeys(item[1:-1] for item in tokens if "wiki" in item))

# Take only the FIRST path that mentions Furina. Appending every match, as a running `+=`
# would, glues several paths together into an invalid URL when more than one token matches.
furina_path = next((path for path in wiki if "furina" in path.lower()), None)

# With no match the URL falls back to the bare `base_url`.
if furina_path is not None:
    search_url = base_url + furina_path + "/Lore"
else:
    search_url = base_url
search_url

'https://genshin-impact.fandom.com/wiki/Furina/Lore'

In [70]:
# Download the page at `search_url`; `load()` returns the fetched documents, kept in `html`
# for the transformer cell that follows.
loader = AsyncHtmlLoader(search_url)
html = loader.load()

Fetching pages: 100%|########################################| 1/1 [00:00<00:00, 32.35it/s]


In [71]:
# No `tags_to_extract` is passed here, so the transformer's default tag selection applies.
docs_transformed = bs_transformer.transform_documents(html)
# Size (in characters) of the extracted text of the first document.
first_doc = docs_transformed[0]
len(first_doc.page_content)

106778

### Convert to Vector Store

In [72]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
from langchain.document_loaders import AsyncHtmlLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS

In [73]:
# NOTE(review): this downloads `search_url` again with a fresh loader. The result is stored
# under `documents`, which is what the text-splitter cells below consume.
loader = AsyncHtmlLoader(search_url)
documents = loader.load()

Fetching pages: 100%|########################################| 1/1 [00:00<00:00, 33.22it/s]


In [74]:
# Sentence-embedding model used to vectorise the text chunks; it runs on `device`.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device},
)

### Using RecursiveCharacterTextSplitter

In [56]:
# Split the loaded documents into chunks of at most 500 characters, with 50 characters
# shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)

In [52]:
# Print the character count of every chunk, one per line, to inspect the size distribution.
for chunk in texts:
    print(len(chunk.page_content))

137
499
65
489
499
430
89
499
459
499
500
101
499
500
85
499
500
93
499
500
59
382
22
499
500
481
416
402
410
490
356
423
439
452
473
457
177
11
499
114
63
204
11
499
431
136
480
453
492
31
11
498
492
470
482
481
473
485
426
472
480
377
472
494
492
484
465
479
487
478
476
478
444
476
484
458
448
459
477
484
473
482
494
496
469
483
491
464
370
466
463
463
489
476
459
478
477
417
484
474
489
484
491
491
462
473
487
439
467
493
463
440
482
487
480
476
460
460
373
484
485
487
472
486
485
460
486
496
488
485
424
473
481
486
482
463
465
490
455
487
453
465
480
483
425
402
404
400
453
399
498
476
375
423
387
386
478
498
285
6
343
477
408
384
457
156
6
346
489
497
41
6
339
467
461
487
265
6
334
466
477
287
6
347
390
476
491
498
465
6
486
472
334
497
459
497
410
488
480
480
484
412
383
466
480
461
448
488
440
457
495
464
478
456
459
491
374
469
490
492
481
475
463
474
482
452
460
463
479
464
462
483
474
464
463
486
488
494
480
476
463
485
480
483
491
417
484
474
489
484
491
491
462
473
487
439


### Using HTMLHeaderTextSplitter

In [13]:
# (HTML tag, metadata key) pairs for the three heading levels the splitter should cut on.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in (1, 2, 3)]

In [14]:
# Split the raw HTML of the first loaded document into sections at the configured headings.
text_splitter_html = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
page_html = documents[0].page_content
texts_html = text_splitter_html.split_text(page_html)

In [15]:
# Number of sections produced by the header-based split.
len(texts_html)

38

In [12]:
def isInfo(metadata):
    """Return True when a section carries any header metadata.

    Args:
        metadata: Mapping of header labels to header text, e.g.
            ``{"Header 2": "Personality[]"}``. May be empty or ``None``.

    Returns:
        bool: ``False`` for an empty or missing mapping, ``True`` otherwise.
    """
    # Truthiness handles both the empty mapping and None; len(None) would raise TypeError.
    return bool(metadata)

In [17]:
# Keep only sections that have header metadata. Only the first 24 sections are examined --
# presumably the remainder is page boilerplate; TODO confirm the cut-off.
refined_documents = [section for section in texts_html[:24] if isInfo(section.metadata)]
len(refined_documents)

22

In [22]:
# Dump each kept section as "<metadata>, <text>" for a visual check.
for document in refined_documents:
    print(document.metadata, document.page_content, sep=", ")

{'Header 1': 'Furina/Lore'}, < Furina
{'Header 2': 'Personality[]'}, The absolute focus of the stage of judgment, until the final applause sounds.  
—In-game character attributes and profile page text  
Flamboyant and imprudent, Furina lives for the thrill of the courtroom, often speaking in a manner peppered with bravado and drama. She is impatient and has a childlike temper, and she will occasionally make judgments that she doesn't mean, which Neuvillette frequently has to control while conducting a trial to avoid complications. While she enjoys being in the spotlight, she only does so when it is focused at her positively, breaking down in complete shambles should something go out of plan and will try to save face at the first possible opportunity.  
It is later revealed that Furina was not the true Hydro Archon. Focalors, the successor of Egeria, had separated her divinity from her body and spirit; Furina was the Archon's body and spirit but had no powers. Focalors used Furina as a 

### Save into vectorstores/db_faiss

In [368]:
# Folder the FAISS index is written to.
DB_FAISS_PATH = "vectorstores/db_faiss"

# Embed the character-split chunks (`texts`) and persist the resulting index to disk.
db = FAISS.from_documents(documents=texts, embedding=embeddings)
db.save_local(folder_path=DB_FAISS_PATH)

In [23]:
from langchain.document_loaders import AsyncHtmlLoader
from langchain.text_splitter import HTMLHeaderTextSplitter

# (HTML tag, metadata key) pairs for the heading levels that `search` splits pages on.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in (1, 2, 3)]

def search(search_url):
    """Download wiki page(s) and return their header-tagged sections.

    Args:
        search_url: A single URL string, or a list of URLs, handed to
            ``AsyncHtmlLoader``.

    Returns:
        list: Sections produced by ``HTMLHeaderTextSplitter`` (split on
        ``headers_to_split_on``) whose metadata is non-empty according to
        ``isInfo``, in page order. Empty when nothing was loaded.
    """
    loader = AsyncHtmlLoader(search_url)
    documents = loader.load()
    text_splitter_html = HTMLHeaderTextSplitter(headers_to_split_on)

    refined_documents = []
    # Walk every loaded page rather than only documents[0]: a list of URLs is then fully
    # processed, and an empty load yields [] instead of raising IndexError.
    for document in documents:
        sections = text_splitter_html.split_text(document.page_content)
        # Sections without header metadata are dropped.
        refined_documents.extend(
            section for section in sections if isInfo(section.metadata)
        )
    return refined_documents

In [51]:
# Main character page plus its two sub-pages, all derived from the same base address.
search_url = "https://genshin-impact.fandom.com/wiki/Furina"
search_url_lore, search_url_companion = (
    search_url + suffix for suffix in ("/Lore", "/Companion")
)
urls = [search_url, search_url_lore, search_url_companion]

In [53]:
# Run `search` on each URL in order and flatten the per-page section lists into one list.
refined_documents = [section for url in urls for section in search(url)]
len(refined_documents)

Fetching pages: 100%|###################################| 1/1 [00:00<00:00, 22.14it/s]
Fetching pages: 100%|###################################| 1/1 [00:00<00:00, 33.00it/s]
Fetching pages: 100%|###################################| 1/1 [00:00<00:00, 34.58it/s]


90