In [1]:
from glob import glob
import re
from langchain.text_splitter import HTMLHeaderTextSplitter

In [2]:
# glob() gives no ordering guarantee (it depends on the file system), and later
# cells number the documents by their position in iteration order. Sort so the
# page -> id mapping is the same on every run and every machine.
all_html_files = sorted(glob('netflex_data/*.html'))
all_html_files[:5]

['netflex_data\\Accessibility_on_Netflix__Netflix_Help_Center.html',
 'netflex_data\\Accessing_and_updating_information_associated_with_your_account__Netflix_Help_Center.html',
 'netflex_data\\Account_Management,_Email_Communications_and_Privacy___Netflix_Help_Center.html',
 'netflex_data\\Ads_on_Netflix__Netflix_Help_Center.html',
 'netflex_data\\Arranger_-_Game_Support__Netflix_Help_Center.html']

In [3]:
# Number of HTML pages matched by the glob pattern.
len(all_html_files)

268

In [4]:
# (tag, label) pairs handed to HTMLHeaderTextSplitter: pages are split at <h1>
# and <h2>. Finer-grained entries for "p" ("Paragraph") and "ul"
# ("Unordered List") are intentionally left out.
headers_to_split_on = [("h1", "Heading 1"), ("h2", "Heading 2")]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [5]:
# Split every page into header sections and strip leaked CSS text from the first
# section. Pages that are empty or produce no sections are skipped, so positions
# in netflex_content do not necessarily line up with all_html_files.
# NOTE(review): both patterns consume a fixed number of lines after the match
# (2 and 4 respectively) - presumably sized to the CSS seen in these pages;
# confirm they cannot swallow article text.
css_label_rule = re.compile(r"label\[.*?\]:.*\n.*\n.*")
css_print_rule = re.compile(r"@media print.*\n.*\n.*\n.*\n.*")

netflex_content = []

for html_file in all_html_files:
    with open(html_file, encoding='utf-8') as html_handle:
        html_text = html_handle.read()

    # Nothing to split in an empty file.
    if not html_text:
        continue

    html_header_splits = html_splitter.split_text(html_text)
    if not html_header_splits:
        continue

    # Remove CSS rules if found; only the first section is cleaned.
    first_section = html_header_splits[0]
    without_label_rules = css_label_rule.sub("", first_section.page_content)
    first_section.page_content = css_print_rule.sub("", without_label_rules)

    netflex_content.append(html_header_splits)

In [6]:
# Preview the split sections of the first ten entries in netflex_content.
netflex_content[:10]

[[Document(metadata={}, page_content="A country must be selected to view content in this article.  \nAt Netflix, we want everyone to enjoy great stories, no matter their language, device, internet connection, or abilities. That’s why we offer a variety of accessibility features guided by international accessibility standards and relevant legal frameworks.  \nWe offer a number of accessibility features, including support for assistive technologies such as screen readers and listening systems, use of built-in features on devices like Apple and Android, and our own tools, including the ability to customize subtitles and closed captions on TV and adjust playback speed on mobile.  \nIn addition to regularly monitoring and improving our accessibility features, we ask for feedback from members and accessibility and disability advocacy groups. Scroll to the bottom of this page to share your feedback.  \nThe following accessibility features are available for people with hearing, sight, or physi

In [7]:
# One entry per page: the id is the page's position in netflex_content (as a
# string) and the document body is the text of the page's first section.
# NOTE(review): sections after the first one are not included - confirm that is
# intended.
docs = {
    "ids": [str(position) for position in range(len(netflex_content))],
    "documents": [sections[0].page_content for sections in netflex_content],
}


In [None]:
import chromadb

In [9]:
# Chroma client backed by the ./chroma_db directory.
client = chromadb.PersistentClient(path='./chroma_db')

In [10]:
# NOTE(review): `id` shadows the builtin and is not referenced in the cells
# visible here - confirm whether it is still needed.
id = 'a1fce6c3-b6db-4873-ac0c-8f99d4ea5741'
# Fetch the collection by name, creating it if it does not exist yet.
collection = client.get_or_create_collection(name='netflex_faqs')

In [11]:
# upsert rather than add: the ids are fixed position strings and the client is
# persistent, so on a re-run every id already exists in the collection. Depending
# on the Chroma version, add() then either rejects or ignores those ids, leaving
# previously stored text in place. upsert inserts new ids and overwrites
# existing ones, which keeps this cell safe under Restart & Run All.
# Note: ids that are no longer produced are not deleted by this call.
collection.upsert(
    ids=docs["ids"],
    documents=docs["documents"],
)

In [12]:
# Smoke-test retrieval: fetch the five closest documents for a sample question.
sample_question = "how many devices allowed per account?"
results = collection.query(query_texts=sample_question, n_results=5)
results

{'ids': [['157', '106', '154', '193', '86']],
 'embeddings': None,
 'documents': [['The page shows you details about signed-in devices that have been recently active on the account. You can also use it to sign out of these devices.  \nManage Access and Devices  \nDevices may take up to 48 hours to appear on this page, not all devices signed into the account will be shown, and complete information may not be shown for all devices. For example, the page won’t show devices that haven’t been active in the last 90 days, have only been used to play Netflix Games, or have only been used to access Tudum.  \nInformation Shown  \nDevice: The devices that are signed in to use Netflix. A " label shows which device you’re currently viewing the page from. Icons show the type of device -- TV, computer, phone, game console, etc.  \nCurrent Device"  \nProfiles: The profile with the most recent viewing activity on that device. We won’t show a profile if there hasn’t been recent viewing activity on the d