In [1]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Look the key up in the environment; the result is None when it is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [2]:
from Indox import IndoxRetrievalAugmentation

# Create the retrieval-augmentation object that the following cells use
# for configuration, initialization and chunking.
IRA = IndoxRetrievalAugmentation()

In [3]:
# Display the configuration held by the IRA instance before initializing it.
IRA.config

{'clustering': {'dim': 10, 'threshold': 0.1},
 'embedding_model': 'sbert',
 'postgres': {'conn_string': 'postgresql+psycopg2://postgres:xxx@localhost:port/db_name'},
 'prompts': {'document_relevancy_prompt': "You are a grader assessing relevance of a retrieved document to a user question. If the document contains keywords related to the user question, grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals.\nGive a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.\nProvide the binary score as a JSON with a single key 'score' and no preamble or explanation.\nHere is the retrieved document:\n{document}\nHere is the user question:\n{question}",
  'summary_model': {'content': 'You are a helpful assistant. Give a detailed summary of the documentation provided'}},
 'qa_model': {'name': 'mistral', 'temperature': 9e-05},
 'splitter': 'semantic-text-splitter',
 'summary_model': {'max_tokens': 100,

In [None]:
# Run the instance's initialization step.
# NOTE(review): presumably this sets up the components described in IRA.config
# (embedding model, splitter, QA model) - confirm against the Indox documentation.
IRA.initialize()

## chunking an unstructured/structured document without stopword removal
Set `remove_sword=False` to keep stop-words in the chunks; this works for any document, structured or unstructured.

In [4]:
# Page to chunk; it is handed to create_chunks through the file_path argument.
html = "https://www.python.org/"

# remove_sword=False: stop-word removal is switched off for this run.
chunks = IRA.create_chunks(
    file_path=html,
    unstructured=True,
    content_type="html",
    remove_sword=False,
)

Starting processing...


2024-05-07 13:55:55,442 - INFO - Reading document from string ...
2024-05-07 13:55:55,447 - INFO - Reading document ...


End Chunking process.


In [5]:
# Show the text content of every chunk produced by the previous cell.
[chunk.page_content for chunk in chunks]

['Notice: While JavaScript is not essential for this website, your interaction with the content will be limited. Please turn JavaScript on for the full experience.\n\nSkip to content\n\n▼ Close\n\nPython\n\nPSF\n\nDocs\n\nPyPI\n\nJobs\n\nCommunity\n\n▲ The Python Network\n\nDonate',
 '≡ Menu\n\nA A\n                                    \n                                        Smaller\n                                        Larger\n                                        Reset\n\nSocialize\n                                    \n                                        LinkedIn\n                                        Mastodon\n                                        Chat on IRC\n                                        Twitter',
 'About\n        \n            \n\n\n    \n        Applications\n    \n        Quotes\n    \n        Getting Started\n    \n        Help\n    \n        Python Brochure\n\nDownloads\n        \n            \n\n\n    \n        All releases\n    \n        Source code

## chunking an unstructured/structured document with stopword removal

In [6]:
# Chunk the same page again, this time with stop-word removal switched on.
# Note that this rebinds `chunks`, replacing the result of the earlier run.
chunks = IRA.create_chunks(
    file_path=html,
    unstructured=True,
    content_type="html",
    remove_sword=True,
)

Starting processing...


2024-05-07 13:56:55,040 - INFO - Reading document from string ...
2024-05-07 13:56:55,045 - INFO - Reading document ...


End Chunking process.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\

In [7]:
# Show the text content of every chunk after stop-word removal.
[chunk.page_content for chunk in chunks]

['Notice : JavaScript essential website , interaction content limited . Please turn JavaScript full experience . Skip content ▼ Close Python PSF Docs PyPI Jobs Community ▲ Python Network Donate',
 '≡ Menu Smaller Larger Reset Socialize LinkedIn Mastodon Chat IRC Twitter',
 'Applications Quotes Getting Started Help Python Brochure Downloads releases Source code Windows macOS Platforms License Alternative Implementations',
 "Documentation Docs Audio/Visual Talks Beginner 's Guide Developer 's Guide FAQ Non-English Docs PEP Index Python Books Python Essays",
 'Community Diversity Mailing Lists IRC Forums PSF Annual Impact Report Python Conferences Special Interest Groups Python Logo Python Wiki Code Conduct Community Awards Get Involved Shared Stories',
 'Success Stories Arts Business Education Engineering Government Scientific Software Development News Python News PSF Newsletter PSF News PyCon US News News Community',
 'Events Python Events User Group Events Python Events Archive User Gr