Credits dataset : https://github.com/CorwinFr/X3GPT_V2

## CONVERT TO MARKDOWN

In [None]:
!pip install -qU markitdown

In [59]:
!unzip -u '/content/knowledge-base-mini.zip'

Archive:  /content/knowledge-base-mini.zip
   creating: knowledge-base-mini/
  inflating: knowledge-base-mini/Sage X3 L4G.docx  
  inflating: knowledge-base-mini/X3_4GL_IMPORTANT_TIPS.pdf  
  inflating: knowledge-base-mini/X3_ATB0_Columns.txt  
  inflating: knowledge-base-mini/X3_ATB0_Index.txt  
  inflating: knowledge-base-mini/X3_ATB0_Tables.txt  
  inflating: knowledge-base-mini/X3_L4G_SHORT_DATABASE_MODEL.txt  
  inflating: knowledge-base-mini/X3_RELATIONSHIPS.txt  


In [6]:
# IMPORTS
from pathlib import Path
from markitdown import MarkItDown



In [60]:
FILES_PATH = Path(r"knowledge-base-mini/")
FILES_TYPE = ["*.pdf", "*.docx", "*.txt"]

files = list(Path(FILES_PATH).rglob("*.txt")) + list(Path(FILES_PATH).rglob("*.docx")) + list(Path(FILES_PATH).rglob("*.pdf"))

for file in files:
  print(file)

knowledge-base-mini/X3_RELATIONSHIPS.txt
knowledge-base-mini/X3_ATB0_Tables.txt
knowledge-base-mini/X3_ATB0_Columns.txt
knowledge-base-mini/X3_L4G_SHORT_DATABASE_MODEL.txt
knowledge-base-mini/X3_ATB0_Index.txt
knowledge-base-mini/Sage X3 L4G.docx
knowledge-base-mini/X3_4GL_IMPORTANT_TIPS.pdf


In [61]:
MD_PATH = Path('/content/knowledge-base-mini/markdown/')
MD_PATH.mkdir() if not MD_PATH.exists() else None

markitdown = MarkItDown()

for file in files:
    md_path = MD_PATH / f"{file.stem}.md"
    file = rf"{file}"
    result = markitdown.convert(file).text_content
    md_path.write_text(result, encoding="utf-8")

## POPULATE VECTOR DB

In [63]:
!pip install -qU langchain langchain-community langchain-huggingface faiss-cpu sentence-transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.2/412.2 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [64]:
# IMPORTS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document



In [67]:
# FUNCTION TO POPULATE DATABASE
headers_to_split_on = [
    ("#", "h1"),
    ("##", "h2"),
    ("###", "h3")
]

def populate_faiss_vectordb(db: FAISS, file: Path):

  # load document and split in chunks
  doc = TextLoader(file).load()
  md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
  docs = md_splitter.split_text(doc[0].page_content)

  # add metadata
  for d in docs:
    d.metadata["source"] = file.name #doc[0].metadata["source"]

  # populate db
  db.add_documents(docs)

In [69]:
# INSTANCIATE DB
MD_PATH = Path("/content/knowledge-base-mini/markdown")
embedding_model = HuggingFaceEmbeddings(model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
db = FAISS.from_texts(" ", embedding_model)

# INGEST DOCS
i = 0
for file in MD_PATH.rglob("*.md"):
  i += 1
  print(str(i) + " : " + str(file))
  populate_faiss_vectordb(db, file)

# PERSIST VECTOR DB
db.save_local("/content/vectorDB")
!zip -r /content/vectorDB.zip /content/vectorDB
from google.colab import files
files.download('/content/vectorDB.zip')

1 : /content/knowledge-base-mini/markdown/X3_RELATIONSHIPS.md
2 : /content/knowledge-base-mini/markdown/X3_L4G_SHORT_DATABASE_MODEL.md
3 : /content/knowledge-base-mini/markdown/X3_4GL_IMPORTANT_TIPS.md
4 : /content/knowledge-base-mini/markdown/X3_ATB0_Tables.md
5 : /content/knowledge-base-mini/markdown/X3_ATB0_Index.md
6 : /content/knowledge-base-mini/markdown/Sage X3 L4G.md
7 : /content/knowledge-base-mini/markdown/X3_ATB0_Columns.md
  adding: content/vectorDB/ (stored 0%)
  adding: content/vectorDB/index.pkl (deflated 80%)
  adding: content/vectorDB/index.faiss (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>