In [21]:
import os
import pandas as pd
import numpy as np
from pypdf import PdfReader , PdfWriter
#from pypdf import 

from transformers import AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer

from pathlib import Path


from dataclasses import dataclass

from docx import Document as DocxDocument
from openpyxl import load_workbook
import json
from langchain_core.documents import Document

from tqdm import tqdm
import logging

from typing import List



In [22]:
@dataclass(frozen=True)
class config:
    files_dir: Path
    output_dir: Path
    max_tokens : int
    token_overlap : int
    tokenizer_name : str
    embedding_model :str
    llm_model :str

In [23]:
Path("./processed/").mkdir(parents = True, exist_ok = True)

In [24]:
files_dir = Path(".")

CORPUS_PATH = Path("./processed/corpus.jsonl")
CHUNKS_PATH = Path("./processed/chunks.jsonl")
EMBEDDINGS_PATH = Path("./processed/embeddings.npy")
META_PATH = Path("./processed/metadata.jsonl")
FAISS_INDEX_PATH = Path("./processed/faiss.index")



In [29]:
logger = logging.getLogger(__name__)


def load_documents(files_dir:Path)-> list[Document]:
    files_dir = Path(files_dir)

    if not files_dir.exists():
        raise FileNotFoundError("Folder not found..")
    
    docs : List[Document] = []

    files_required = sorted(
        p for p in files_dir.rglob("*") if p.is_file() and p.suffix.lower() in {".pdf",".docx",".xlsx"}
    )


    for path in files_required:
        ext = path.suffix.lower()

        try:
            if ext ==".pdf":
                docs.extend(_load_pdf(path))
            elif ext == ".docx":
                docs.extend(_load_docx(path))
            elif ext == ".xlsx":
                docs.extend(_load_xlsx(path))

        except Exception as e:
            logger.exception("Not loading %s, %s", path,e)

            continue

    return docs

In [30]:
def _load_pdf(path: Path) -> List[Document]:
    
    out: List[Document] = []
    reader = PdfReader(str(path))

    for i, page in enumerate(reader.pages, start=1):
        text = (page.extract_text() or "").strip()
        if not text:
            continue

        out.append(
            Document(
                page_content=text,
                metadata={
                    "source": str(path),
                    "filename": path.name,
                    "filetype": "pdf",
                    "page": i,
                },
            )
        )
    return out


def _load_docx(path: Path) -> List[Document]:
    out: List[Document] = []
    doc = DocxDocument(str(path))

    # paragraphs
    for pi, para in enumerate(doc.paragraphs, start=1):
        text = (para.text or "").strip()
        if not text:
            continue
        out.append(
            Document(
                page_content=text,
                metadata={
                    "source": str(path),
                    "filename": path.name,
                    "filetype": "docx",
                    "block": f"paragraph:{pi}",
                },
            )
        )

    # tables
    for ti, table in enumerate(doc.tables, start=1):
        rows_text: List[str] = []
        for row in table.rows:
            cells = [ (cell.text or "").strip() for cell in row.cells ]
            # drop fully empty rows
            if any(cells):
                rows_text.append("\t".join(cells))

        table_text = "\n".join(rows_text).strip()
        if table_text:
            out.append(
                Document(
                    page_content=table_text,
                    metadata={
                        "source": str(path),
                        "filename": path.name,
                        "filetype": "docx",
                        "block": f"table:{ti}",
                    },
                )
            )

    return out


def _load_xlsx(path: Path) -> List[Document]:
    out: List[Document] = []

    wb = load_workbook(filename=str(path), read_only=True, data_only=True)

    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]

        lines: List[str] = []
        for row in ws.iter_rows(values_only=True):
            # Convert row to strings, drop empty cells
            row_vals = [("" if c is None else str(c)).strip() for c in row]

            # skip fully empty rows
            if not any(row_vals):
                continue

            lines.append("\t".join(row_vals))

        sheet_text = "\n".join(lines).strip()
        if sheet_text:
            out.append(
                Document(
                    page_content=sheet_text,
                    metadata={
                        "source": str(path),
                        "filename": path.name,
                        "filetype": "xlsx",
                        "sheet": sheet_name,
                    },
                )
            )

    return out


In [31]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)

raw_docs = load_documents(files_dir)
chunks = text_splitter.split_documents(raw_docs)

print(f"Loaded {len(raw_docs)} docs -> {len(chunks)} chunks")
print(chunks[0].metadata)
#print(chunks[0].page_content[:300])


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)


Loaded 137 docs -> 240 chunks
{'source': 'Attention_is_all_you_need (1) (3) (1).pdf', 'filename': 'Attention_is_all_you_need (1) (3) (1).pdf', 'filetype': 'pdf', 'page': 1}


In [39]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="chroma_db",
)
vectordb.persist()


  vectordb.persist()


In [40]:
# pip install sentence-transformers
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="chroma_db",
)
vectordb.persist()


In [None]:
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

results = retriever.invoke("What is the refund policy?")
for r in results:
    print(r.metadata)
    print(r.page_content[:200])
    print("-----")

{'block': 'paragraph:87', 'source': 'EU AI Act Doc (1) (3) (1).docx', 'filename': 'EU AI Act Doc (1) (3) (1).docx', 'filetype': 'docx'}
Downstream providers can lodge a complaint regarding the upstream providers infringement to the AI Office.
-----
{'block': 'paragraph:87', 'filename': 'EU AI Act Doc (1) (3) (1).docx', 'source': 'EU AI Act Doc (1) (3) (1).docx', 'filetype': 'docx'}
Downstream providers can lodge a complaint regarding the upstream providers infringement to the AI Office.
-----
{'block': 'paragraph:70', 'filetype': 'docx', 'source': 'EU AI Act Doc (1) (3) (1).docx', 'filename': 'EU AI Act Doc (1) (3) (1).docx'}
Establish a policy to respect the Copyright Directive.
-----
{'block': 'paragraph:70', 'source': 'EU AI Act Doc (1) (3) (1).docx', 'filename': 'EU AI Act Doc (1) (3) (1).docx', 'filetype': 'docx'}
Establish a policy to respect the Copyright Directive.
-----
{'filetype': 'docx', 'source': 'EU AI Act Doc (1) (3) (1).docx', 'filename': 'EU AI Act Doc (1) (3) (1).docx

ModuleNotFoundError: No module named 'langchain.chains'

[31mERROR: Could not find a version that satisfies the requirement langchain.chains (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for langchain.chains[0m[31m
[0m