In [1]:
import pandas as pd
import gradio as gr
import torch
import accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM
import os 
from mistralai import Mistral
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import faiss
import weaviate
from weaviatedb import WeaviateDB
import tqdm
from DMSIndexer import DMSIndexer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Weaviate endpoint. Defaults to the local instance; set the WEAVIATE_URL
# environment variable to point the notebook at a different server without
# editing this cell.
WEAVIATE_URL = os.environ.get("WEAVIATE_URL", "http://localhost:8080")
weaviate_client = weaviate.Client(url=WEAVIATE_URL)

In [3]:
from config import API_KEY

In [4]:
# Feature flag read by query_to_filters: when True, the query is sent to the
# Mistral chat API to extract structured filters.
USE_MISTRAL = True

In [5]:
# Shared Mistral client, authenticated with the API_KEY imported from the
# local config module (keeps the secret out of the notebook itself).
mistral_client = Mistral(api_key=API_KEY)

In [6]:
import json

# NOTE(review): repeats the USE_MISTRAL assignment made in an earlier cell
# (same value); query_to_filters below reads this flag at call time.
USE_MISTRAL = True

# Every field the extraction prompt asks for. Each one is guaranteed to be a
# key of the dict returned by query_to_filters.
_FILTER_FIELDS = (
    "document_id", "title", "author", "repo_name", "folder_name",
    "tags", "category", "content", "files_content",
    "created_date", "last_modified", "keywords",
)


def query_to_filters(query: str):
    """
    Extract search filters from a natural language query (English or Arabic).

    The query is embedded in a prompt that asks the model for a JSON object.
    The text between the first "{" and the last "}" of the reply is parsed
    as JSON, and any field from _FILTER_FIELDS missing in it is set to None.

    Args:
        query: Free-text search request.

    Returns:
        dict containing at least every key in _FILTER_FIELDS. All of them
        are None when USE_MISTRAL is off, when the reply is not text, or
        when no JSON object can be parsed from it (a fallback notice and the
        raw reply are printed in those cases).

    Raises:
        Whatever mistral_client.chat.complete raises; API errors are not
        swallowed here.
    """

    prompt = f"""
Convert this query into a JSON object containing relevant search fields.
Return only valid JSON, use null for fields not mentioned.

Fields to extract:
- document_id, title, author, repo_name, folder_name
- tags, category, content, files_content
- created_date, last_modified, keywords

'keywords' must always be in English, even if the input is in Arabic.
Query: "{query}"
JSON:
"""

    # Bound up front so the fallback path below can always report it, even
    # when no backend is enabled.
    raw_output = None
    if USE_MISTRAL:
        response = mistral_client.chat.complete(
            model="ministral-3b-latest",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        raw_output = response.choices[0].message.content

    filters = None
    if isinstance(raw_output, str):
        # The model may wrap the JSON in prose or code fences; keep only the
        # outermost {...} span.
        start = raw_output.find("{")
        end = raw_output.rfind("}") + 1
        try:
            parsed = json.loads(raw_output[start:end])
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass.
            parsed = None
        if isinstance(parsed, dict):
            filters = parsed

    if filters is None:
        print("!!! Fallback triggered: returning empty filters")
        print("Raw model response:", raw_output)
        return dict.fromkeys(_FILTER_FIELDS)

    # Fill in any field the model left out so callers can index freely.
    for key in _FILTER_FIELDS:
        filters.setdefault(key, None)

    return filters


In [7]:
import arabic_reshaper
from bidi.algorithm import get_display
import re

def normalize_arabic(text):
    """Fold common Arabic letter variants and return the display-ordered text.

    Alef with hamza above/below and alef with madda are replaced by bare
    alef, and alef maqsura by ya. The result is stripped of surrounding
    whitespace, passed through arabic_reshaper.reshape, and finally through
    bidi's get_display.

    Args:
        text: Value to normalize.

    Returns:
        The processed string, or "" when ``text`` is not a str.
    """
    if not isinstance(text, str):
        return ""

    # (variant, canonical) pairs, applied in order.
    substitutions = (("أ", "ا"), ("إ", "ا"), ("آ", "ا"), ("ى", "ي"))
    for variant, canonical in substitutions:
        text = text.replace(variant, canonical)

    return get_display(arabic_reshaper.reshape(text.strip()))


In [8]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for indexing and search, and the local
# directory a copy of it is written to.
EMBEDDING_MODEL_NAME = 'msmarco-MiniLM-L6-cos-v5'
EMBEDDING_MODEL_DIR = 'models/msmarco-MiniLM-L6-cos-v5'

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
model.save(EMBEDDING_MODEL_DIR)

In [9]:
# Build the search index end to end. The calls below are run in this fixed
# order; DMSIndexer is project code, so step descriptions marked "presumably"
# are inferred from method names and should be confirmed against it.
indexer = DMSIndexer(weaviate_client, model)

# Load the DMS tree from the JSON export (available afterwards as indexer.dms_tree).
indexer.import_from_json("dms_tree.json")

# presumably flattens the nested repo/folder/document tree into
# indexer.flattened_df, which gradio_search later reads — TODO confirm
indexer.flatten_dms()

# Prints one "Computed embedding for row ..." line per row (length 384 in
# the recorded output).
indexer.compute_embeddings()

# presumably writes the rows and their vectors to Weaviate — TODO confirm
indexer.insert_documents()

# Smoke test: a single query against the freshly built index.
results = indexer.search("Damascus", top_k=10)
print(results)


Computed embedding for row 0, document_id=DOC001, length=384
Computed embedding for row 1, document_id=DOC002, length=384
Computed embedding for row 2, document_id=DOC001, length=384
Computed embedding for row 3, document_id=DOC002, length=384
Computed embedding for row 4, document_id=DOC003, length=384
Computed embedding for row 5, document_id=DOC004, length=384
Computed embedding for row 6, document_id=DOC001, length=384
Computed embedding for row 7, document_id=DOC002, length=384
Computed embedding for row 8, document_id=DOC001, length=384
Computed embedding for row 9, document_id=DOC002, length=384
Computed embedding for row 10, document_id=DOC003, length=384
Computed embedding for row 11, document_id=DOC004, length=384
Computed embedding for row 12, document_id=DOC001, length=384
Computed embedding for row 13, document_id=DOC002, length=384
Computed embedding for row 14, document_id=DOC001, length=384
Computed embedding for row 15, document_id=DOC002, length=384
Computed embedding

In [10]:
# Debug cell: load the JSON export again and peek at the parsed tree.
# NOTE(review): this repeats the import_from_json call from the indexing
# cell; whether it resets other indexer state is not visible here — confirm.
indexer.import_from_json("dms_tree.json")
print(type(indexer.dms_tree))
# Only the first three top-level entries, to keep the output short.
print(indexer.dms_tree[:3]) 


<class 'list'>
[{'repo_id': 'R001', 'repo_name': 'CarsRepo', 'folders': [{'folder_id': 'F001', 'folder_name': '2023', 'folders': [{'folder_id': 'F002', 'folder_name': 'Damascus', 'documents': [{'document_id': 'DOC001', 'title': 'Accident Report March', 'author': 'John Smith', 'created_date': '2023-03-15', 'last_modified': '2023-03-20', 'tags': ['accident', 'damascus', 'car'], 'category': 'Report', 'files': [{'file_id': 'FILE001', 'file_name': 'accident_summary.txt', 'file_type': 'text', 'content': 'Car X had an accident in Damascus on March 14, 2023.'}, {'file_id': 'FILE002', 'file_name': 'photo1.jpg', 'file_type': 'image', 'content': None}]}, {'document_id': 'DOC002', 'title': 'Insurance Claim', 'author': 'Jane Doe', 'created_date': '2023-02-10', 'last_modified': '2023-02-15', 'tags': ['insurance', 'damascus'], 'category': 'Claim', 'files': [{'file_id': 'FILE003', 'file_name': 'claim_form.pdf', 'file_type': 'pdf', 'content': None}]}]}, {'folder_id': 'F003', 'folder_name': 'Beirut', 'd

In [11]:
# Debug cell: dump the full Weaviate schema (all classes).
print(weaviate_client.schema.get())

# Fetch title/content/tags from the "Document" class with no filter or
# vector search, to check what was stored.
result = weaviate_client.query.get("Document", ["title", "content", "tags"]).do()
print(result)

{'classes': [{'class': 'Document', 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2}, 'cleanupIntervalSeconds': 60, 'stopwords': {'additions': None, 'preset': 'en', 'removals': None}, 'usingBlockMaxWAND': True}, 'multiTenancyConfig': {'autoTenantActivation': False, 'autoTenantCreation': False, 'enabled': False}, 'properties': [{'dataType': ['text'], 'indexFilterable': True, 'indexRangeFilters': False, 'indexSearchable': True, 'name': 'document_id', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexRangeFilters': False, 'indexSearchable': True, 'name': 'repo_id', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexRangeFilters': False, 'indexSearchable': True, 'name': 'repo_name', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexRangeFilters': False, 'indexSearchable': True, 'name': 'folder_id', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexRangeFilters': False, 'i

In [12]:
def gradio_search(query, top_k=5):
    """Run a search through the indexer and render the hits as Markdown.

    Args:
        query: Search text, forwarded to indexer.search unchanged.
        top_k: Maximum number of hits. Coerced to int before the call; this
            is a no-op for ints and guards against a float arriving from
            the slider widget.

    Returns:
        "No results found." when the search yields nothing. Otherwise one
        "### Result N" section per hit listing document id, title, repo,
        folder and distance. Title/repo/folder come from the first row of
        indexer.flattened_df with a matching document_id; if there is no
        such row they are read from the hit itself instead of raising.
    """
    hits = indexer.search(query, top_k=int(top_k), certainty=0.01)

    if not hits:
        return "No results found."

    catalog = indexer.flattened_df
    sections = []
    for i, hit in enumerate(hits, 1):
        doc_id = hit.get("document_id")
        distance = hit.get("distance")

        # document_id can repeat in flattened_df; as before, the first
        # matching row supplies the metadata.
        records = (
            catalog.loc[catalog['document_id'] == doc_id]
            .head(1)
            .to_dict(orient="records")
        )
        # An index entry without a catalog row must not crash the UI.
        meta_row = records[0] if records else hit

        sections.append(
            f"### Result {i}\n"
            f"- **Document ID**: {doc_id}\n"
            f"- **Title**: {meta_row.get('title')}\n"
            f"- **Repo**: {meta_row.get('repo_name')}\n"
            f"- **Folder**: {meta_row.get('folder_name')}\n"
            f"- **Distance**: {distance}\n"
            "\n"
        )

    return "".join(sections)

# Input widgets, in the positional order gradio_search expects them:
# the query text first, then the number of results to return.
query_box = gr.Textbox(label="Search Query")
top_k_slider = gr.Slider(minimum=1, maximum=20, step=1, label="Top K Results", value=5)

# Web UI around gradio_search; its return value is rendered as Markdown.
iface = gr.Interface(
    fn=gradio_search,
    inputs=[query_box, top_k_slider],
    outputs="markdown",
    title="DMS Search",
    description="Search through repositories, folders, documents, and files.",
)

# Start the local server (the recorded run served on http://127.0.0.1:7860).
iface.launch()

  from websockets.server import WebSocketServerProtocol


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




In [13]:
# Debug cell: print the schema definition of the "Document" class only.
print(weaviate_client.schema.get("Document"))


{'class': 'Document', 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2}, 'cleanupIntervalSeconds': 60, 'stopwords': {'additions': None, 'preset': 'en', 'removals': None}, 'usingBlockMaxWAND': True}, 'multiTenancyConfig': {'autoTenantActivation': False, 'autoTenantCreation': False, 'enabled': False}, 'properties': [{'dataType': ['text'], 'indexFilterable': True, 'indexRangeFilters': False, 'indexSearchable': True, 'name': 'document_id', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexRangeFilters': False, 'indexSearchable': True, 'name': 'repo_id', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexRangeFilters': False, 'indexSearchable': True, 'name': 'repo_name', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexRangeFilters': False, 'indexSearchable': True, 'name': 'folder_id', 'tokenization': 'word'}, {'dataType': ['text'], 'indexFilterable': True, 'indexRangeFilters': False, 'indexSearchabl