In [165]:
import pandas as pd
import gradio as gr
import torch
import accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM
import os 
from mistralai import Mistral
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from vectordb import VectorDB
import faiss

In [143]:
from config import API_KEY

In [144]:
# Feature flag read by query_to_filters: when true, the query is sent to the Mistral chat API.
USE_MISTRAL = True

In [145]:
# Mistral API client authenticated with the key imported from config; query_to_filters calls it.
client = Mistral(api_key=API_KEY)


In [146]:
# Load the document metadata CSV into the working frame `df`.
# Later cells read its title, author, category, tags, content and created_date columns.
file_path = "document_metadata.csv"

df = pd.read_csv(file_path)

In [148]:
import json

# Same flag as in the earlier configuration cell, re-assigned here with the same value.
USE_MISTRAL = True 


def query_to_filters(query: str, max_new_tokens: int = 150):
    """
    Convert an English or Arabic query about documents into a structured filter dict.

    The query is placed in a few-shot prompt and, when ``USE_MISTRAL`` is true,
    sent to the Mistral chat API through the module-level ``client``. The span
    from the first ``{`` to the last ``}`` of the reply is parsed as JSON.

    Args:
        query: Free-text search request (English or Arabic).
        max_new_tokens: Kept for interface compatibility; it is not forwarded
            to the API call.

    Returns:
        A dict that always contains the keys ``author``, ``category``,
        ``year_min``, ``year_max``, ``tags`` and ``keywords``. Keys the model
        omitted are set to None. When the model is disabled, or its reply does
        not contain a JSON object, every one of these keys is None.

    Errors raised by the API call itself are not caught here.
    """

    prompt = f"""
Convert this query about documents into a JSON object with keys:
author, category, year_min, year_max, tags, keywords.
Use null if not specified. Respond ONLY with valid JSON.
- The 'keywords' field should always be in English (the language of the CSV),
even if the input query is in Arabic.
- If multiple tags are mentioned, return them as a list.
- If a date range is mentioned, fill year_min and year_max.
- If only a single year is mentioned, set both year_min and year_max to that year.
- If a field is not mentioned, set it to null.

English examples:
"Reports by John Smith" => {{"author":"John Smith","category":"Report","year_min":null,"year_max":null,"tags":null,"keywords":null}}
"Documents about financial performance in 2023" => {{"author":null,"category":null,"year_min":2023,"year_max":2023,"tags":null,"keywords":"financial performance"}}
"Policies by HR with tag onboarding between 2021 and 2022" => {{"author":"HR","category":"Policy","year_min":2021,"year_max":2022,"tags":["onboarding"],"keywords":null}}
"Documents tagged marketing and sales in 2023" => {{"author":null,"category":null,"year_min":2023,"year_max":2023,"tags":["marketing","sales"],"keywords":null}}

Arabic examples:
"تقارير من جون سميث" => {{"author":"John Smith","category":"Report","year_min":null,"year_max":null,"tags":null,"keywords":null}}
"مستندات عن الأداء المالي في ٢٠٢٣" => {{"author":null,"category":null,"year_min":2023,"year_max":2023,"tags":null,"keywords":"financial performance"}}
"سياسات من قسم الموارد البشرية مع علامات onboarding بين 2021 و 2022" => {{"author":"HR","category":"Policy","year_min":2021,"year_max":2022,"tags":["onboarding"],"keywords":null}}
"مستندات عن التسويق والمبيعات في 2023" => {{"author":null,"category":null,"year_min":2023,"year_max":2023,"tags":["marketing","sales"],"keywords":null}}

Query: "{query}"
JSON:
"""

    # The keys the prompt asks for; every returned dict carries all of them.
    expected_keys = ("author", "category", "year_min", "year_max", "tags", "keywords")

    # Stays None when the model is disabled, so the fallback below is taken
    # instead of hitting an unbound name.
    raw_output = None
    if USE_MISTRAL:
        response = client.chat.complete(
            model="ministral-3b-latest",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        raw_output = response.choices[0].message.content

    filters = None
    if isinstance(raw_output, str):
        # Tolerate text around the JSON by cutting from the first "{" to the last "}".
        start = raw_output.find("{")
        end = raw_output.rfind("}") + 1
        if 0 <= start < end:
            try:
                parsed = json.loads(raw_output[start:end])
            except ValueError:  # json.JSONDecodeError is a ValueError
                parsed = None
            if isinstance(parsed, dict):
                filters = parsed

    if filters is None:
        print("!!! Fallback triggered: returning empty filters")
        print("Raw model response:", raw_output)
        filters = {}

    # Back-fill whatever the model left out so both paths return the same keys.
    for key in expected_keys:
        filters.setdefault(key, None)

    return filters


In [149]:
import arabic_reshaper
from bidi.algorithm import get_display
import re

def normalize_arabic(text):
    """Fold Arabic letter variants, then reshape and bidi-reorder the text.

    Alef variants (أ, إ, آ) become bare alef (ا) and alef maqsura (ى) becomes
    yeh (ي); surrounding whitespace is stripped. The folded text is then passed
    through ``arabic_reshaper.reshape`` and ``get_display``.

    Returns "" for any non-string input.
    """
    if isinstance(text, str):
        letter_folding = str.maketrans({"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي"})
        folded = text.translate(letter_folding).strip()
        return get_display(arabic_reshaper.reshape(folded))
    return ""


In [150]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by name and write a copy of it under models/.
# This `model` global is what compute_embeddings and semantic_search_faiss encode with.
model = SentenceTransformer('msmarco-MiniLM-L6-cos-v5')
model.save('models/msmarco-MiniLM-L6-cos-v5')

In [None]:
# def compute_embeddings(df, embed_client):
#     embeddings = []
#     for i, row in df.iterrows():
#         text = f"{row['title']} {row['tags']} {row['content']}"
#         response = embed_client.embeddings.create(
#             model="mistral-embed",
#             inputs=[text]
#         )
#         embeddings.append(response.data[0].embedding)
#     df["embedding"] = embeddings
#     return df


In [151]:
def compute_embeddings(df, model):
    """Embed each row's title, tags and content and store the vectors on ``df``.

    Args:
        df: DataFrame with ``title``, ``tags`` and ``content`` columns.
        model: Object exposing ``encode(sentences, convert_to_numpy=True)``
            that returns one vector per input sentence (e.g. the
            SentenceTransformer loaded in this notebook).

    Returns:
        The same ``df`` object, with an added or overwritten ``embedding``
        column holding one vector per row.

    Missing cells (None/NaN) are skipped rather than embedded as the literal
    text "nan". All rows are encoded in one batched call; an empty frame gets
    an empty ``embedding`` column without calling the model.
    """
    def _is_missing(cell):
        # pd.isna is only meaningful on scalars here; list-like cells are kept.
        return pd.api.types.is_scalar(cell) and pd.isna(cell)

    column_values = [df[name].tolist() for name in ("title", "tags", "content")]
    texts = [
        " ".join(str(cell) for cell in cells if not _is_missing(cell))
        for cells in zip(*column_values)
    ]

    if texts:
        # One batched call instead of one encode() per row.
        vectors = model.encode(texts, convert_to_numpy=True)
        embeddings = list(vectors)  # one 1-D vector per row
    else:
        embeddings = []

    df["embedding"] = embeddings
    return df

In [152]:
# Add the "embedding" column to df using the sentence-embedding model loaded above.
df = compute_embeddings(df, model)

In [153]:
# Stack the per-row vectors from df["embedding"] into a single float32 array.
embedding_matrix = np.array(df["embedding"].to_list()).astype("float32")

# Sanity check of the stacked shape (see the recorded output below).
print(embedding_matrix.shape)

(75, 384)


In [159]:
# Size the index from the first stored vector. Positional .iloc does not depend on
# a row labelled 0 existing, and len() replaces the direct __len__() call.
vdb = VectorDB(dim=len(df["embedding"].iloc[0]),
               index_path="vector_index.faiss",
               meta_path="vector_metadata.csv")

In [164]:
# Build the vector index from the "embedding" column of df.
vdb.build(df, embedding_col="embedding")

# Persist the index; the recorded output names vector_index.faiss + vector_metadata.csv.
vdb.save()

VectorDB saved: vector_index.faiss + vector_metadata.csv


In [136]:
def semantic_search(query, embed_client, df, index, top_k=5):
    """Embed ``query`` with ``mistral-embed`` and look it up in ``index``.

    Args:
        query: Text to search for.
        embed_client: Client exposing ``embeddings.create(model=..., inputs=[...])``
            whose response carries the vector at ``data[0].embedding``.
        df: DataFrame whose row positions correspond to the ids stored in ``index``.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)`` with one row per query vector
            (presumably a FAISS index built from the same embedding model;
            confirm the dimensions match).
        top_k: Maximum number of hits requested from the index.

    Returns:
        DataFrame with columns title, author, category, tags and similarity,
        one row per hit, in the order the index returned them.
    """
    response = embed_client.embeddings.create(
        model="mistral-embed",
        inputs=[query]
    )
    query_vec = np.asarray(response.data[0].embedding, dtype="float32").reshape(1, -1)

    distances, indices = index.search(query_vec, top_k)

    hit_positions = np.asarray(indices[0])
    hit_distances = np.asarray(distances[0])

    # FAISS pads with label -1 when it finds fewer than top_k neighbours;
    # df.iloc[-1] would silently return the last row as if it were a hit.
    found = hit_positions >= 0
    hit_positions = hit_positions[found]
    hit_distances = hit_distances[found]

    results = df.iloc[hit_positions].copy()
    # Maps a distance of 0 to similarity 1, decreasing as distance grows
    # (assumes the index returns non-negative distances; confirm the metric).
    results["similarity"] = 1 / (1 + hit_distances)

    return results[["title", "author", "category", "tags", "similarity"]]


In [None]:
def semantic_search_faiss(query, vdb, top_k=5):
    """Encode ``query`` with the notebook's ``model`` and search ``vdb`` for it.

    Args:
        query: Text to search for.
        vdb: Vector store exposing ``search(vectors, top_k=...)``; its result
            is indexed with a list of column names, so it is expected to be
            DataFrame-like.
        top_k: Number of hits to request from ``vdb``.

    Returns:
        The title, author, category, tags and similarity columns of the
        search result.
    """
    output_columns = ["title", "author", "category", "tags", "similarity"]
    # The query goes in as a one-element batch and is cast to float32 before the lookup.
    encoded = model.encode([query], convert_to_numpy=True)
    hits = vdb.search(encoded.astype("float32"), top_k=top_k)
    return hits[output_columns]

In [162]:
def _coerce_year(value):
    """Return ``value`` as a non-zero int year, or None when missing or not numeric."""
    if value is None or isinstance(value, bool):
        return None
    try:
        year = int(float(value))
    except (TypeError, ValueError, OverflowError):
        return None
    return year if year else None


def _as_terms(value):
    """Normalise a filter value (None, str, or list/tuple/set) to non-empty strings."""
    if value is None:
        return []
    if isinstance(value, (list, tuple, set)):
        candidates = [item for item in value if item is not None]
    else:
        candidates = [value]
    stripped = [str(item).strip() for item in candidates]
    return [term for term in stripped if term]


def search_csv(query):
    """Filter the global ``df`` using filters that ``query_to_filters`` extracts.

    Filters applied, each only when present:
      * author, category: case-insensitive literal substring match.
      * year_min, year_max: inclusive bounds on the year of ``created_date``;
        rows whose date cannot be parsed are excluded while a bound is active.
      * tags: row matches if its ``tags`` text contains any requested tag.
      * keywords: row matches if its normalised ``content`` contains any of
        the whitespace-separated keyword words.

    Text coming from the model is always matched literally (escaped), never as
    a regular expression.

    Args:
        query: Free-text search request passed to ``query_to_filters``.

    Returns:
        A filtered copy of ``df`` with the same columns as ``df``.
    """
    filters = query_to_filters(query)
    print("Structured query:", filters)

    results = df.copy()

    for column in ("author", "category"):
        wanted = filters.get(column)
        if wanted:
            # regex=False: names such as "C++" or "(HR" must not be parsed as patterns.
            results = results[
                results[column].str.contains(str(wanted), case=False, na=False, regex=False)
            ]

    # The model may return years as strings; coerce before comparing.
    year_min = _coerce_year(filters.get("year_min"))
    year_max = _coerce_year(filters.get("year_max"))
    if year_min is not None or year_max is not None:
        # Parse once; unparseable dates become NaT and fall outside any range.
        years = pd.to_datetime(results["created_date"], errors="coerce").dt.year
        in_range = years.notna()
        if year_min is not None:
            in_range &= years >= year_min
        if year_max is not None:
            in_range &= years <= year_max
        results = results[in_range]

    # A single tag may arrive as a bare string; joining it directly would
    # produce a per-character alternation that matches almost every row.
    tag_terms = _as_terms(filters.get("tags"))
    if tag_terms:
        tag_pattern = "|".join(re.escape(tag) for tag in tag_terms)
        results = results[results["tags"].str.contains(tag_pattern, case=False, na=False, regex=True)]

    keyword_words = [
        word
        for term in _as_terms(filters.get("keywords"))
        for word in re.split(r"\s+", term)
        if word
    ]
    # Drop words that normalise to "" so the pattern has no empty alternative.
    words_normalized = [w for w in (normalize_arabic(word) for word in keyword_words) if w]
    if words_normalized:
        pattern = "|".join(re.escape(w) for w in words_normalized)
        # Kept as a local Series so no helper column leaks into the returned table.
        content_normalized = results["content"].apply(normalize_arabic)
        results = results[content_normalized.str.contains(pattern, case=False, na=False, regex=True)]

    return results


In [166]:
def semantic_search_gradio(query):
    """Gradio callback: run a top-5 semantic search for ``query`` on the global ``vdb``."""
    top_hits = semantic_search_faiss(query, vdb=vdb, top_k=5)
    return top_hits

In [None]:
# Two-tab Gradio UI: one tab for structured search (search_csv) and one for
# semantic search (semantic_search_gradio).
with gr.Blocks() as iface:
    gr.Markdown("# Arabic Contract Search")
    gr.Markdown("Choose structured or semantic search")
    
    with gr.Tab("Structured Search"):
        query_input = gr.Textbox(label="Enter your Arabic query")
        output_table = gr.Dataframe(label="Results")
        search_btn = gr.Button("Search (Structured)")
        # Clicking sends the textbox value to search_csv and shows the returned frame.
        search_btn.click(search_csv, inputs=query_input, outputs=output_table)

    with gr.Tab("Semantic Search"):
        sem_query_input = gr.Textbox(label="Enter your query for semantic search")
        sem_output_table = gr.Dataframe(label="Results")
        sem_search_btn = gr.Button("Search (Semantic)")

        # Clicking sends the textbox value to semantic_search_gradio (top-5 vector search).
        sem_search_btn.click(semantic_search_gradio, inputs=sem_query_input, outputs=sem_output_table)


# Start the app; the recorded output below shows the local URL it was served on.
iface.launch()

* Running on local URL:  http://127.0.0.1:7872
* To create a public link, set `share=True` in `launch()`.


