In [1]:
import os

import accelerate
import gradio as gr
import pandas as pd
import torch
from mistralai import Mistral
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the Mistral API client. The key is read from the MISTRAL_API_KEY
# environment variable so that no credential is stored in the notebook source
# or its saved outputs; a missing variable fails fast with a KeyError.
# NOTE(review): the key that was previously embedded here should be revoked.
client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

In [3]:
# In-memory sample of contract records that the search functions filter.
# Each tuple is (company, amount, date as ISO "YYYY-MM-DD" text, details).
df = pd.DataFrame(
    data=[
        ("X", 1200, "2023-05-10", "Contract for marketing"),
        ("Y", 500, "2022-11-20", "Contract for supply"),
        ("X", 800, "2023-01-15", "Contract for consulting"),
        ("Z", 2500, "2023-06-01", "IT infrastructure project"),
        ("A", 1500, "2021-12-30", "Office renovation contract"),
        ("B", 3000, "2023-03-22", "Marketing campaign"),
        ("C", 750, "2022-08-10", "Supply of furniture"),
        ("X", 2000, "2023-07-15", "Consulting project"),
        ("Y", 1800, "2023-01-05", "Software license"),
        ("Z", 400, "2022-05-17", "Maintenance contract"),
        ("A", 2200, "2023-04-10", "Training program"),
        ("B", 950, "2021-11-01", "Marketing research"),
        ("C", 1300, "2023-02-25", "Office furniture supply"),
        ("X", 5000, "2023-08-01", "Major consulting project"),
        ("Y", 1250, "2022-09-30", "Software maintenance"),
    ],
    columns=["company", "amount", "date", "details"],
)


In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM

# model_path = "./bloomz-1b1-local"

# tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
# model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# device = torch.device("cpu")



In [None]:
# save_path = "./flan-t5-local"
# tokenizer.save_pretrained(save_path)
# model.save_pretrained(save_path)

In [6]:
import re, json
def clean_json_string(s: str) -> str:
    """
    Best-effort repair of a nearly-JSON object string produced by a model.

    Repairs applied, in order:
      1. Surrounding whitespace is stripped.
      2. A missing opening brace is added when the text mentions "company".
      3. A missing closing brace is added.
      4. Trailing commas directly before a closing brace are removed.
      5. Only if the text still does not parse as JSON: a stray double quote
         directly before a closing brace is removed, and that version is kept
         only when it parses.

    Step 5 is conditional because removing the quote unconditionally corrupts
    valid input whose last value is a string (e.g. {"company":"X"}).

    Args:
        s: Raw text expected to contain a single JSON object.

    Returns:
        The repaired string. It is not guaranteed to be valid JSON; callers
        should still guard json.loads.
    """

    def _parses(text: str) -> bool:
        # json.JSONDecodeError is a ValueError subclass.
        try:
            json.loads(text)
        except ValueError:
            return False
        return True

    s = s.strip()

    if not s.startswith("{") and "company" in s:
        s = "{" + s
    if not s.endswith("}"):
        s = s + "}"

    s = re.sub(r",\s*}", "}", s)
    if _parses(s):
        # Already valid: any quote before '}' is the legitimate end of a string.
        return s

    dequoted = re.sub(r"\"\s*}", "}", s)
    return dequoted if _parses(dequoted) else s


In [None]:
import json
from mistralai import Mistral

# Switch for the hosted Mistral backend; query_to_filters only calls the API
# when this is True.
USE_MISTRAL = True

if USE_MISTRAL:
    # Read the key from the environment instead of a literal: a placeholder
    # string here would replace any previously configured client with one
    # that cannot authenticate, and real keys must not live in the notebook.
    client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])


def query_to_filters(query: str, max_new_tokens: int = 150):
    """
    Convert an English or Arabic query about contracts into a structured filter dict.

    The query is embedded in a few-shot prompt and, when USE_MISTRAL is true,
    sent to the module-level Mistral `client`. The first '{' through the last
    '}' of the reply is parsed as JSON.

    Args:
        query: Free-text user query.
        max_new_tokens: Upper bound on generated tokens, forwarded to the API
            as `max_tokens`.

    Returns:
        dict with at least the keys company, amount_min, amount_max, year_min,
        year_max and keywords. Keys missing from the model reply are set to
        None. If the reply cannot be parsed (or no backend is enabled), every
        key is None.

    Raises:
        Whatever the API client raises on request failure; only parsing errors
        are turned into the empty-filter fallback.
    """
    filter_keys = ("company", "amount_min", "amount_max", "year_min", "year_max", "keywords")

    prompt = f"""
Convert this query about contracts into a JSON object with keys:
company, amount_min, amount_max, year_min, year_max, keywords.
Use null if not specified. Respond ONLY with valid JSON.
- The 'keywords' field should always be in English (the language of the CSV), 
even if the input query is in Arabic.

English examples:
"Company X contracts" => {{"company":"X","amount_min":null,"amount_max":null,"year_min":null,"year_max":null, "keywords":""}}
"Contracts about office renovation" => {{"company":null,"amount_min":null,"amount_max":null,"year_min":null,"year_max":null, "keywords":"office renovation"}}

Arabic examples:
"عقود من الشركة X" => {{"company":"X","amount_min":null,"amount_max":null,"year_min":null,"year_max":null, "keywords":""}}
"عقود عن تجديد المكتب" => {{"company":null,"amount_min":null,"amount_max":null,"year_min":null,"year_max":null, "keywords":"office renovation"}}

Query: "{query}"
JSON:
"""

    # Bind raw_output up front so the parse/fallback path below is well defined
    # even when no backend is enabled (otherwise the fallback itself would
    # raise NameError while printing the raw response).
    raw_output = ""
    if USE_MISTRAL:
        response = client.chat.complete(
            model="mistral-small-latest",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=max_new_tokens,
        )
        raw_output = response.choices[0].message.content

    try:
        # Tolerate text or code fences around the object by slicing from the
        # first '{' to the last '}'.
        start = raw_output.find("{")
        end = raw_output.rfind("}") + 1
        filters = json.loads(raw_output[start:end])

        for key in filter_keys:
            filters.setdefault(key, None)

    except Exception as exc:
        # Deliberate best-effort: an unusable reply means "no filters".
        print("!!! Fallback triggered: returning empty filters")
        print("Raw model response:", raw_output)
        print("Parse error:", repr(exc))
        filters = dict.fromkeys(filter_keys)

    return filters


In [8]:
import arabic_reshaper
from bidi.algorithm import get_display
import re

def normalize_arabic(text):
    """
    Normalize a string for comparison and return it after reshaping/bidi.

    Non-string input yields "". For strings, the alef variants (hamza above,
    hamza below, madda) are folded to bare alef and alef maqsura is folded to
    ya, surrounding whitespace is trimmed, and the result is passed through
    arabic_reshaper.reshape followed by bidi's get_display.

    NOTE(review): reshape/get_display look like display-oriented transforms;
    confirm they are wanted for text that is later used in substring matching.
    """
    if not isinstance(text, str):
        return ""

    # Single-character folds; none of the targets is itself a source, so one
    # translate pass is equivalent to applying the replacements one by one.
    letter_folds = str.maketrans({"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي"})
    folded = text.translate(letter_folds).strip()

    return get_display(arabic_reshaper.reshape(folded))


In [9]:
def _as_number(value):
    """Return value as a float, or None when it is missing or not numeric."""
    if value is None or isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # NaN compares unequal to itself; treat it as "no bound".
    return None if number != number else number


def search_csv(query):
    """
    Filter the module-level contracts frame `df` according to a free-text query.

    The query is converted to structured filters by query_to_filters, then
    every filter that is present is combined into one boolean mask:
      - company: exact match on the "company" column.
      - amount_min / amount_max: inclusive bounds on "amount".
      - year_min / year_max: inclusive bounds on the year of "date".
      - keywords: case-insensitive literal substring match on "details",
        with both sides passed through normalize_arabic.

    Numeric bounds are coerced with float() because the filters come from
    model output and may arrive as strings; unusable values are ignored.
    Bounds are tested with `is not None`, so a bound of 0 is honoured.
    Keywords are matched literally (regex=False) so characters such as
    '(' or '+' in model output cannot break or alter the search.

    Args:
        query: Free-text user query (English or Arabic).

    Returns:
        A new DataFrame with the matching rows and exactly the columns of `df`.
    """
    filters = query_to_filters(query)
    print("Structured query:", filters)

    mask = pd.Series(True, index=df.index)

    company = filters.get("company")
    if company:
        mask &= df["company"] == company

    amount_min = _as_number(filters.get("amount_min"))
    if amount_min is not None:
        mask &= df["amount"] >= amount_min

    amount_max = _as_number(filters.get("amount_max"))
    if amount_max is not None:
        mask &= df["amount"] <= amount_max

    year_min = _as_number(filters.get("year_min"))
    year_max = _as_number(filters.get("year_max"))
    if year_min is not None or year_max is not None:
        # Parse the date column once for both bounds.
        years = pd.to_datetime(df["date"]).dt.year
        if year_min is not None:
            mask &= years >= year_min
        if year_max is not None:
            mask &= years <= year_max

    keywords = filters.get("keywords")
    if isinstance(keywords, str) and keywords.strip():
        needle = normalize_arabic(keywords.strip())
        # Normalised text is kept in a temporary Series so the returned frame
        # has the same columns whether or not keywords were supplied.
        haystack = df["details"].apply(normalize_arabic)
        mask &= haystack.str.contains(needle, case=False, regex=False)

    return df[mask].copy()


In [None]:
# Widgets for the search UI: one free-text input, one tabular output.
query_box = gr.Textbox(label="Enter your Arabic query")
results_table = gr.Dataframe(label="Results")

# Wire search_csv to the widgets and start the local Gradio server.
iface = gr.Interface(
    title="Arabic Contract Search",
    description="Type any Arabic query about contracts. The model will understand and filter the CSV.",
    fn=search_csv,
    inputs=query_box,
    outputs=results_table,
)

iface.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Structured query: {'company': None, 'amount_min': None, 'amount_max': None, 'year_min': None, 'year_max': None, 'keywords': 'marketing'}
Structured query: {'company': None, 'amount_min': None, 'amount_max': None, 'year_min': None, 'year_max': None, 'keywords': ''}
