In [1]:
import pandas as pd
import numpy as np
import re
import faiss
import gradio as gr
import hf_xet
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("contracts.csv")

In [3]:
def normalize_arabic(text):
    text = str(text)
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ة', 'ه', text)
    text = re.sub(r'[ًٌٍَُِّْ]', '', text)  
    text = text.strip()
    return text

In [4]:
def extract_keywords(query):
    stopwords = ["من", "على", "في", "و", "إلى", "عن", "كل", "الشركة", "اعطيني", "اريد", "اعطني"]
    words = normalize_arabic(query).split()
    keywords = [w for w in words if w not in stopwords and len(w) > 2]
    return " ".join(keywords)

In [5]:
df["semantic_text"] = (
    df["company"].astype(str) + " " +
    df["details"].astype(str) + " " +
    df["amount"].astype(str) + " " +
    df["date"].astype(str)
)
df["semantic_text_norm"] = df["semantic_text"].apply(normalize_arabic)

In [6]:
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-arabic-7b-instruct")
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-arabic-7b-instruct")

OSError: tiiuae/falcon-arabic-7b-instruct is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [7]:
embeddings = model.encode(df["semantic_text_norm"].tolist(), convert_to_numpy=True)
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [8]:
def parse_query_simulated(query):
    filters = {}

    query_norm = normalize_arabic(query)

    company_match = re.search(r"الشركة\s+(\w+)", query_norm)
    if company_match:
        filters["company"] = company_match.group(1)

    amount_min = re.search(r"(فوق|أكبر من|أعلى من)\s*(\d+)", query_norm)
    amount_max = re.search(r"(تحت|أقل من|أدنى من)\s*(\d+)", query_norm)
    if amount_min:
        filters["amount_min"] = int(amount_min.group(2))
    if amount_max:
        filters["amount_max"] = int(amount_max.group(2))

    year_min = re.search(r"(من|منذ)\s*(\d{4})", query_norm)
    year_max = re.search(r"(حتى|إلى)\s*(\d{4})", query_norm)
    if year_min:
        filters["year_min"] = int(year_min.group(2))
    if year_max:
        filters["year_max"] = int(year_max.group(2))

    keywords = extract_keywords(query)
    filters["details_contains"] = keywords

    return filters

In [9]:
def fuzzy_filter(df, column, query, threshold=70):
    if not query:
        return df
    matches = df[column].apply(lambda x: fuzz.partial_ratio(normalize_arabic(str(x)), normalize_arabic(query)))
    return df[matches >= threshold]

In [10]:
def simulated_hybrid_search(query):
    filters = parse_query_simulated(query)
    filtered_df = df.copy()

    # Structured 
    if filters.get("company"):
        filtered_df = fuzzy_filter(filtered_df, "company", filters["company"], threshold=60)
    if filters.get("amount_min"):
        filtered_df = filtered_df[filtered_df["amount"] > filters["amount_min"]]
    if filters.get("amount_max"):
        filtered_df = filtered_df[filtered_df["amount"] < filters["amount_max"]]
    if filters.get("year_min"):
        filtered_df = filtered_df[pd.to_datetime(filtered_df["date"]).dt.year >= filters["year_min"]]
    if filters.get("year_max"):
        filtered_df = filtered_df[pd.to_datetime(filtered_df["date"]).dt.year <= filters["year_max"]]

    # Semantic
    if filters.get("details_contains"):
        query_embedding = model.encode([filters["details_contains"]], convert_to_numpy=True)
        distances, indices = index.search(query_embedding, k=min(5, len(df)))
        semantic_results = df.iloc[indices[0]]
        filtered_df = pd.merge(filtered_df, semantic_results, how="inner")

    return filtered_df

In [11]:
def parse_filters(query):
    filters = {}
    query_norm = normalize_arabic(query)

    company_match = re.search(r"الشركة\s+(\w+)", query_norm)
    if company_match:
        filters["company"] = company_match.group(1)

    amount_min = re.search(r"(فوق|أكبر من|أعلى من|أكثر من|أزيد من|على الاقل|لا يقل عن)\s*(\d+)", query_norm)
    amount_max = re.search(r"(تحت|أقل من|أدنى من|لا يزيد عن|اقل او يساوي)\s*(\d+)", query_norm)
    if amount_min:
        filters["amount_min"] = int(amount_min.group(2))
    if amount_max:
        filters["amount_max"] = int(amount_max.group(2))

    year_min = re.search(r"(من|منذ|ابتداءً من|بدءاً من|قل من|اعتبارا من|اعتبارا من تاريخ)\s*(\d{4})", query_norm)
    year_max = re.search(r"(حتى|إلى|إلى غاية|إلى و بما في ذلك|لغاية|حد)\s*(\d{4})", query_norm)
    if year_min:
        filters["year_min"] = int(year_min.group(2))
    if year_max:
        filters["year_max"] = int(year_max.group(2))

    # Range pattern: "بين X و Y"
    range_match = re.search(r"بين\s*(\d{4}|\d+)\s*و\s*(\d{4}|\d+)", query_norm)
    if range_match:
        start, end = range_match.groups()
        if len(start) == 4:
            filters["year_min"] = int(start)
            filters["year_max"] = int(end)
        else:
            filters["amount_min"] = int(start)
            filters["amount_max"] = int(end)

    return filters


In [12]:
def hybrid_search(query, top_k=10, boost_structured=True):
    query_norm = normalize_arabic(query)
    filters = parse_filters(query)

    query_embedding = model.encode([query_norm], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k=len(df))
    ranked_df = df.iloc[indices[0]].copy()
    scores = 1 / (1 + distances[0])  

    ranked_df["score"] = scores

    if boost_structured:
        boost = np.zeros(len(ranked_df))
        if filters.get("company"):
            boost += ranked_df["company"].apply(lambda x: fuzz.partial_ratio(normalize_arabic(x), filters["company"])/100)
        if filters.get("amount_min"):
            boost += (ranked_df["amount"] > filters["amount_min"]).astype(float) * 0.2
        if filters.get("amount_max"):
            boost += (ranked_df["amount"] < filters["amount_max"]).astype(float) * 0.2
        if filters.get("year_min"):
            boost += (pd.to_datetime(ranked_df["date"]).dt.year >= filters["year_min"]).astype(float) * 0.2
        if filters.get("year_max"):
            boost += (pd.to_datetime(ranked_df["date"]).dt.year <= filters["year_max"]).astype(float) * 0.2
        ranked_df["score"] += boost

    # Sort by final score
    ranked_df = ranked_df.sort_values(by="score", ascending=False)

    return ranked_df.head(top_k).drop(columns=["semantic_text", "semantic_text_norm", "score"])

In [13]:
def gradio_search(query):
    results_df = hybrid_search(query)
    return results_df

In [14]:
iface = gr.Interface(
    fn=simulated_hybrid_search,
    inputs=gr.Textbox(label="اكتب استفسارك هنا"),
    outputs=gr.Dataframe(label="النتائج"),
    title="بحث عقود عربي (محلي، مع تصحيح الأخطاء)",
    description="ابحث باستخدام اللغة الطبيعية، يشمل الشركات، القيم، السنوات، ونوع العقود. يدعم الأخطاء الإملائية."
)

iface.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


