In [5]:
# Import libraries
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tokenizers.decoders import WordPiece

# Primary tagger: the heBERT NER checkpoint, wrapped in a token-classification
# pipeline that merges word pieces into whole entity spans
# (aggregation_strategy="simple").
model_name = "avichr/heBERT_NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Second tagger (DictaBERT NER), loaded by hub name with its default tokenizer.
oracle = pipeline(
    "ner",
    model="dicta-il/dictabert-ner",
    aggregation_strategy="simple",
)
# Swap in a WordPiece decoder on the fast tokenizer backend.
# NOTE(review): presumably so aggregated words are decoded without word-piece
# artifacts -- confirm against the model card.
oracle.tokenizer.backend_tokenizer.decoder = WordPiece()

# Labelled utterances; later cells read the "transcript_hebrew" and "intent" columns.
df = pd.read_csv("/content/true_positive_label_no.csv")


Device set to use cuda:0
Device set to use cuda:0


In [6]:
# Canonical sentence template for each supported intent. Every {placeholder}
# names a slot that must be present in the intent's JSON before formatting.
# The English glosses are approximate translations for non-Hebrew readers.
intent_templates = {
    # "Can you find me a route {origin} {destination} by {mode}?"
    "transport_query": "את יכולה למצוא לי מסלול {origin} {destination} ב{mode}?",
    # "Can you set an alarm clock for {time} in the {period}?"
    "alarm_set": "את יכולה לכוון שעון מעורר לשעה {time} ב{period}?",
    # "Can you dial {contact_name}?"
    "call_contact": "את יכולה לחייג {contact_name}?",
    # "Can you send a message {message} {contact_name}?"
    "send_message": "את יכולה לשלוח הודעה {message} {contact_name}?",
    # "Can you create a calendar meeting for me on {date} at {time}?"
    "calendar_set": "את יכולה ליצור לי פגישה ביומן בתאריך {date} לשעה {time}?",
    # "I want to take a {type}"
    "camera_query": "אני רוצה לצלם {type}",
    # "Create me a list that will contain {items}"
    "lists_createoradd": "תיצרי לי רשימה שיהיה בה {items}",
    # "What is the weather {date} {location}?"
    "weather_query": "מה מזג האוויר {date} {location}?",
    # "I want to turn on {device}"
    "iot_wemo_on": "אני רוצה להדליק {device}",
    # "Can you check and tell me {search string}?"
    "query": "את יכולה לבדוק ולהגיד לי {search string}?",
}

# Slot names required by each intent, in the order they appear in its template.
intent_keys = dict(
    (name, re.findall(r"{(.*?)}", text))
    for name, text in intent_templates.items()
)

# Helper functions
def extract_entities(ner_result, label):
    """Collect the text of every NER span whose entity group contains ``label``.

    Args:
        ner_result: Iterable of dicts, each with at least the keys ``"word"``
            and ``"entity_group"``.
        label: Substring looked for inside each span's ``"entity_group"``
            (a containment test, not an equality test).

    Returns:
        List of the matching spans' ``"word"`` values, in input order.
    """
    matches = []
    for span in ner_result:
        if label in span["entity_group"]:
            matches.append(span["word"])
    return matches

def extract_entity(ner_result, label):
    """Return the text of the first NER span matching ``label``, or ``None``.

    Matching is delegated to ``extract_entities``; only the first hit is kept.
    """
    matches = extract_entities(ner_result, label)
    if not matches:
        return None
    return matches[0]


In [8]:
from google.colab import files

def extract_destination_fallback(transcript):
    """Heuristically pull a destination phrase out of a Hebrew transcript.

    The first whitespace-separated word that starts with "ל", is longer than
    two characters, and is neither a known motion verb nor a stopword is taken
    as the start of the destination. Every following word is appended until a
    stopword is met or the transcript ends, so numbers and further words after
    the first one are kept as part of the phrase.

    Args:
        transcript: The utterance text.

    Returns:
        The destination phrase (words joined by single spaces), or ``""`` when
        no suitable starting word exists.
    """
    # Infinitives that begin with "ל" but introduce the destination rather
    # than name it.
    ignored_starters = {"להגיע", "ללכת", "לנסוע", "לעלות", "לרדת", "לנסיעה"}
    stopwords = {
        "ש", "אם", "אז", "אבל", "ואם", "או", "על", "של", "עם", "שבו", "ב",
        "האם", "איך", "תגידי", "תגיד", "תגידו", "תוכלי", "תוכל", "לעזור", "לי", "בבקשה", "אתה", "אני", "אולי"
    }

    words = transcript.split()
    for i, word in enumerate(words):
        # A stopword such as "לעזור" also starts with "ל"; it must not be
        # mistaken for the destination itself.
        is_candidate = (
            word.startswith("ל")
            and len(word) > 2
            and word not in ignored_starters
            and word not in stopwords
        )
        if not is_candidate:
            continue

        dest_words = [word]
        for next_word in words[i + 1:]:
            # A stopword ends the phrase, whether or not a number was seen.
            if next_word in stopwords:
                break
            dest_words.append(next_word)
        return " ".join(dest_words)
    return ""





def _extract_transport_slots(transcript):
    """Compute the ``mode``, ``destination`` and ``origin`` slots of a route request.

    Runs the module-level ``oracle`` NER pipeline once on ``transcript`` and
    falls back to ``extract_destination_fallback`` when no destination entity
    is found.

    Returns:
        Dict with the keys ``"mode"``, ``"destination"``, ``"origin"`` (in
        that order).
    """
    slots = {"mode": "רכב", "destination": "", "origin": "מהמיקום שלי"}

    modes = {
        ("רכבת", "אוטובוס"): "תחבורה ציבורית",
        ("רגל", "הלכ"): "רגל",
    }
    # Every keyword group is checked, so when several groups occur in the
    # transcript the last matching group decides the mode.
    for keywords, mode_label in modes.items():
        if any(keyword in transcript for keyword in keywords):
            slots["mode"] = mode_label

    allowed_ents = ["GPE", "FAC", "PER", "CARDINAL", "NUMBER", "LOC"]

    prev_state = None
    destination_words = []
    origin_words = []

    for res in oracle(transcript):
        # One character past the reported end offset is included and then
        # stripped; slicing clamps at the end of the string.
        extracted_word = transcript[res["start"]:res["end"] + 1].strip()

        if res["entity_group"] not in allowed_ents:
            # Any other entity type breaks the current destination/origin run.
            prev_state = None
            continue

        if extracted_word.startswith("ל") and not destination_words:
            # First entity prefixed with "ל" opens the destination.
            destination_words.append(extracted_word)
            prev_state = "destination"
        elif extracted_word.startswith("מ") and not origin_words:
            # First entity prefixed with "מ" opens the origin.
            origin_words.append(extracted_word)
            prev_state = "origin"
        elif prev_state == "destination":
            destination_words.append(extracted_word)
        elif prev_state == "origin":
            origin_words.append(extracted_word)

    if destination_words:
        slots["destination"] = " ".join(destination_words)
    if origin_words:
        slots["origin"] = " ".join(origin_words) or "מהמיקום שלי"

    # Fall back to the rule-based address finder when NER found no destination.
    if not slots["destination"]:
        fallback_address = extract_destination_fallback(transcript)
        if fallback_address:
            slots["destination"] = fallback_address

    return slots


def build_action_json(transcript, ner_result, intent):
    """Build the slot dictionary for one utterance.

    The slots to fill are the placeholder names of the intent's template,
    looked up in the module-level ``intent_keys``. Each slot is filled from
    the NER output, from regular expressions over the transcript, or from a
    hard-coded default.

    Args:
        transcript: The utterance text.
        ner_result: Entity spans for ``transcript`` (dicts with ``"word"`` and
            ``"entity_group"``). Not used for ``transport_query``, which runs
            the ``oracle`` pipeline instead.
        intent: Intent label; surrounding whitespace is stripped.

    Returns:
        Dict with ``"intent"`` plus one entry per slot of the intent's
        template. A slot that cannot be determined is ``None``. An intent with
        no template yields only ``{"intent": ...}``.
    """
    intent = intent.strip()
    keys = intent_keys.get(intent, [])
    result = {"intent": intent}

    # All three transport slots come from a single analysis, so it is done
    # once here rather than once per slot (each run invokes the oracle model).
    if intent == "transport_query":
        if keys:
            result.update(_extract_transport_slots(transcript))
        return result

    # Extract entities
    full_name = extract_entity(ner_result, "PER")
    location = extract_entity(ner_result, "LOC")
    date = extract_entity(ner_result, "DATE")
    time = extract_entity(ner_result, "TIME")

    # Message body: the text after the last ":" if there is one, otherwise
    # the text after the last occurrence of the detected person name.
    if ":" in transcript:
        msg = transcript.split(":")[-1]
    elif full_name and full_name in transcript:
        msg = transcript.split(full_name)[-1].strip()
    else:
        msg = None

    for key in keys:
        if key == "period":
            # Always set the slot (None when unknown) so formatting the
            # template never fails on a missing "period" key.
            # NOTE(review): these values already carry the "ב" prefix while
            # the alarm_set template also prepends "ב" -- confirm intent.
            period = None
            if time:
                try:
                    hour = int(time.split(":")[0])
                except ValueError:
                    hour = None
                if hour is not None:
                    if 5 <= hour < 12:
                        period = "בבוקר"
                    elif 12 <= hour < 17:
                        period = "בצהריים"
                    else:
                        period = "בערב"
            result[key] = period

        elif key == "contact_name":
            # Prefer the one or two words following "את"; otherwise use the
            # first token of the detected person name.
            match_contact = re.search(r"את\s+([\w\-׳״\"]+(?:\s[\w\-׳״\"]+)?)", transcript)
            if match_contact:
                result[key] = match_contact.group(1).strip()
            elif full_name:
                result[key] = full_name.split()[0]
            else:
                result[key] = None

        elif key == "message":
            result[key] = msg

        elif key == "time":
            # One or two tokens right after "בשעה " win over the NER TIME span.
            match_time = re.search(r"(?<=בשעה )[^ ]+( [^ ]+)?", transcript)
            if match_time:
                result[key] = match_time.group(0).strip()
            else:
                result[key] = time or ("עוד שעתיים" if "עוד שעתיים" in transcript else None)

        elif key == "date":
            if intent == "weather_query":
                result[key] = date or "היום"
            else:
                # Phrase starting with "בחמישי" up to (not including) one of
                # the marker words in the lookahead.
                match_date = re.search(r"(בחמישי(?: [^ ]+)*?) (?=בשעה|שעה|תור)", transcript)
                if match_date:
                    result[key] = match_date.group(1).strip()
                else:
                    result[key] = date or ("5/5" if "חמישי לחמישי" in transcript else None)

        elif key == "items":
            # Take what follows the introducing word, or the whole transcript.
            match = re.search(r"(?:של|שכתוב|שיהיה)[^ ]*\s+(.*)", transcript)
            raw_items = match.group(1) if match else transcript

            # Plain substring removal of filler phrases (not word-bounded).
            garbage_phrases = [
                "סירי", "היי", "אני רוצה", "תכתבי", "שתכתבי", "תכתוב", "תיצרי", "בה", "בו", "בים", "תעשי", "תעשה",
                "תוסיפי", "לי", "חדש", "פתק", "פתקים", "בפתקים", "בפתק"
            ]
            for phrase in garbage_phrases:
                raw_items = raw_items.replace(phrase, "")

            # Split on a standalone "ו", a comma, or a "ו" prefixed to a word.
            parts = re.split(r"\s+ו\s+|,|\s+ו(?=\S)", raw_items)
            cleaned = [p.strip(" ,.-") for p in parts if len(p.strip()) > 1]

            if len(cleaned) > 1:
                # Re-join as "a, b וc".
                result[key] = ", ".join(cleaned[:-1]) + " ו" + cleaned[-1]
            elif cleaned:
                result[key] = cleaned[0]
            else:
                result[key] = None

        elif key == "type":
            if "סלפי" in transcript or re.search(r"\bה?מצלמה( הקדמית)?\b", transcript):
                result[key] = "סלפי"
            else:
                result[key] = "תמונה"

        elif key == "location":
            result[key] = location or ("תל אביב" if "תל אביב" in transcript else None)

        elif key == "device":
            result[key] = "מצב טיסה" if "מצב טיסה" in transcript else "התקן"

        elif key == "search string":
            result[key] = transcript

    return result

# Extract intent_json: one slot dictionary per row, or None when extraction fails.
intent_jsons = []
failed_rows = []  # (row index, error repr) for every row that raised
for index, row in df.iterrows():
    transcript = row["transcript_hebrew"]
    intent = row["intent"]
    try:
        ner_result = ner_pipeline(transcript)
        json_obj = build_action_json(transcript, ner_result, intent)
    except Exception as e:
        # Best effort: a bad row must not abort the run, but it is recorded
        # so failures are visible instead of silently becoming None.
        json_obj = None
        failed_rows.append((index, repr(e)))

    intent_jsons.append(json_obj)

if failed_rows:
    print(f"intent_json extraction failed for {len(failed_rows)} of {len(df)} rows")
    for failed_index, failed_error in failed_rows[:10]:
        print(f"  row {failed_index}: {failed_error}")

df["intent_json"] = intent_jsons

# Save results
df.to_csv("intent_extracted_cleaned.csv", index=False, encoding="utf-8-sig")
# files.download("intent_extracted_cleaned.csv")


In [9]:
# Extract rephrased sentence from intent_json using templates
def rephrase_from_intent_json(intent, intent_json):
    """Render the intent's sentence template with the extracted slot values.

    Args:
        intent: Intent label used to look up the template in the module-level
            ``intent_templates``.
        intent_json: Mapping of slot name to value. It is not modified.

    Returns:
        The formatted sentence; ``None`` when the intent has no template or
        ``intent_json`` is empty/None; or a string starting with
        "שגיאה בפורמט: " describing the error when formatting fails (for
        example because a slot is missing).
    """
    template = intent_templates.get(intent)
    if not template or not intent_json:
        return None

    # Alias keys: expose a value under a second name when only the first exists.
    mapping = {
        "contact_name": "contact_first_name",
        "search": "search string"
    }

    # Work on a copy so the aliases are not written back into the caller's
    # dictionary (the same objects are stored in the DataFrame and saved).
    values = dict(intent_json)
    for old_key, new_key in mapping.items():
        if old_key in values and new_key not in values:
            values[new_key] = values[old_key]

    try:
        return template.format(**values)
    except Exception as e:
        return f"שגיאה בפורמט: {e}"

# Render one template sentence per row from its intent and extracted slots.
df["rephrased"] = df.apply(
    lambda record: rephrase_from_intent_json(record["intent"], record["intent_json"]),
    axis=1,
)

# Write the table, now including the "rephrased" column, and hand the file
# to the browser through the Colab download helper.
df.to_csv("rephrased_intents.csv", index=False, encoding="utf-8-sig")
files.download("rephrased_intents.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>