In [1]:
# Import libraries
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tokenizers.decoders import WordPiece

# Load the primary Hebrew NER model and wrap it in a token-classification pipeline.
# Later cells read "entity_group" and "word" from each returned entity, which is the
# grouped-entity output produced when an aggregation strategy is set.
model_name = "avichr/heBERT_NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Second NER pipeline ("oracle"); build_action_json uses it only for transport_query
# transcripts, reading "start"/"end"/"entity_group" from its results.
oracle = pipeline('ner', model='dicta-il/dictabert-ner', aggregation_strategy='simple')
# Swap the backend tokenizer's decoder for a WordPiece decoder.
# NOTE(review): presumably so aggregated words are decoded without sub-word markers — confirm.
oracle.tokenizer.backend_tokenizer.decoder = WordPiece()

# Load the input CSV from the Colab working directory (hard-coded absolute path).
# Later cells read its "transcript_hebrew" and "intent" columns.
df = pd.read_csv("/content/siri_failed_model_succeeded.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/299k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/735M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.59M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Device set to use cuda:0


In [2]:
# Sentence templates, one per supported intent. Every "{placeholder}" names a slot
# that build_action_json fills in and rephrase_from_intent_json substitutes back.
intent_templates = {
    # navigation
    "transport_query": "את יכולה למצוא לי מסלול {origin} {destination} ב{mode}?",
    # clock / calendar
    "alarm_set": "את יכולה לכוון שעון מעורר לשעה {time} {period}?",
    # phone / messaging
    "call_contact": "את יכולה לחייג ל{contact_name}?",
    "send_message": "את יכולה לשלוח הודעה {message} ל{contact_name}?",
    "calendar_set": " את יכולה ליצור לי פגישה ביומן בתאריך {date} ב{time}?",
    # camera, lists, weather, devices
    "camera_query": "אני רוצה לצלם {type}",
    "lists_createoradd": "תיצרי לי רשימה שיהיה בה {items}",
    "weather_query": "מה מזג האוויר {date} {location}?",
    "iot_wemo_on": "אני רוצה להדליק {device}",
    # free-form question
    "query": "את יכולה לבדוק ולהגיד לי {search string}"
}

# Placeholder names per intent, in the order they appear inside the template.
intent_keys = dict(
    zip(
        intent_templates.keys(),
        (re.findall(r"{(.*?)}", text) for text in intent_templates.values()),
    )
)

# Helper functions
def extract_entities(ner_result, label):
    """Collect the "word" of every entity whose "entity_group" contains `label`.

    The comparison is a substring test, so a label such as "PER" also matches
    a group named "B-PER". Order of `ner_result` is preserved.
    """
    words = []
    for item in ner_result:
        if label in item["entity_group"]:
            words.append(item["word"])
    return words

def extract_entity(ner_result, label):
    """Return the first entity word matching `label`, or None when nothing matches."""
    matches = extract_entities(ner_result, label)
    if not matches:
        return None
    return matches[0]


In [3]:
from google.colab import files

def extract_destination_fallback(transcript):
    """Heuristically extract a destination phrase from a Hebrew transcript.

    Scans the whitespace-separated words for the first one that looks like a
    "to <place>" token: it starts with the prefix "ל", is longer than two
    characters, and is neither a motion verb nor a stopword. That word and all
    following words up to (not including) the next stopword are returned.

    Args:
        transcript: The utterance text.

    Returns:
        The destination phrase (still carrying its leading "ל"), or "" when no
        candidate word exists.
    """
    # Infinitive motion verbs that start with "ל" but never name a place.
    ignored_starters = {"להגיע", "ללכת", "לנסוע", "לעלות", "לרדת", "לנסיעה"}
    stopwords = {
        "ש", "אם", "אז", "אבל", "ואם", "או", "על", "של", "עם", "שבו", "ב",
        "האם", "איך", "תגידי", "תגיד", "תגידו", "תוכלי", "תוכל", "לעזור", "לי", "בבקשה", "אתה", "אני", "אולי"
    }

    words = transcript.split()
    for i, word in enumerate(words):
        if not word.startswith("ל") or len(word) <= 2:
            continue
        # A stopword such as "לעזור" also starts with "ל"; it must not be taken
        # for the destination itself.
        if word in ignored_starters or word in stopwords:
            continue

        dest_words = [word]
        for next_word in words[i + 1:]:
            # The phrase ends at the first stopword; anything else (street
            # numbers, city names) is kept as part of the address.
            if next_word in stopwords:
                break
            dest_words.append(next_word)
        return " ".join(dest_words)
    return ""





def _transport_slots(transcript):
    """Compute the mode/destination/origin slots for a transport_query transcript."""
    slots = {"mode": "רכב", "destination": "", "origin": "מהמיקום שלי"}

    modes = {
        ("רכבת", "אוטובוס"): "תחבורה ציבורית",
        ("רגל", "הלכ"): "רגל"
    }
    # Every keyword group is checked; a later group overrides an earlier one.
    for keywords, mode_name in modes.items():
        if any(keyword in transcript for keyword in keywords):
            slots["mode"] = mode_name

    allowed_ents = {"GPE", "FAC", "PER", "CARDINAL", "NUMBER", "LOC"}
    destination_words = []
    origin_words = []
    prev_state = None  # which phrase the previous allowed entity extended

    for res in oracle(transcript):
        # The slice runs one character past the reported end and is then stripped.
        # NOTE(review): presumably compensates for an offset quirk of the pipeline — confirm.
        extracted_word = transcript[res['start']:min(res['end'] + 1, len(transcript))].strip()

        if res['entity_group'] not in allowed_ents:
            # Any other entity type ends the phrase currently being collected.
            prev_state = None
            continue

        if extracted_word.startswith("ל") and not destination_words:
            destination_words.append(extracted_word)
            prev_state = "destination"
        elif extracted_word.startswith("מ") and not origin_words:
            origin_words.append(extracted_word)
            prev_state = "origin"
        elif prev_state == "destination":
            destination_words.append(extracted_word)
        elif prev_state == "origin":
            origin_words.append(extracted_word)

    if destination_words:
        slots["destination"] = " ".join(destination_words)
    if origin_words:
        slots["origin"] = " ".join(origin_words) or "מהמיקום שלי"

    # Fall back to the word-based heuristic when NER found no destination.
    if not slots["destination"]:
        fallback_address = extract_destination_fallback(transcript)
        if fallback_address:
            slots["destination"] = fallback_address

    return slots


def _infer_period(transcript, time_entity):
    """Pick the part-of-day slot from keywords, else from the hour of the TIME entity."""
    if any(p in transcript for p in ["בבוקר", "בוקר"]):
        return "בבוקר"
    if any(p in transcript for p in ["בערב", "לערב", "ערב"]):
        return "בערב"
    if any(p in transcript for p in ["בלילה", "ללילה"]):
        return "בלילה"
    if "אחר הצהריים" in transcript or "בצהריים" in transcript:
        return "בצהריים"
    if "מחר" in transcript:
        return "מחר"
    if "היום" in transcript:
        return "היום"
    if not time_entity:
        return None

    try:
        hour = int(time_entity.split(":")[0])
    except (AttributeError, TypeError, ValueError):
        # The entity text is not a numeric "HH:MM" string (e.g. spelled-out words).
        return None
    if 5 <= hour < 12:
        return "בבוקר"
    if 12 <= hour < 17:
        return "בצהריים"
    return "בערב"


def _extract_list_items(transcript):
    """Turn the tail of a list-creation request into a "a, b וc" item string (or None)."""
    match = re.search(r"(?:של|שכתוב|שיהיה)[^ ]*\s+(.*)", transcript)
    raw_items = match.group(1) if match else transcript

    garbage_phrases = [
        "סירי", "היי", "אני רוצה", "תכתבי", "שתכתבי", "תכתוב", "תיצרי", "בה", "בו", "בים", "תעשי", "תעשה",
        "תוסיפי", "לי", "חדש", "פתק", "פתקים", "בפתקים", "בפתק"
    ]
    # NOTE(review): plain substring removal also strips these letters from inside
    # longer words; kept as-is because several entries rely on that ordering.
    for phrase in garbage_phrases:
        raw_items = raw_items.replace(phrase, "")

    parts = re.split(r"\s+ו\s+|,|\s+ו(?=\S)", raw_items)
    cleaned = [p.strip(" ,.-") for p in parts if len(p.strip()) > 1]

    if len(cleaned) > 1:
        return ", ".join(cleaned[:-1]) + " ו" + cleaned[-1]
    if cleaned:
        return cleaned[0]
    return None


def build_action_json(transcript, ner_result, intent):
    """Build the slot dictionary ("action JSON") for one transcript.

    Args:
        transcript: The Hebrew utterance text.
        ner_result: Grouped entities for the transcript; each item is read via
            its "entity_group" and "word" keys.
        intent: Intent name; surrounding whitespace is ignored. The slots to
            fill are the placeholder names listed for it in `intent_keys`.

    Returns:
        A dict starting with {"intent": <intent>} followed by one entry per
        slot. `contact_name` additionally yields `contact_first_name`. Slots
        that cannot be determined are None, except where a default string is
        defined below.
    """
    intent = intent.strip()
    keys = intent_keys.get(intent, [])
    result = {"intent": intent}

    # Transport slots are derived together from a single pass of the oracle
    # pipeline, so they are computed once instead of once per template key.
    if intent == "transport_query":
        if keys:
            result.update(_transport_slots(transcript))
        return result

    # Extract entities
    full_name = extract_entity(ner_result, "PER")
    location = extract_entity(ner_result, "LOC")
    date_entity = extract_entity(ner_result, "DATE")
    time_entity = extract_entity(ner_result, "TIME")

    # Message body: text after the last ":" if any, otherwise text after the name.
    if ":" in transcript:
        msg = transcript.split(":")[-1]
    elif full_name and full_name in transcript:
        msg = transcript.split(full_name)[-1].strip()
    else:
        msg = None

    for key in keys:
        if key == "period":
            result[key] = _infer_period(transcript, time_entity)

        elif key == "contact_name":
            if full_name:
                # Drop a leading "ל" prefix from the recognised name.
                # NOTE(review): this also trims names that genuinely begin with "ל".
                clean_name = re.sub(r"^ל(?=[א-ת])", "", full_name.strip())
                name_parts = clean_name.split()
                result[key] = clean_name
                result["contact_first_name"] = name_parts[0] if name_parts else clean_name
            else:
                # Fallback: take up to two Hebrew words following a "ל".
                # NOTE(review): the pattern is not anchored to a word start.
                match = re.search(r"ל([א-ת]+(?: [א-ת]+)?)", transcript)
                if match:
                    name = match.group(1).strip()
                    result[key] = name
                    result["contact_first_name"] = name.split()[0]
                else:
                    result[key] = None
                    result["contact_first_name"] = None

        elif key == "message":
            result[key] = msg

        elif key == "time":
            # Regular pattern: 6:48, 06.30, etc.
            # NOTE(review): a dotted date such as 4.6.25 can also match here.
            time_match = re.search(r"(\d{1,2})[:٫.](\d{2})", transcript)
            if time_match:
                result[key] = f"{int(time_match.group(1)):02}:{time_match.group(2)}"
            else:
                # Simple pattern: a bare hour after the word "שעה".
                simple_time_match = re.search(r"שעה\s+(\d{1,2})[,\.]?", transcript)
                if simple_time_match:
                    hour = int(simple_time_match.group(1))
                    result[key] = f"{hour:02}:00"
                elif time_entity and time_entity != ":":
                    clean_time = re.sub(r"[^0-9:]", "", time_entity)
                    result[key] = clean_time if ":" in clean_time else None
                elif intent == "calendar_set":
                    result[key] = "שעה הנוכחית"
                else:
                    result[key] = None

        elif key == "date":
            if intent == "weather_query":
                result[key] = date_entity or "היום"
            else:
                # Numeric date such as 4.6.25 or 04/06/2025.
                date_match = re.search(r"\b(\d{1,2})[./](\d{1,2})[./](\d{2,4})\b", transcript)
                if date_match:
                    day, month, year = date_match.groups()
                    result[key] = f"{int(day):02}/{int(month):02}/{year}"
                elif date_entity and date_entity.strip() != "בתאריך":
                    result[key] = date_entity
                else:
                    result[key] = "של היום"

        elif key == "items":
            result[key] = _extract_list_items(transcript)

        elif key == "type":
            if "סלפי" in transcript or re.search(r"\bה?מצלמה( הקדמית)?\b", transcript):
                result[key] = "סלפי"
            else:
                result[key] = "תמונה"

        elif key == "location":
            result[key] = location or ("תל אביב" if "תל אביב" in transcript else None)

        elif key == "device":
            result[key] = "מצב טיסה" if "מצב טיסה" in transcript else "התקן"

        elif key == "search string":
            result[key] = transcript

    return result

# Extract intent_json: run NER on every transcript and build its slot dictionary.
intent_jsons = []
for index, row in df.iterrows():
    transcript = row["transcript_hebrew"]
    intent = row["intent"]
    try:
        ner_result = ner_pipeline(transcript)
        json_obj = build_action_json(transcript, ner_result, intent)

        # "contact_name" may be present with a None value, so `.get(key, "")`
        # is not enough; a None here used to raise and discard the whole row.
        contact_name = json_obj.get("contact_name") or ""
        if intent == "call_contact" and contact_name.startswith("ל"):
            # Strip a remaining "ל" prefix and keep the first-name slot in sync.
            contact_name = contact_name[1:]
            name_parts = contact_name.split()
            json_obj["contact_name"] = contact_name
            json_obj["contact_first_name"] = name_parts[0] if name_parts else contact_name

    except Exception as e:
        # Best effort: a failing row gets None, but the failure is reported
        # instead of being silently swallowed.
        print(f"Row {index}: intent extraction failed ({type(e).__name__}: {e})")
        json_obj = None

    intent_jsons.append(json_obj)

df["intent_json"] = intent_jsons

# Save results
df.to_csv("intent_extracted_cleaned.csv", index=False, encoding="utf-8-sig")
# files.download("intent_extracted_cleaned.csv")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [4]:
# # Extract rephrased sentence from intent_json using templates
# def rephrase_from_intent_json(intent, intent_json):
#     template = intent_templates.get(intent)
#     if not template or not intent_json:
#         return None

#     # Rename keys if needed
#     mapping = {
#         "contact_name": "contact_first_name",
#         "search": "search string"
#     }

#     for old_key, new_key in mapping.items():
#         if old_key in intent_json and new_key not in intent_json:
#             intent_json[new_key] = intent_json[old_key]

#     try:
#         return template.format(**intent_json)
#     except Exception as e:
#         return f"שגיאה בפורמט: {e}"

# # Create a new column with rephrased sentence
# df["rephrased"] = df.apply(lambda row: rephrase_from_intent_json(row["intent"], row["intent_json"]), axis=1)

# # Save the new CSV including rephrased column
# df.to_csv("rephrased_intents.csv", index=False, encoding="utf-8-sig")
# files.download("rephrased_intents.csv")



In [5]:
def rephrase_from_intent_json(intent, intent_json, templates=None):
    """Render the canonical sentence for an intent from its extracted slots.

    Args:
        intent: Intent name used to look up the template; surrounding
            whitespace is ignored when it is a string.
        intent_json: Slot dictionary for the utterance. It is not modified.
        templates: Optional mapping of intent name to format template.
            Defaults to the notebook-level `intent_templates`.

    Returns:
        The filled-in sentence with None slots rendered as empty text and
        repeated spaces collapsed; None when the intent has no template or
        `intent_json` is empty; or a "שגיאה בפורמט: ..." string when the
        template cannot be formatted (for example, a slot is missing).
    """
    if templates is None:
        templates = intent_templates

    lookup_key = intent.strip() if isinstance(intent, str) else intent
    template = templates.get(lookup_key)
    if not template or not intent_json:
        return None

    # Rename keys if needed
    mapping = {
        "contact_name": "contact_first_name",
        "search": "search string"
    }

    # Work on a copy so the caller's dictionary is left untouched;
    # None values become "" so they vanish from the rendered sentence.
    filled = {
        k: ("" if v is None else v)
        for k, v in intent_json.items()
    }
    for old_key, new_key in mapping.items():
        if old_key in filled and new_key not in filled:
            filled[new_key] = filled[old_key]

    try:
        sentence = template.format(**filled)
    except Exception as e:
        return f"שגיאה בפורמט: {e}"

    # Empty slots leave doubled spaces behind; collapse them.
    return re.sub(r" {2,}", " ", sentence).strip()


# Create a new column with the rephrased sentence: each row's intent and its
# intent_json dictionary are rendered through the matching template.
df["rephrased"] = df.apply(lambda row: rephrase_from_intent_json(row["intent"], row["intent_json"]), axis=1)

# Save the new CSV including the rephrased column, then hand it to the browser
# for download through the Colab `files` helper.
df.to_csv("rephrased_intents.csv", index=False, encoding="utf-8-sig")
files.download("rephrased_intents.csv")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# Display the frame, now including the intent_json and rephrased columns added above.
df

Unnamed: 0,transcript_hebrew,intent,label,intent_num,intent_json,rephrased
0,שלום אני צריכה להתקשר תקשרי אותי בבקשה לאנה לוי,call_contact,0,2,"{'intent': 'call_contact', 'contact_name': 'אנ...",את יכולה לחייג לאנה לוי?
1,אני צריכה לכוון שעון מעורר למחר לשעה שש ארבעים...,alarm_set,0,0,"{'intent': 'alarm_set', 'time': None, 'period'...",את יכולה לכוון שעון מעורר לשעה מחר?
2,תשלחי בבקשה לישראל זילברמן הודעת טקסט הבאה בבק...,send_message,0,7,"{'intent': 'send_message', 'message': 'הודעת ט...",את יכולה לשלוח הודעה הודעת טקסט הבאה בבקשה תשל...
3,אני צריכה להגיע בבקשה לשדרות רוטשילד שלושים בב...,transport_query,0,8,"{'intent': 'transport_query', 'mode': 'רגל', '...",את יכולה למצוא לי מסלול מהמיקום שלי לשדרות רוט...
4,אני צריכה שתכניסי ליומן ליומן הפגישה עם דוקטור...,calendar_set,0,1,"{'intent': 'calendar_set', 'date': 'בתאריך רבי...",את יכולה ליצור לי פגישה ביומן בתאריך בתאריך רב...
...,...,...,...,...,...,...
145,הייתי רוצה לשאול מה העיר בירה של ליז,query,0,6,"{'intent': 'query', 'search string': 'הייתי רו...",את יכולה לבדוק ולהגיד לי הייתי רוצה לשאול מה ה...
146,הייתי רוצה לשאול מה עיר הבירה של בליז,query,0,6,"{'intent': 'query', 'search string': 'הייתי רו...",את יכולה לבדוק ולהגיד לי הייתי רוצה לשאול מה ע...
147,תרשום לי פתק עם רשימת הקניות שבה יהיה חלב ביצי...,lists_createoradd,0,5,"{'intent': 'lists_createoradd', 'items': 'תרשו...",תיצרי לי רשימה שיהיה בה תרשום עם רשימת הקניו...
148,תעשי לי פתק לקניה רשימה קנייה שבה יהיה חלב ביצ...,lists_createoradd,0,5,"{'intent': 'lists_createoradd', 'items': 'לקני...",תיצרי לי רשימה שיהיה בה לקניה רשימה קנייה ש יה...
