# English Learning App - Word Sense Filtering
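This notebook removes category words from a song when the word, as used in the lyrics, does not carry its intended meaning. The sense is checked by translating the lyric sentence that contains the word into Hebrew and comparing the translation with the word's intended Hebrew meaning.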

In [None]:
import pandas as pd
import ast
import re
import os
from sentence_transformers import SentenceTransformer, util
from googletrans import Translator
import time
from difflib import SequenceMatcher
from category_in_hebrew import intended_hebrew

# --- General Configuration ---
# Maximum number of input rows handled per run; the batch is sliced from the
# input starting right after the rows that were already saved.
BATCH_SIZE = 1000
# CSV read as the input song table.
INPUT_PATH = '../../data/songs_category_before_tran1.csv'
# CSV the accumulated results are written to; its row count doubles as the
# resume offset into the input.
OUTPUT_PATH = '../../data/filtered_songs_disambiguated_part1.csv'

# --- Load model and translation tools ---
# Sentence-embedding model used for the semantic-similarity fallback check.
# NOTE(review): the embeddings are computed on Hebrew text, but nothing in
# this checkpoint's name indicates a multilingual model — confirm it supports
# Hebrew before trusting the similarity scores.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
translator = Translator()
# In-memory cache mapping an English sentence to its Hebrew translation;
# it lives only for the duration of the current run.
translation_cache = {}

# Retry mechanism for Google Translate to handle rate limits/errors
def translate_with_retry(sentence, retries=3, delay=1):
    """Translate an English sentence to Hebrew, retrying on failure.

    Args:
        sentence: English text to translate.
        retries: Maximum number of translation attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The translated text, or None when every attempt failed (or when
        ``retries`` is not positive, in which case nothing is attempted).
    """
    for attempt in range(retries):
        try:
            return translator.translate(sentence, src='en', dest='he').text
        except Exception as e:
            # Best-effort: any failure simply counts as a failed attempt.
            print(f"Translation failed (attempt {attempt+1}): {e}")
            # Pause only when another attempt will follow; sleeping after the
            # last failure would just delay the caller for no benefit.
            if attempt < retries - 1:
                time.sleep(delay)
    return None

# Use cache to avoid redundant translations
def translate_with_cache(sentence):
    """Return the Hebrew translation of *sentence*, reusing cached results.

    A cache hit is returned directly. On a miss the sentence is translated
    via ``translate_with_retry``; only truthy results are stored, so a failed
    translation is attempted again the next time the sentence is requested.
    """
    try:
        return translation_cache[sentence]
    except KeyError:
        pass

    result = translate_with_retry(sentence)
    if not result:
        # Failed/empty translation: hand it back without caching it.
        return result
    translation_cache[sentence] = result
    return result

# Basic sentence splitter
def split_sentences(text):
    """Break *text* apart at every '.', '!', '?' or newline character.

    The delimiter characters are dropped. The pieces are returned untouched:
    they are not stripped, and empty strings (from adjacent or trailing
    delimiters) are kept.
    """
    delimiter = re.compile(r'[.!?\n]')
    return delimiter.split(text)

# Check if the word is used in the intended sense
def is_correct_sense(word, sentence, intended_hebrew_meaning, threshold=0.75,
                     fuzzy_threshold=0.85):
    """Decide whether *word* is used in its intended sense within *sentence*.

    The sentence is translated to Hebrew and compared with the intended
    Hebrew meaning in three stages, stopping at the first that accepts:

    1. Direct match: the intended meaning is a substring of the translation.
    2. Fuzzy string match: the ``SequenceMatcher`` ratio between the whole
       translated sentence and the intended meaning exceeds
       ``fuzzy_threshold``. Because the full sentence is compared, this
       stage can only succeed when the translation is about as long as the
       meaning itself.
    3. Semantic match: cosine similarity of the two sentence embeddings is
       at least ``threshold``.

    Args:
        word: The English word being checked (used for logging only).
        sentence: The English sentence in which the word appears.
        intended_hebrew_meaning: Hebrew text expressing the intended sense.
        threshold: Minimum embedding cosine similarity to accept.
        fuzzy_threshold: Ratio the fuzzy string match must exceed to accept.

    Returns:
        True if any stage accepts the word; False otherwise, including when
        the translation is unavailable or the embedding step raises.
    """
    translated = translate_with_cache(sentence)
    if not translated:
        return False

    # Direct match
    if intended_hebrew_meaning in translated:
        print(f"\n✔️ {word.upper()} found via direct match: '{intended_hebrew_meaning}' in '{translated}'")
        return True

    # Fuzzy string match
    similarity_str = SequenceMatcher(None, translated, intended_hebrew_meaning).ratio()
    if similarity_str > fuzzy_threshold:
        print(f"\n✔️ {word.upper()} found via fuzzy string match ({similarity_str:.2f})")
        return True

    # Semantic similarity using sentence embeddings
    try:
        emb_sentence = model.encode(translated, convert_to_tensor=True)
        emb_meaning = model.encode(intended_hebrew_meaning, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(emb_sentence, emb_meaning).item()
        print(f"\nWord: {word}")
        print(f"Original Sentence: {sentence}")
        print(f"Translated (HE): {translated}")
        print(f"Intended (HE): {intended_hebrew_meaning}")
        print(f"Similarity: {similarity:.4f} — {'✔️ KEEP' if similarity >= threshold else '❌ SKIP'}")
        return similarity >= threshold
    except Exception as e:
        # Best-effort: a failed embedding rejects the word instead of
        # aborting the whole batch.
        print(f"Embedding failed: {e}")
        return False

# --- Load data ---
df = pd.read_csv(INPUT_PATH)

# Load previously processed data if exists.
# Resuming relies on the output file holding exactly one row per processed
# input row, in input order, so its length is the offset of the next row.
if os.path.exists(OUTPUT_PATH):
    existing_df = pd.read_csv(OUTPUT_PATH)
    processed_count = len(existing_df)
else:
    existing_df = pd.DataFrame()
    processed_count = 0

print(f"Processed so far: {processed_count}")

# Select the next batch of songs to process
df_batch = df.iloc[processed_count:processed_count + BATCH_SIZE]

if df_batch.empty:
    print("✅ All songs processed.")
    # `exit()` is an interactive-shell helper injected by the `site` module
    # and is not meant for programs; raising SystemExit stops execution the
    # same way (exit status 0) without depending on it.
    raise SystemExit(0)

filtered_rows = []


def _find_sentence(word, sentences):
    """Return the first stripped sentence that uses *word*, or None.

    A whole-word occurrence is preferred, so that e.g. "eye" does not select
    a sentence that only contains "eyes". If no sentence has a whole-word
    occurrence, the first sentence containing *word* as a plain substring is
    used instead, so a sentence is found whenever a substring match exists.
    """
    # Lookarounds instead of \b so words that start or end with a
    # non-alphanumeric character are still matched correctly.
    whole_word = re.compile(rf'(?<!\w){re.escape(word)}(?!\w)')
    for candidate in sentences:
        if whole_word.search(candidate):
            return candidate.strip()
    return next((s.strip() for s in sentences if word in s), None)


# --- Process current batch ---
for idx, row in df_batch.iterrows():
    # Work on a copy: the Series yielded by iterrows() should not be mutated
    # in place, and the modified row is what gets stored in the results.
    row = row.copy()

    # Missing lyrics are read from CSV as NaN (a float); treat any non-string
    # value as empty text instead of crashing in the middle of the batch.
    raw_lyrics = row['cleaned_lyrics']
    lyrics = raw_lyrics.lower() if isinstance(raw_lyrics, str) else ''

    updated_groups = []
    # 'category_words' holds the string form of a list of word lists.
    category_groups = ast.literal_eval(row['category_words'])
    sentences = split_sentences(lyrics)

    for group in category_groups:
        filtered_group = []
        for word in group:
            found = False
            for category_dict in intended_hebrew.values():
                if word in category_dict:
                    intended_hebrew_meaning = category_dict[word]
                    sentence = _find_sentence(word, sentences)
                    # A word with a known intended meaning is kept only when
                    # a sentence using it is found and passes the sense check.
                    if sentence and is_correct_sense(word, sentence, intended_hebrew_meaning):
                        filtered_group.append(word)
                    found = True
                    break
            # Words with no intended-meaning entry cannot be checked; keep them.
            if not found:
                filtered_group.append(word)

        # Groups that end up empty are dropped entirely.
        if filtered_group:
            updated_groups.append(filtered_group)

    row['category_words'] = str(updated_groups)
    filtered_rows.append(row)

# --- Save results ---
# Append this batch's rows to the previously saved ones and rewrite the
# whole output file.
filtered_df = pd.DataFrame(filtered_rows)
frames_to_merge = [existing_df, filtered_df]
combined_df = pd.concat(frames_to_merge, ignore_index=True)
combined_df.to_csv(OUTPUT_PATH, index=False)

batch_end = processed_count + len(filtered_df)
print(f"✅ Batch processed and saved: {processed_count} ➡ {batch_end}")


  from .autonotebook import tqdm as notebook_tqdm


Processed so far: 1000

✔️ AFRAID found via direct match: 'מפחד' in 'אבל אני מפחד הטעם שלי לא מזוקק'

✔️ SORROW found via direct match: 'צער' in 'ולדר לאודר ישב בצער'

Word: encouraged
Original Sentence: encouraged fear when so inclined
Translated (HE): עודד פחד כאשר נוטה כל כך
Intended (HE): מעודד
Similarity: 0.8296 — ✔️ KEEP

✔️ SKIN found via direct match: 'עור' in 'תן לאור השמש לזוהר מתחת לעור שלי'

✔️ EYES found via direct match: 'עיניים' in 'תפור את העיניים'

✔️ EYE found via direct match: 'עין' in 'כאשר העננים מנשקים את העין שלך'

Word: teeth
Original Sentence: for all his teeth had gone
Translated (HE): על כל שיניו הלכו
Intended (HE): שיניים
Similarity: 0.8930 — ✔️ KEEP

✔️ HEAD found via direct match: 'ראש' in 'מישהו ניתק את ראשו'

✔️ NERVES found via direct match: 'עצבים' in 'תן לסוכנים לרקוד על העצבים שלי'

✔️ THUMB found via direct match: 'אגודל' in 'הרם את האגודל לשמיים'

Word: schedule
Original Sentence: i got no free days on my busy schedule
Translated (HE): אין לי ימים 

In [3]:
import pandas as pd

# Report how many rows (songs) the merged dataset CSV contains.
input_file = '../../data/merged_song_with_level.csv'
df = pd.read_csv(input_file)
song_count = df.shape[0]
print(f"Total songs in dataset: {input_file}: {song_count}")

Total songs in dataset: ../../data/merged_song_with_level.csv: 59945
