# English Learning App - Word Sense Filtering
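This notebook removes category words from each song when the way the word is used in the lyrics does not match its intended meaning. The check translates the sentence containing the word into Hebrew and compares the translation with the intended Hebrew meaning.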

In [None]:
import pandas as pd
import ast
import re
from sentence_transformers import SentenceTransformer, util
from googletrans import Translator
import time
from difflib import SequenceMatcher

# Inputs and shared resources used by the rest of the notebook.
LYRICS_CSV = '../data/filtered.csv'
EMBEDDING_MODEL = 'paraphrase-MiniLM-L6-v2'

# Song table; later code reads its 'cleaned_lyrics' and 'category_words' columns.
df = pd.read_csv(LYRICS_CSV)

# NOTE(review): this checkpoint appears to be an English paraphrase model, yet
# it is used below to embed Hebrew text. Confirm that is intended; a
# multilingual checkpoint (e.g. paraphrase-multilingual-MiniLM-L12-v2) may fit
# better, but the similarity threshold would then need re-tuning.
model = SentenceTransformer(EMBEDDING_MODEL)

translator = Translator()

# English sentence -> Hebrew translation, filled lazily to avoid repeat requests.
translation_cache = {}

def translate_with_retry(sentence, retries=3, delay=1):
    """Translate an English sentence to Hebrew, retrying on failure.

    Args:
        sentence: English text to translate.
        retries: Maximum number of translation attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The translated text, or None if every attempt failed (or if
        ``retries`` is not positive).
    """
    for attempt in range(1, retries + 1):
        try:
            return translator.translate(sentence, src='en', dest='he').text
        except Exception as e:
            # Best effort: any failure (e.g. a connection reset) is reported
            # and retried instead of aborting the whole run.
            print(f"Translation failed (attempt {attempt}): {e}")
            # Only pause when another attempt follows; sleeping after the
            # last failure would just delay returning None.
            if attempt < retries:
                time.sleep(delay)
    return None

def translate_with_cache(sentence):
    """Return the Hebrew translation of *sentence*, reusing earlier results.

    Successful translations are stored in ``translation_cache``. A falsy
    result (failed or empty translation) is returned as-is and not stored,
    so a later call will try the translation again.
    """
    try:
        return translation_cache[sentence]
    except KeyError:
        pass

    result = translate_with_retry(sentence)
    if not result:
        return result

    translation_cache[sentence] = result
    return result

# Intended Hebrew meanings.
# Maps an English category word (or phrase) to the Hebrew rendering of the
# sense the app wants to teach. Only words listed here are sense-checked by
# the filtering loop; any other category word is kept without translation.
intended_hebrew = {
    'kind': 'אדיב',
    'bitter': 'מר',
    'shame': 'בושה',
    'tired': 'עייף',
    'loved': 'אהוב',
    'feelings': 'רגשות',
    'strong': 'חזק',
    'in love with': 'מאוהב',
    'love': 'אהבה',
    'impatient': 'חסר סבלנות',
    # Add more as needed
}

# Sentence boundaries: a period, exclamation mark, question mark or newline.
_SENTENCE_BREAK = re.compile(r'[.!?\n]')


def split_sentences(text):
    """Split *text* at every sentence-ending character.

    The delimiters themselves are dropped. Pieces are returned untrimmed, and
    empty pieces (e.g. after a trailing period) are kept.
    """
    return _SENTENCE_BREAK.split(text)

def _best_window_similarity(translated, meaning):
    """Return the best SequenceMatcher ratio between *meaning* and *translated*.

    The whole translated string is scored, and so is every run of consecutive
    words that has as many words as *meaning*.
    """
    best = SequenceMatcher(None, translated, meaning).ratio()
    tokens = re.findall(r'\w+', translated)
    width = max(1, len(meaning.split()))
    for start in range(len(tokens) - width + 1):
        window = ' '.join(tokens[start:start + width])
        best = max(best, SequenceMatcher(None, window, meaning).ratio())
    return best


# Function to check if the word is used in the correct sense
def is_correct_sense(word, sentence, intended_hebrew_meaning, threshold=0.75):
    """Decide whether *word* is used in its intended sense within *sentence*.

    The sentence is translated to Hebrew and then checked in three steps:
    an exact substring match of the intended meaning, a fuzzy string match,
    and finally an embedding cosine similarity against ``threshold``.

    Args:
        word: The English word being checked (used only in log output).
        sentence: The English sentence that contains the word.
        intended_hebrew_meaning: Hebrew rendering of the wanted sense.
        threshold: Minimum cosine similarity for the embedding fallback.

    Returns:
        True if any step accepts the usage. False otherwise, including when
        the translation or the embedding step fails.
    """
    translated = translate_with_cache(sentence)
    if not translated:
        return False

    # Check for a direct match of the Hebrew meaning in the translated sentence
    if intended_hebrew_meaning in translated:
        print(f"\n✔️ {word.upper()} found via direct match: '{intended_hebrew_meaning}' in '{translated}'")
        return True

    # Fuzzy match. Scoring only the whole sentence against a short meaning
    # gives a ratio near zero for any real sentence, so word windows of the
    # same length as the meaning are scored too and the best ratio is used.
    similarity_str = _best_window_similarity(translated, intended_hebrew_meaning)
    if similarity_str > 0.85:  # tweak as needed
        print(f"\n✔️ {word.upper()} found via fuzzy string match ({similarity_str:.2f})")
        return True

    # Fallback to semantic similarity
    try:
        emb_sentence = model.encode(translated, convert_to_tensor=True)
        emb_meaning = model.encode(intended_hebrew_meaning, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(emb_sentence, emb_meaning).item()
    except Exception as e:
        # Best effort: an embedding failure rejects the word instead of
        # stopping the whole run.
        print(f"Embedding failed: {e}")
        return False

    print(f"\nWord: {word}")
    print(f"Original Sentence: {sentence}")
    print(f"Translated (HE): {translated}")
    print(f"Intended (HE): {intended_hebrew_meaning}")
    print(f"Similarity: {similarity:.4f} — {'✔️ KEEP' if similarity >= threshold else '❌ SKIP'}")

    return similarity >= threshold


def _find_sentence(word, sentences):
    """Return the first sentence that uses *word*, preferring whole-word hits.

    A plain substring test would let 'kind' select a sentence that only
    contains 'kindness'. Such substring-only hits are used as a fallback when
    no sentence contains the word on its own. Returns None if no sentence
    contains the word at all.
    """
    whole_word = re.compile(rf'\b{re.escape(word)}\b')
    fallback = None
    for raw in sentences:
        if word not in raw:
            continue
        if whole_word.search(raw):
            return raw.strip()
        if fallback is None:
            fallback = raw.strip()
    return fallback


def _keeps_sense(word, sentences):
    """Return True if *word* should stay in its category group."""
    if word not in intended_hebrew:
        # No intended meaning registered: nothing to check, keep the word.
        return True
    sentence = _find_sentence(word, sentences)
    # Only the first matching sentence is checked.
    return bool(sentence) and is_correct_sense(word, sentence, intended_hebrew[word])


filtered_rows = []

for _, row in df.iterrows():
    raw_lyrics = row['cleaned_lyrics']
    # Missing lyrics are read by pandas as NaN (a float); treat them as empty
    # text instead of crashing on .lower().
    lyrics = raw_lyrics.lower() if isinstance(raw_lyrics, str) else ''
    sentences = split_sentences(lyrics)

    # 'category_words' holds the repr of a list of word groups.
    category_groups = ast.literal_eval(row['category_words'])

    updated_groups = []
    for group in category_groups:
        filtered_group = [word for word in group if _keeps_sense(word, sentences)]
        # Groups left with no words are dropped entirely.
        if filtered_group:
            updated_groups.append(filtered_group)

    row['category_words'] = str(updated_groups)
    filtered_rows.append(row)

# Save results
filtered_df = pd.DataFrame(filtered_rows)
filtered_df.to_csv('../data/filtered_songs_disambiguated.csv', index=False)




Translation failed (attempt 1): [Errno 104] Connection reset by peer
Translation failed (attempt 2): [Errno 104] Connection reset by peer

✔️ FEELINGS found via direct match: 'רגשות' in 'איבוד שליטה ברגשותינו'

✔️ SHAME found via direct match: 'בושה' in 'האם אתה שפוי איפה הבושה'

Word: kind
Original Sentence: what kind of world do we live in
Translated (HE): באיזה סוג עולם אנו חיים
Intended (HE): אדיב
Similarity: 0.7474 — ❌ SKIP
Translation failed (attempt 1): [Errno 104] Connection reset by peer

✔️ IN LOVE WITH found via direct match: 'מאוהב' in 'ולמה אני כל כך מאוהב בך'

✔️ LOVE found via direct match: 'אהבה' in 'פעם הייתה לי אהבה סודית'

✔️ IMPATIENT found via direct match: 'חסר סבלנות' in 'נעשה חסר סבלנות להיות חופשי'

Word: kind
Original Sentence: or are we meant to be kind
Translated (HE): או שאנו אמורים להיות חביבים
Intended (HE): אדיב
Similarity: 0.8110 — ✔️ KEEP
Translation failed (attempt 1): [Errno 104] Connection reset by peer
Translation failed (attempt 2): [Errno 104] Co