In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from langdetect import detect
import re
# Load the sentence-embedding model (by its hub id) and the two spaCy
# pipelines that tokenize_text uses for English and French.
EMBED_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
nlp_en = spacy.load("en_core_web_sm")
nlp_fr = spacy.load("fr_core_news_sm")

# Report the embedding width as a quick sanity check that loading worked.
embedding_dim = embed_model.get_sentence_embedding_dimension()
print("embedding model loaded. dim =", embedding_dim)


embedding model loaded. dim = 384


In [6]:
def clean_text(text):
    """Normalize raw text before it is embedded.

    Steps, in order:
      1. lowercase the text;
      2. replace every character that is not a Latin letter (plain or one of
         the listed accented letters), a digit, a character in the Arabic
         block (U+0600-U+06FF), whitespace, or an apostrophe with a space;
      3. collapse every run of whitespace into a single space;
      4. trim leading and trailing whitespace.

    Args:
        text: Input string.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    text = text.lower()
    # keep Latin, Arabic (0600-06FF), digits
    text = re.sub(r"[^a-zA-Zàâçéèêëîïôûùüÿñæœ0-9\u0600-\u06FF\s']", " ", text)
    text = re.sub(r"\s+", " ", text)
    # Strip last: punctuation at the edges (e.g. "happy!") has just been
    # turned into spaces, so stripping before the substitution would leave
    # a stray leading/trailing space in the result.
    return text.strip()

def detect_language(text):
    """Classify ``text`` as English, French, Arabic, Darija, or unknown.

    The language code comes from ``detect``. The "darija" label is only a
    crude heuristic applied on top of an "ar" result: if the text also
    contains ASCII Latin letters (either case) it is reported as Darija.
    Text for which ``detect`` does not return "ar" is never labelled
    "darija", so Darija written purely in Latin letters is not recognized.

    Args:
        text: Input string.

    Returns:
        One of "en", "fr", "ar", "darija", or "unknown". "unknown" is
        returned both when detection fails and when the detected language is
        not one of the supported ones.
    """
    try:
        lang = detect(text)
    except Exception:
        # Best-effort detection: a failure on odd input should not stop the
        # notebook. Unlike a bare ``except``, this still lets
        # KeyboardInterrupt / SystemExit propagate.
        return "unknown"

    if lang == "ar":
        # crude heuristic: mix of Arabic + Latin chars -> Darija.
        # Match both cases so uppercase Latin words count as well.
        if re.search(r"[A-Za-z]", text):
            return "darija"
        return "ar"
    if lang in ("en", "fr"):
        return lang
    return "unknown"

def tokenize_text(text, lang="en"):
    """Split ``text`` into tokens with the tokenizer that matches ``lang``.

    Args:
        text: String to tokenize.
        lang: Language label. "en" and "fr" use the corresponding spaCy
            pipeline; any other value falls back to a regex split.

    Returns:
        A list of token strings. On the spaCy path, punctuation and
        whitespace tokens are dropped; on the fallback path, the result is
        every maximal run of word characters.
    """
    if lang not in ("en", "fr"):
        # fallback regex for Darija/Arabic (and any other label)
        return re.findall(r"\w+", text)

    pipeline = nlp_en if lang == "en" else nlp_fr

    tokens = []
    for token in pipeline(text):
        if token.is_punct or token.is_space:
            continue
        tokens.append(token.text)
    return tokens


In [None]:
# Sample utterances covering English, French, Arabic script, a mixed
# French/English phrase, and Darija written in Latin letters.
sentences = [
    "I feel very sad today and tired",
    "I am so happy and excited!",
    "Je me sens triste et isolé",
    "Je suis stressé par mes examens",
    "I am worried about tomorrow",
    "Je suis très content aujourd'hui",
    "انا حزين اليوم",  # Arabic
    "je suis stressed ",
    "ana dayekh bzaf had lyoum",  # Darija in Latin letters
]

# One line per sentence: the text, its detected language, and its tokens.
for sentence in sentences:
    detected = detect_language(sentence)
    tokens = tokenize_text(sentence, detected)
    print(f"{sentence:40} -> {detected:8} -> {tokens}")


I feel very sad today and tired          -> en       -> ['I', 'feel', 'very', 'sad', 'today', 'and', 'tired']
I am so happy and excited!               -> en       -> ['I', 'am', 'so', 'happy', 'and', 'excited']
Je me sens triste et isolé               -> fr       -> ['Je', 'me', 'sens', 'triste', 'et', 'isolé']
Je suis stressé par mes examens          -> fr       -> ['Je', 'suis', 'stressé', 'par', 'mes', 'examens']
I am worried about tomorrow              -> en       -> ['I', 'am', 'worried', 'about', 'tomorrow']
Je suis très content aujourd'hui         -> fr       -> ['Je', 'suis', 'très', 'content', "aujourd'hui"]
انا حزين اليوم                           -> ar       -> ['انا', 'حزين', 'اليوم']
i suis stressed                          -> unknown  -> ['i', 'suis', 'stressed']
ana dayekh bzaf had lyoum                -> unknown  -> ['ana', 'dayekh', 'bzaf', 'had', 'lyoum']


In [10]:
# Normalize every sample sentence, then embed the whole batch in one call.
cleaned = list(map(clean_text, sentences))
embeddings = embed_model.encode(cleaned, show_progress_bar=True)
print("✅ Embeddings shape:", embeddings.shape)

# Two decimals and no scientific notation keep the matrix readable.
np.set_printoptions(precision=2, suppress=True)

# pairwise similarity: entry [i, j] compares sentence i with sentence j
sim = cosine_similarity(embeddings)
print("🔎 Pairwise similarity matrix:\n", sim)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Embeddings shape: (9, 384)
🔎 Pairwise similarity matrix:
 [[1.   0.38 0.7  0.45 0.44 0.49 0.8  0.56 0.12]
 [0.38 1.   0.21 0.15 0.28 0.77 0.26 0.32 0.1 ]
 [0.7  0.21 1.   0.42 0.21 0.19 0.66 0.46 0.03]
 [0.45 0.15 0.42 1.   0.3  0.17 0.44 0.77 0.12]
 [0.44 0.28 0.21 0.3  1.   0.27 0.47 0.34 0.01]
 [0.49 0.77 0.19 0.17 0.27 1.   0.42 0.31 0.08]
 [0.8  0.26 0.66 0.44 0.47 0.42 1.   0.48 0.08]
 [0.56 0.32 0.46 0.77 0.34 0.31 0.48 1.   0.18]
 [0.12 0.1  0.03 0.12 0.01 0.08 0.08 0.18 1.  ]]


In [12]:
def predict_nearest_label(input_text, candidates, k=1):
    """Rank candidate labels by cosine similarity to ``input_text``.

    Both the input and every candidate are passed through ``clean_text`` and
    embedded with ``embed_model``; candidates are then ordered by the cosine
    similarity of their embedding to the input's embedding.

    Args:
        input_text: Text to classify.
        candidates: Iterable of candidate label strings. It is materialized
            into a list once, so one-shot iterators are accepted.
        k: Maximum number of labels to return (default 1). If ``k`` exceeds
            the number of candidates, all candidates are returned.

    Returns:
        A list of ``(label, similarity)`` tuples, most similar first, with
        the similarity as a Python float. Returns an empty list when there
        are no candidates or ``k`` is not positive.
    """
    # Materialize once: the candidates are both iterated (for encoding) and
    # indexed (for building the result), which a generator cannot support.
    candidates = list(candidates)
    if not candidates or k <= 0:
        # Nothing to rank; skip encoding instead of handing an empty array
        # to cosine_similarity.
        return []

    query_emb = embed_model.encode([clean_text(input_text)])
    cand_emb = embed_model.encode([clean_text(c) for c in candidates])
    # Row 0 holds the similarities of the single query against every candidate.
    sims = cosine_similarity(query_emb, cand_emb)[0]
    # argsort is ascending, so reverse it and keep the first k indices.
    best_idx = sims.argsort()[::-1][:k]
    return [(candidates[i], float(sims[i])) for i in best_idx]

# Emotion labels the test utterances are matched against.
candidates = ["sad", "happy", "anxious", "neutral", "stressed"]

# Test utterances: English, French, Latin-script Darija, Arabic script.
tests = [
    "I'm feeling hopeless and sad today",
    "Je suis super content!",
    "s7i7 mlih bzaf",
    "انا متوتر بزاف",
]

# Print each utterance next to its nearest label and similarity score.
for utterance in tests:
    nearest = predict_nearest_label(utterance, candidates)
    print(f"{utterance:35} -> {nearest}")


I'm feeling hopeless and sad today  -> [('anxious', 0.448197603225708)]
Je suis super content!              -> [('happy', 0.7933486104011536)]
s7i7 mlih bzaf                      -> [('sad', 0.20121295750141144)]
انا متوتر بزاف                      -> [('anxious', 0.6432017087936401)]
