# Libraries and dependencies

In [None]:
pip install -r requirements2.txt

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
from tqdm import tqdm
from InstructorEmbedding import INSTRUCTOR

# Hot word similarity

## Load dev dataset

In [None]:
# Normalize function
def normalize_text(text) -> str:
    """Return *text* normalized for hotword comparison.

    The transcript is stripped of ``<unk>`` markers (any letter case),
    upper-cased, reduced to the characters ``A-Z``, apostrophe and space,
    and has runs of whitespace collapsed to a single space.

    Args:
        text: The raw transcript. Missing values (``None`` or a float NaN,
            which is what pandas yields for empty CSV cells) are accepted;
            any other non-string value is converted with ``str()``.

    Returns:
        The normalized string; ``""`` when the input is missing or contains
        no permitted characters.
    """
    # Empty CSV cells reach this function as None/NaN; re.sub would raise
    # TypeError on them, so map them to an empty transcript instead.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r"<unk>", "", text, flags=re.IGNORECASE)
    text = text.upper().strip()
    text = re.sub(r"[^A-Z' ]+", "", text)
    # Removing characters can leave doubled spaces behind; collapse them last.
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [None]:
# Read the dev split (adjust the path to your checkout) and attach a
# "normalized_text" column derived from the raw "text" transcripts.
df = pd.read_csv("../data/cv-valid-dev.csv").assign(
    normalized_text=lambda frame: frame["text"].apply(normalize_text)
)

## Load embedding model

In [None]:
# Load the INSTRUCTOR embedding model identified by "hkunlp/instructor-large".
# The same model instance is used below to embed both the hotwords and every
# normalized transcript, so their embeddings are directly comparable.
model = INSTRUCTOR("hkunlp/instructor-large")

## Define task and hot words

In [None]:
# Task and Hotwords
# `task` is the instruction string paired with every text passed to the model.
# Refer to https://huggingface.co/hkunlp/instructor-large for more information.
task = "Retrieve sentences related to the hotword"
# Hotwords are written in upper case, the same case normalize_text produces.
hotwords = ["BE CAREFUL", "DESTROY", "STRANGER"]
# Embed each hotword once, as an [instruction, text] pair, so the embeddings
# can be reused for every transcript comparison.
hotword_embeddings = model.encode([[task, word] for word in hotwords])

## Set similarity threshold

In [None]:
# Set similarity threshold
# A transcript is flagged as similar when its cosine similarity to at least
# one hotword is >= this value. 0.85 is an arbitrary choice: raise it to
# require closer matches, lower it to accept looser ones.
SIMILARITY_THRESHOLD = 0.85

## Compute similarity and classify each phrase as similar or not similar

In [None]:
# Compute similarity for every row and save the flagged dataset.
#
# All transcripts are embedded in a single batched `model.encode` call rather
# than one call per row, and compared against the hotword embeddings with one
# cosine-similarity matrix (rows = transcripts, columns = hotwords).
from pathlib import Path

texts = df["normalized_text"].tolist()

if texts:
    text_embeddings = model.encode(
        [[task, text] for text in texts],
        batch_size=32,
        show_progress_bar=True,
    )
    similarity_matrix = cosine_similarity(text_embeddings, hotword_embeddings)
    # A row is "similar" when at least one hotword meets the threshold.
    similar_flags = (
        (similarity_matrix >= SIMILARITY_THRESHOLD).any(axis=1).tolist()
    )
else:
    # Nothing to encode; keep the result column empty like the DataFrame.
    similar_flags = []

# Add result column
df["similarity"] = similar_flags

# Save the updated DataFrame. Create the target directory first so a missing
# folder cannot discard the embeddings that were just computed.
output_path = "../hotword-detection/cv-valid-dev-similarity.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Saved updated file to {output_path}")
