In [1]:
!pip install sentence_transformers



In [2]:
from sentence_transformers import SentenceTransformer, util

In [3]:
# Name of the pretrained sentence-embedding checkpoint used by this notebook.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"

# Shared encoder instance; the distance helpers read it through the global name `model`.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [4]:
def cosine_distance(vec1, vec2):
    """Return the cosine distance (1 - cosine similarity) between two vectors.

    The similarity is computed by ``util.cos_sim`` and reduced to a single
    number with ``.item()``, so this helper is meant for one pair of vectors
    at a time.
    """
    similarity = util.cos_sim(vec1, vec2).item()
    return 1 - similarity

In [5]:
def word_distance(word1, word2):
    """Return the cosine distance between the embeddings of two texts.

    Both texts are encoded together by the global ``model`` with
    ``normalize_embeddings=True`` and compared with ``cosine_distance``.
    """
    first_vec, second_vec = model.encode([word1, word2], normalize_embeddings=True)
    return cosine_distance(first_vec, second_vec)

In [6]:
def batch_word_distance(word, word_list):
    """Return the cosine distance from ``word`` to every entry of ``word_list``.

    All texts are encoded in a single ``model.encode`` call with
    ``normalize_embeddings=True``; the first embedding belongs to ``word`` and
    the remaining ones to the entries of ``word_list``, in order.

    Args:
        word: Reference text.
        word_list: Texts to compare against ``word``. Any iterable of strings
            is accepted (list, tuple, generator, pandas Series, ...); a single
            string is treated as one target rather than iterated per character.

    Returns:
        list: One distance per entry of ``word_list``, in input order
        (empty when ``word_list`` is empty).
    """
    if isinstance(word_list, str):
        word_list = [word_list]
    # Unpack rather than `[word] + word_list`: `+` only concatenates real lists,
    # raising for tuples/generators and applying element-wise for Series/arrays.
    texts = [word, *word_list]
    embeddings = model.encode(texts, normalize_embeddings=True)
    word_vec = embeddings[0]
    return [cosine_distance(word_vec, other_vec) for other_vec in embeddings[1:]]

# Demo
- SBERT converts any text (a word, phrase, slogan, brand name, etc.) into a high-dimensional vector (usually 768-D).
- Two semantically similar phrases (e.g., “Apple” and “innovation”) will have vectors close together in that space; dissimilar ones (e.g., “Apple” and “diesel engine”) will be far apart.
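- We can quantify that closeness with cosine similarity or, equivalently, cosine distance. This notebook reports cosine distance, which ranges from 0 to 2: the smaller the distance, the more similar the two texts.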

$$\text{Cosine distance} = 1 - \frac{v_1 \cdot v_2}{||v_1|| \, ||v_2||}$$

In [7]:
# Demo: distance from one brand name to a handful of candidate associations.
brand = "Apple"
targets = [
    "innovation",
    "creativity",
    "battery",
    "diesel engine",
]
distances = batch_word_distance(brand, targets)

# One line per target, in the same order as `targets`.
print(f"Distances from '{brand}':")
for position, target in enumerate(targets):
    dist = distances[position]
    print(f"  to '{target}': {dist:.4f}")

Distances from 'Apple':
  to 'innovation': 0.6278
  to 'creativity': 0.7024
  to 'battery': 0.5826
  to 'diesel engine': 0.8319


## Create a lexicon of words

In [8]:
import pandas as pd
import csv

In [9]:
# Currently disabled: load the candidate lexicon with metadata.
# Required columns: word, length, zipf, concreteness, valence, arousal, pos
# lex = pd.read_csv("lexicon_with_norms.csv")
# Basic filters:
# lex = lex.query("4 <= length <= 9 and 2.5 <= zipf <= 5").copy()
# lex = lex[~lex.word.str.contains(r"[^a-zA-Z-]")]


In [10]:
# SWOW-EN strength table (R123); the file is tab-separated despite its .csv extension.
SWOW_STRENGTH_PATH = "./lexicon/SWOW-EN18/strength.SWOW-EN.R123.20180827.csv"

# QUOTE_NONE disables quote handling, so quote characters in the data are read literally.
swow_read_options = dict(
    sep="\t",
    quoting=csv.QUOTE_NONE,
    escapechar=None,
    engine="python",
)
swow = pd.read_csv(SWOW_STRENGTH_PATH, **swow_read_options)

In [11]:
# Preview the loaded table (the rendered output abbreviates the middle rows with "...").
swow

Unnamed: 0,cue,response,R123,N,R123.Strength
0,a,one,32,270,0.118519
1,a,the,27,270,0.100000
2,a,an,19,270,0.070370
3,a,b,14,270,0.051852
4,a,single,9,270,0.033333
...,...,...,...,...,...
1389706,zucchini,thin,1,283,0.003534
1389707,zucchini,veggie,1,283,0.003534
1389708,zucchini,veggies,1,283,0.003534
1389709,zucchini,weapon,1,283,0.003534
