In [9]:
import pandas as pd
# import spacy
# from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import KeyedVectors

# Load pre-trained word vectors (Google News Word2Vec format)
# This will download ~1.5GB on first run
import gensim.downloader as api
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


  from tqdm.autonotebook import tqdm, trange


In [12]:
# Load the pilot word pairs; every similarity column below is added to this frame.
df = pd.read_csv(filepath_or_buffer="../exp_files/pilot.csv")

# Disabled alternative: build `df` by concatenating the pair files under
# ../exp_files/total/ instead of reading the pilot file.
# NOTE(review): the concat on the last line references `a`, which none of the
# reads below defines — add the missing load before re-enabling.
# a2 = pd.read_csv("../exp_files/total/exp-a2-pairs.csv")
# n = pd.read_csv("../exp_files/total/exp-n-pairs.csv")
# n2 = pd.read_csv("../exp_files/total/exp-n2-pairs.csv")
# s = pd.read_csv("../exp_files/total/exp-s-pairs.csv")
# v = pd.read_csv("../exp_files/total/exp-v-pairs.csv")
# v2 = pd.read_csv("../exp_files/total/exp-v2-pairs.csv")
# df = pd.concat([v,v2,n,n2,s,a,a2])

# Differences between Word2Vec and SentenceTransformer embeddings for similarity
Aspect	Word2Vec Google News (300d)	SentenceTransformer all-MiniLM-L6-v2 (384d)
Type of embedding	Word-level embedding	Sentence / phrase embedding (can also be single words)
Training data	Google News corpus (~100 billion tokens)	Large-scale datasets with sentences and semantic labels (e.g. NLI, STS)
Vector size	300 dimensions	384 dimensions
Context	Static embeddings: Each word has one vector regardless of context	Contextual embeddings (fine-tuned for sentence similarity)
Usage	Good for general word similarity and analogy	Excellent for sentence, phrase, and word similarity — captures semantics better
Similarity measure	Cosine similarity on static vectors	Cosine similarity on context-aware vectors
Out-of-vocabulary handling	Words not in vocab get no vector	Can embed any text string, including OOV words
Performance	Fast, lightweight, easy to use	Slightly heavier model, but more powerful and versatile
Meaning captured	Word co-occurrence patterns	Semantic meaning in context, better for polysemy and phrase meaning

In simpler terms:
Word2Vec gives you a single fixed vector per word, learned from word co-occurrence statistics in a huge news corpus. It doesn’t consider context — the vector for “bank” is the same whether it’s a river bank or a financial bank.

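SentenceTransformer embeddings are generated by a transformer model trained on tasks that require understanding sentence meaning. Even when the input is a single word, the embedding reflects semantic nuance learned from that sentence-level training, although an isolated word gives the model no surrounding context to disambiguate it. The model works best for phrases, sentences, or words that appear together with context.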

When to use which?
Task	Recommended model
Simple word similarity or analogies	Word2Vec Google News
Sentence or phrase similarity	SentenceTransformer (MiniLM or others)
Handling ambiguous or out-of-vocab words	SentenceTransformer
Downstream NLP tasks requiring semantic understanding	SentenceTransformer

# Get similarity with Word2Vec

In [18]:

# Load the pre-trained Google News Word2Vec vectors through gensim's downloader.
# Per the note in the imports cell, the first run downloads roughly 1.5GB.
model1 = api.load(name="word2vec-google-news-300")

In [19]:
def get_similarity(w1, w2):
    """Return the Word2Vec similarity of two words, or None if either is unknown.

    Both words are looked up in the module-level ``model1`` vectors. When both
    are present the result of ``model1.similarity(w1, w2)`` is returned
    unchanged; otherwise the function returns ``None``.

    NOTE: a later cell in this notebook defines another ``get_similarity``;
    once that cell runs, this Word2Vec version is no longer reachable by name.
    """
    has_missing_word = w1 not in model1 or w2 not in model1
    if has_missing_word:
        # No vector for at least one word (0 or np.nan would be alternatives).
        return None
    return model1.similarity(w1, w2)
def _w2v_row_similarity(row):
    """Score one row's (Word1, Word2) pair with the current ``get_similarity``."""
    return get_similarity(row["Word1"], row["Word2"])


# Row-wise pass over the frame; the result is stored as a new column.
df["W2VSimilarity"] = df.apply(_w2v_row_similarity, axis=1)

In [14]:
# Preview the first five rows, including the W2VSimilarity column.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Word1,Word2,FeatureCombo1,FeatureCombo2,FeatureMatch,W2VSimilarity
0,95,clean,burn,physical-positive,physical-negative,ConceptualMatchingOnly,
1,30,caress,rust,physical-positive,physical-negative,ConceptualMatchingOnly,
2,158,reassure,criticize,psychological-positive,psychological-negative,ConceptualMatchingOnly,
3,115,encourage,embarrass,psychological-positive,psychological-negative,ConceptualMatchingOnly,
4,69,sculpt,corrode,physical-positive,physical-negative,ConceptualMatchingOnly,


# Try with transformers

In [20]:
# Instantiate the "all-MiniLM-L6-v2" SentenceTransformer used for the
# embedding-based similarity below.
model2 = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")


In [21]:
# Similarity from SentenceTransformer embeddings
def get_similarity(w1, w2):
    """Return the cosine similarity between the ``model2`` embeddings of two texts.

    Each argument is encoded separately with ``model2.encode`` (``w1`` first,
    then ``w2``) and the two vectors are compared with ``cosine_similarity``.

    NOTE: this rebinds the name ``get_similarity``, replacing the Word2Vec
    helper defined in an earlier cell of this notebook.
    """
    first_vec, second_vec = (model2.encode(text) for text in (w1, w2))
    # Each vector is wrapped in a list to form a one-row batch; the result is
    # indexed [0][0] to pull out the single pairwise score.
    pairwise = cosine_similarity([first_vec], [second_vec])
    return pairwise[0][0]

def _embedding_row_similarity(row):
    """Score one row's (Word1, Word2) pair with the current ``get_similarity``."""
    return get_similarity(row["Word1"], row["Word2"])


# Row-wise pass over the frame; the result is stored as a new column.
df["CosineSimilarity"] = df.apply(_embedding_row_similarity, axis=1)


In [22]:
# Preview the first five rows with both similarity columns side by side.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Word1,Word2,FeatureCombo1,FeatureCombo2,FeatureMatch,W2VSimilarity,CosineSimilarity
0,95,clean,burn,physical-positive,physical-negative,ConceptualMatchingOnly,0.223062,0.407466
1,30,caress,rust,physical-positive,physical-negative,ConceptualMatchingOnly,0.125289,0.232726
2,158,reassure,criticize,psychological-positive,psychological-negative,ConceptualMatchingOnly,0.26765,0.317731
3,115,encourage,embarrass,psychological-positive,psychological-negative,ConceptualMatchingOnly,0.298619,0.245418
4,69,sculpt,corrode,physical-positive,physical-negative,ConceptualMatchingOnly,0.178691,0.186632


In [23]:
# Save the frame with both similarity columns.
# index=False keeps the positional index out of the file: written with the
# index, every save/reload cycle adds another "Unnamed: 0"-style column, and
# the loaded data already carries two of them ("Unnamed: 0.1", "Unnamed: 0").
df.to_csv("../exp_files/pilot_similarity.csv", index=False)