<a href="https://colab.research.google.com/github/nagasatvika/semantic-similarity/blob/main/Word_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("/content/drive/MyDrive/nlp/word.txt",delimiter="\t")

In [4]:
df

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93
...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75


#Using SimLex999

In [7]:
def check_sim(row):
  if row['SimLex999']>5:
    return "are similar"
  else:
    return "are not similar"
df['similarity'] = df.apply(check_sim,axis=1)
print(df['similarity'])

0      are not similar
1          are similar
2          are similar
3          are similar
4      are not similar
            ...       
994    are not similar
995    are not similar
996    are not similar
997    are not similar
998        are similar
Name: similarity, Length: 999, dtype: object


#Wordnet using Wu palmer method


In [8]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [10]:
w1 = df['word1'][:20]
w2 = df['word2'][:20]

def get_first_synset(word):
    synsets = wn.synsets(word)
    return synsets[0]

similarities = []

for word1, word2 in zip(w1, w2):
    syn1 = get_first_synset(word1)
    syn2 = get_first_synset(word2)
    similarity = syn1.wup_similarity(syn2)
    similarities.append((word1,word2,similarity))

results = pd.DataFrame(similarities,columns=['word1','word2',similarity])
print(results)



     word1        word2       0.2
0      old          new  0.222222
1    smart  intelligent  0.166667
2     hard    difficult  1.000000
3    happy     cheerful  0.500000
4     hard         easy  0.500000
5     fast        rapid  0.125000
6    happy         glad  0.142857
7    short         long  0.166667
8   stupid         dumb  0.200000
9    weird      strange  0.153846
10    wide       narrow  0.200000
11     bad        awful  0.250000
12    easy    difficult  0.500000
13     bad     terrible  0.250000
14    hard       simple  0.153846
15   smart         dumb  0.166667
16  insane        crazy  0.166667
17   happy          mad  0.500000
18   large         huge  0.222222
19    hard        tough  0.200000


#Spacy Similarity method

In [11]:
!pip install spacy



In [12]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
import spacy
nlp = spacy.load('en_core_web_md')

In [14]:
w1 = df['word1'][:20]
w1 = df['word2'][:20]

similarities = []

for word1,word2 in zip(w1,w2):
  word1 = word1.lower()
  word2 = word2.lower()
  W1 = nlp(word1)
  W2 = nlp(word2)
  Similarity = W1.similarity(W2)
  similarities.append((word1,word2,Similarity))
result = pd.DataFrame(similarities,columns=['word1','word2','Similarity'])
print(result)

          word1        word2  Similarity
0           new          new         1.0
1   intelligent  intelligent         1.0
2     difficult    difficult         1.0
3      cheerful     cheerful         1.0
4          easy         easy         1.0
5         rapid        rapid         1.0
6          glad         glad         1.0
7          long         long         1.0
8          dumb         dumb         1.0
9       strange      strange         1.0
10       narrow       narrow         1.0
11        awful        awful         1.0
12    difficult    difficult         1.0
13     terrible     terrible         1.0
14       simple       simple         1.0
15         dumb         dumb         1.0
16        crazy        crazy         1.0
17          mad          mad         1.0
18         huge         huge         1.0
19        tough        tough         1.0


#Hamming Distance


In [15]:
from scipy.spatial import distance
import numpy as np
w1 = df['word1'][:20]
w2 = df['word2'][:20]
similarities = []
for word1,word2 in zip(w1,w2):
  max_len = max(len(word1),len(word2))
  word1 = word1.ljust(max_len)
  word2 = word2.ljust(max_len)
  Normalized_HD = distance.hamming(list(word1),list(word2))
  hamming = Normalized_HD*max_len
  similarities.append((word1,word2,hamming))
result = pd.DataFrame(similarities,columns=['word1','word2','hamming'])
print(result)


          word1        word2  hamming
0           old          new      3.0
1   smart        intelligent     11.0
2     hard         difficult      9.0
3      happy        cheerful      8.0
4          hard         easy      3.0
5         fast         rapid      4.0
6         happy        glad       5.0
7         short        long       5.0
8        stupid       dumb        6.0
9       weird        strange      7.0
10       wide         narrow      6.0
11        bad          awful      5.0
12    easy         difficult      9.0
13     bad          terrible      8.0
14       hard         simple      6.0
15        smart        dumb       5.0
16       insane       crazy       6.0
17        happy        mad        4.0
18        large        huge       5.0
19        hard         tough      5.0


In [1]:
%pip install sentence-transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
w1 = df['word1'][:20]
w2 = df['word2'][:20]
embeddings1 = model.encode(w1, convert_to_tensor=True)
embeddings2 = model.encode(w2, convert_to_tensor=True)
cosine_score = util.cos_sim(embeddings1,embeddings2)
for i in range(20):
  cosine_score_value = cosine_score[i][i].item()
  print("{} \t\t {} \t\t Score:{:.4f}".format(w1[i], w2[i],cosine_score_value))

old 		 new 		 Score:0.5034
smart 		 intelligent 		 Score:0.7494
hard 		 difficult 		 Score:0.8246
happy 		 cheerful 		 Score:0.5541
hard 		 easy 		 Score:0.7239
fast 		 rapid 		 Score:0.7319
happy 		 glad 		 Score:0.5697
short 		 long 		 Score:0.6555
stupid 		 dumb 		 Score:0.7669
weird 		 strange 		 Score:0.8870
wide 		 narrow 		 Score:0.8235
bad 		 awful 		 Score:0.6512
easy 		 difficult 		 Score:0.7745
bad 		 terrible 		 Score:0.7928
hard 		 simple 		 Score:0.5171
smart 		 dumb 		 Score:0.6009
insane 		 crazy 		 Score:0.8316
happy 		 mad 		 Score:0.3313
large 		 huge 		 Score:0.8102
hard 		 tough 		 Score:0.7784
