# Word Similarity

Student Name: Muhammad Atif

Python version: 3.6

In [1]:
import nltk
import operator
import math
from nltk.corpus import brown
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.spatial.distance import cosine as cos_distance
from gensim.models import Word2Vec
from scipy.stats.stats import pearsonr

# Read the gold-standard similarities from 'combined.tab' into wordSimDict,
# mapping each (word1, word2) pair to its similarity score as a float.
with open('C:\\Users\\ma_at\\Desktop\\Web Search and Text Analysis - COMP90042\\Assignments\\Assignment 2\\combined.tab') as tabFile:
    next(tabFile)  # skip the first line of the file (not parsed as data)
    wordSimDict = {}
    for line in tabFile:
        fields = line.split('\t')
        # columns: first word, second word, similarity score
        wordSimDict[fields[0], fields[1]] = float(fields[2])
    
# Represent every Brown paragraph as the set of its lower-cased, lemmatized
# word types; using a set discards how often a word occurs in the paragraph.
lemmatizer = WordNetLemmatizer()
brownParas = [
    {
        lemmatizer.lemmatize(token.lower())
        for sentence in paragraph
        for token in sentence
    }
    for paragraph in brown.paras()
]

# Document frequency of each word type: the number of Brown paragraphs
# (sets of word types in brownParas) that contain it.
wordTypeDocFreqDict = {}
for paraTypes in brownParas:
    for wordType in paraTypes:
        if wordType in wordTypeDocFreqDict:
            wordTypeDocFreqDict[wordType] += 1
        else:
            wordTypeDocFreqDict[wordType] = 1

# Drop every word pair in which either word appears in fewer than minDocFreq
# Brown paragraphs (a word missing from wordTypeDocFreqDict counts as 0).
# The loop runs over a snapshot of the keys because entries are deleted
# from wordSimDict while iterating.
minDocFreq = 10
for word1, word2 in list(wordSimDict):
    # logical `or` (short-circuiting) rather than bitwise `|` on booleans
    if (wordTypeDocFreqDict.get(word1, 0) < minDocFreq
            or wordTypeDocFreqDict.get(word2, 0) < minDocFreq):
        del wordSimDict[word1, word2]

def _findNounPrimarySense(word):
    """Return the primary-sense synset of `word` when it is a noun, else None.

    A word has a noun primary sense when either
      * WordNet lists exactly one synset for it and that synset is a noun, or
      * among the lemmas named exactly `word`, the most frequent one has a
        count of at least 5, belongs to a noun synset and, when a second
        candidate exists, is at least 5 times as frequent as that runner-up.
    """
    candidates = wn.synsets(word)
    if not candidates:
        return None
    if len(candidates) == 1:
        onlySense = candidates[0]
        return onlySense if onlySense.pos() == 'n' else None

    # count of every lemma whose name matches the word, keyed by (lemma, synset)
    tally = {}
    for candidate in candidates:
        for lemma in candidate.lemmas():
            if lemma.name() == word:
                tally[lemma, candidate] = lemma.count()
    # most frequent first; the sort is stable, so ties keep WordNet's order
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    if not ranked:
        return None

    (_, topSynset), topCount = ranked[0]
    if topCount < 5 or topSynset.pos() != 'n':
        return None
    if len(ranked) > 1 and topCount < ranked[1][1] * 5:
        return None
    return topSynset


# Record the noun primary sense of every word in the remaining pairs, and
# remove a pair as soon as one of its words has no such sense.
primarySenseSynsetDict = {}
for pair in list(wordSimDict):
    for member in pair:
        sense = _findNounPrimarySense(member)
        if sense is None:
            wordSimDict.pop(pair, None)
            break
        primarySenseSynsetDict[member] = sense

print(wordSimDict.keys())

dict_keys([('professor', 'doctor'), ('stock', 'egg'), ('baby', 'mother'), ('car', 'automobile'), ('journey', 'voyage'), ('coast', 'shore'), ('brother', 'monk'), ('journey', 'car'), ('coast', 'hill'), ('monk', 'slave'), ('coast', 'forest'), ('psychology', 'doctor'), ('psychology', 'mind'), ('psychology', 'health'), ('psychology', 'science'), ('computer', 'laboratory'), ('canyon', 'landscape'), ('century', 'year'), ('doctor', 'personnel'), ('school', 'center'), ('word', 'similarity'), ('hotel', 'reservation'), ('type', 'kind'), ('equipment', 'maker'), ('luxury', 'car'), ('soap', 'opera'), ('planet', 'people')])


Now we will create several dictionaries with similarity scores for pairs of words in our test set. The first of these is the Wu-Palmer scores derived from the hypernym relationships in WordNet, which we will calculate using the primary sense for each word derived above.

In [2]:
# Wu-Palmer similarity for every remaining word pair, computed between the
# primary-sense synsets stored in primarySenseSynsetDict.
wuPalmerSimilarityDict = {
    (first, second): primarySenseSynsetDict[first].wup_similarity(
        primarySenseSynsetDict[second]
    )
    for first, second in wordSimDict
}

print(wuPalmerSimilarityDict)

{('professor', 'doctor'): 0.5, ('stock', 'egg'): 0.11764705882352941, ('baby', 'mother'): 0.5, ('car', 'automobile'): 1.0, ('journey', 'voyage'): 0.8571428571428571, ('coast', 'shore'): 0.9090909090909091, ('brother', 'monk'): 0.5714285714285714, ('journey', 'car'): 0.09523809523809523, ('coast', 'hill'): 0.6666666666666666, ('monk', 'slave'): 0.6666666666666666, ('coast', 'forest'): 0.16666666666666666, ('psychology', 'doctor'): 0.1111111111111111, ('psychology', 'mind'): 0.5714285714285714, ('psychology', 'health'): 0.21052631578947367, ('psychology', 'science'): 0.9411764705882353, ('computer', 'laboratory'): 0.35294117647058826, ('canyon', 'landscape'): 0.3333333333333333, ('century', 'year'): 0.8333333333333334, ('doctor', 'personnel'): 0.13333333333333333, ('school', 'center'): 0.13333333333333333, ('word', 'similarity'): 0.3333333333333333, ('hotel', 'reservation'): 0.375, ('type', 'kind'): 0.9473684210526315, ('equipment', 'maker'): 0.5, ('luxury', 'car'): 0.1111111111111111, (

Next, we will calculate Positive PMI (PPMI) for our word pairs using statistics derived from the Brown corpus, using the same setup as we used to calculate document frequency above: paragraphs as documents, lemmatized, lower-cased, and with term frequency information removed by conversion to Python sets. We will use the basic method for calculating PPMI, which is appropriate for any possible definition of co-occurrence (here, appearing in the same paragraph), but we will only calculate PPMI for the words in our test set. We will avoid building the entire co-occurrence matrix; instead, we will keep track of the sums for the probabilities as we go along.

In [3]:
# Positive PMI (PPMI) for every remaining word pair, where two words co-occur
# when they appear in the same Brown paragraph (an entry of brownParas).
#   PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) )
#   PPMI        = max(0, PMI), and 0 when the words never co-occur
pmiSimilarityDict = {}
totalParasCount = float(len(brownParas))
for word1, word2 in wordSimDict:
    wordCount1 = 0      # paragraphs containing word1
    wordCount2 = 0      # paragraphs containing word2
    bothWordCount = 0   # paragraphs containing both words
    for paras in brownParas:
        hasWord1 = word1 in paras
        hasWord2 = word2 in paras
        if hasWord1:
            wordCount1 += 1
        if hasWord2:
            wordCount2 += 1
        if hasWord1 and hasWord2:
            bothWordCount += 1

    if bothWordCount == 0:
        # No co-occurrence: log(0) is undefined, so PPMI is defined as 0.
        # This also covers a word that never occurs, which would otherwise
        # cause a division by zero below.
        ppmi = 0.0
    else:
        probCalc = (bothWordCount/totalParasCount)/((wordCount1/totalParasCount)*(wordCount2/totalParasCount))
        # clamp negative PMI (words co-occurring less often than chance) to 0
        ppmi = max(0.0, math.log(probCalc, 2))
    pmiSimilarityDict[word1, word2] = ppmi

print(pmiSimilarityDict)

{('professor', 'doctor'): 0.0, ('stock', 'egg'): 1.8174736272140593, ('baby', 'mother'): 3.1068514542000756, ('car', 'automobile'): 3.284928059255019, ('journey', 'voyage'): 0.0, ('coast', 'shore'): 4.630747773460183, ('brother', 'monk'): 2.8992677183777067, ('journey', 'car'): 0.0, ('coast', 'hill'): 1.2130606957673897, ('monk', 'slave'): 0.0, ('coast', 'forest'): 3.0505076829814297, ('psychology', 'doctor'): 3.5625762708186035, ('psychology', 'mind'): 2.7796743924855387, ('psychology', 'health'): 0.0, ('psychology', 'science'): 5.078497127110109, ('computer', 'laboratory'): 0.0, ('canyon', 'landscape'): 0.0, ('century', 'year'): 0.85521193298008, ('doctor', 'personnel'): 2.2186218696012423, ('school', 'center'): 0.744045575429721, ('word', 'similarity'): 0.0, ('hotel', 'reservation'): 2.891047211572738, ('type', 'kind'): 0.6500752376975433, ('equipment', 'maker'): 4.283313403192924, ('luxury', 'car'): 2.272328022475385, ('soap', 'opera'): 4.221195813265069, ('planet', 'people'): 0.40

Next, we will derive similarity scores using the LSA method, i.e. apply SVD and truncate to get a dense vector and then use cosine similarity between the two vectors for each word pair. We will be constructing a matrix where the (non-sparse) rows correspond to words in the vocabulary, and the (sparse) columns correspond to the texts where they appear. Again, we will use the Brown corpus, in the same format as with PMI and document frequency. After we have a matrix in the correct format, we will use TruncatedSVD in scikit-learn to produce dense vectors of length 500, and then use cosine similarity to produce similarities for our word pairs.

In [4]:
# bag-of-words implementation
def get_BOW(text):
    """Count the occurrences of each lower-cased word in `text`.

    text: an iterable of strings.
    Returns a dict mapping every lower-cased word to the number of times it
    was seen; an empty iterable gives an empty dict.
    """
    counts = {}
    for token in text:
        key = token.lower()
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts
    
# One bag-of-words dict per paragraph, built from the word-type sets in
# brownParas. Each paragraph is a set, so every count is 1 and the resulting
# matrix holds only 0's and 1's (word present in the paragraph or not).
texts = [get_BOW(paras) for paras in brownParas]

# create words-paragraph matrix: fit_transform gives paragraphs x words,
# the transpose puts words on the rows
vectorizer = DictVectorizer()
brownMatrix = vectorizer.fit_transform(texts).transpose()

# get dense vectors of length 500 using truncated SVD
# NOTE: no random_state is passed, so the vectors (and the similarities
# printed below) may differ slightly from run to run.
svd = TruncatedSVD(n_components=500)
brownMatrixSVD = svd.fit_transform(brownMatrix)

# create dictionary of wordpair/cosine-similarity mappings using LSA method for filtered word pairs
cosineSimilarityDict = {}
# vocabulary_ maps each word to its feature index, i.e. its row in
# brownMatrixSVD; a dict lookup avoids scanning the whole feature_names_
# list for every word
wordToRow = vectorizer.vocabulary_
for word1, word2 in wordSimDict:
    word1Index = wordToRow[word1]
    word2Index = wordToRow[word2]
    # cosine similarity = 1 - cosine distance
    cosSim = 1 - cos_distance(brownMatrixSVD[word1Index, :], brownMatrixSVD[word2Index, :])
    cosineSimilarityDict[word1, word2] = cosSim

print(cosineSimilarityDict)

{('professor', 'doctor'): 0.1282666695038288, ('stock', 'egg'): 0.14018079535015326, ('baby', 'mother'): 0.33791842030742014, ('car', 'automobile'): 0.33016543021476574, ('journey', 'voyage'): 0.11860079401341417, ('coast', 'shore'): 0.40707475890629174, ('brother', 'monk'): 0.11141221551167657, ('journey', 'car'): -0.017884878217367062, ('coast', 'hill'): 0.1665997471401639, ('monk', 'slave'): -0.049648664986159385, ('coast', 'forest'): 0.1051220289732262, ('psychology', 'doctor'): 0.1220134726939699, ('psychology', 'mind'): 0.11359551564838555, ('psychology', 'health'): 0.014612476126077745, ('psychology', 'science'): 0.259081350444586, ('computer', 'laboratory'): 0.14034774071438472, ('canyon', 'landscape'): 0.10381126884703273, ('century', 'year'): 0.06959542605469127, ('doctor', 'personnel'): 0.06324958418247362, ('school', 'center'): 0.04368174614910669, ('word', 'similarity'): 0.004895650526370865, ('hotel', 'reservation'): 0.06454546381060666, ('type', 'kind'): 0.02563141155434

Next, we will derive a similarity score from word2vec vectors, using the Gensim interface. Check the Gensim word2vec tutorial for details on the API: https://radimrehurek.com/gensim/models/word2vec.html. Again, we will use the Brown corpus for this, but for word2vec we don't need to worry about paragraphs and will train our model at the sentence level instead. Our vectors should have the same number of dimensions as LSA (500), and we will run for 50 iterations. This may take a while (several minutes). We will extract the similarities directly from the Gensim model.

In [5]:
# Train a word2vec model on the sentences of the Brown corpus (500-dimensional
# vectors, 50 iterations, words seen fewer than 5 times ignored) and read the
# similarity of every remaining word pair directly from the trained model.
brownSentences = nltk.corpus.brown.sents()
model = Word2Vec(brownSentences, min_count=5, size=500, iter=50)
word2vecSimilarityDict = {
    (first, second): model.wv.similarity(first, second)
    for first, second in wordSimDict
}

print(word2vecSimilarityDict)



{('professor', 'doctor'): 0.1029803229917908, ('stock', 'egg'): 0.1535156438485933, ('baby', 'mother'): 0.23415810120914327, ('car', 'automobile'): 0.17324409909345384, ('journey', 'voyage'): 0.4791619430328846, ('coast', 'shore'): 0.40639991421327903, ('brother', 'monk'): 0.04261563182237669, ('journey', 'car'): 0.19540108613671048, ('coast', 'hill'): 0.44374781062422103, ('monk', 'slave'): 0.009620027853437595, ('coast', 'forest'): 0.2879224341437675, ('psychology', 'doctor'): -0.027752614111583283, ('psychology', 'mind'): 0.05155980765556281, ('psychology', 'health'): 0.16711893567800892, ('psychology', 'science'): 0.31320292307617803, ('computer', 'laboratory'): 0.19272994938677682, ('canyon', 'landscape'): 0.16166988678899866, ('century', 'year'): 0.30780071132661774, ('doctor', 'personnel'): -0.05511129237329291, ('school', 'center'): -0.03064820064330967, ('word', 'similarity'): 0.039037391290843, ('hotel', 'reservation'): 0.05672515585654653, ('type', 'kind'): 0.261734319327021

Finally, we will compare all the similarities we've created to the gold standard we loaded and filtered in the first step. For this, we will use the Pearson correlation coefficient (`pearsonr`), which is included in scipy (`scipy.stats`). The data for the two datasets needs to be in the same order for correct comparison using correlation. We will write a general function and then apply it to each of the similarity score dictionaries.

In [6]:
# compare similarities with the gold standard using pearson correlation coefficient
# Gold-standard scores in wordSimDict iteration order (kept as a module-level
# name; pearsonCorrelation reads wordSimDict directly).
wordSimGoldStanardList = list(wordSimDict.values())
def pearsonCorrelation(wordSimilarityDict):
    """Return Pearson's r between the gold standard and a similarity dict.

    wordSimilarityDict: maps (word1, word2) pairs to a similarity score and
        must contain every pair that is a key of wordSimDict.

    The two score lists are paired by word-pair key rather than by the
    position of dict.values(), so the result does not depend on the two
    dictionaries having been filled in the same order.

    Raises KeyError if a gold-standard pair is missing from
    wordSimilarityDict.
    """
    wordPairs = list(wordSimDict)
    goldScores = [wordSimDict[pair] for pair in wordPairs]
    systemScores = [wordSimilarityDict[pair] for pair in wordPairs]
    # pearsonr returns (coefficient, p-value); only the coefficient is used
    return pearsonr(goldScores, systemScores)[0]

# Report the correlation of each similarity measure with the gold standard.
print('Pearson correlation coefficient compared with Gold Standard:')
print('------------------------------------------------------------')
for label, similarityDict in (
    ('Cosine: ', cosineSimilarityDict),
    ('PMI: ', pmiSimilarityDict),
    ('Wu-Palmer: ', wuPalmerSimilarityDict),
    ('Word2Vec: ', word2vecSimilarityDict),
):
    print(label, pearsonCorrelation(similarityDict))

Pearson correlation coefficient compared with Gold Standard:
------------------------------------------------------------
Cosine:  0.36048303868834
PMI:  0.21406331437316534
Wu-Palmer:  0.45669274063664006
Word2Vec:  0.3151240968418435


## A final word

Normally, we would not use a corpus as small as the Brown corpus for the purposes of building distributional word vectors. Also, note that filtering our test set to just words we are likely to do well on would typically be considered cheating.