# Vectorisation meets words
Examples of how word-vectors (word-embeddings) can be used to all sorts of things!

// By MGM

In [1]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

### Method for finding the Cosine Similarity
Cosine between two vectors.

Similarity comes between 1 and -1.


(1 if two words are very similar. 
-1 if two words are very dissimilar.)

In [2]:
def cosSim(vector1, vector2):
    
    cos_sim = dot(vector1,vector2)/(norm(vector1)*norm(vector2))
    
    return cos_sim

In [3]:
# Example
x1 = [1,2,3]
x2 = [1,2,5]
cosineSim = cosSim(x1,x2)

In [4]:
print("Cosine similarity:")
print(cosineSim)

Cosine similarity:
0.9759000729485332


## Word vectorisation
Load pre-trained word embedding. 

I use Danish Word2Vec Continuous Skipgram, with each word being represented by a vector of size 100. 

Pre-trained word embeddings can be found here: http://vectors.nlpl.eu/repository/

In [5]:
# Load pre-trained word embeddings
embeddings_index = {}
with open('word_Embedding_DK.txt','rb') as f:
    for line in f:
        try:
            values = line.split()
            word = values[0].decode('utf-8')
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        except UnicodeDecodeError: 
            next(f)
    f.close()
print('Found %s word vectors.' % len(embeddings_index))

vocabSize = len(embeddings_index)

Found 1655836 word vectors.


## Word vectors
Get a vector of a given word

In [6]:
def wordToVec(word):
    wordVector = embeddings_index[word.lower()]
    return wordVector

In [7]:
wordVector = wordToVec("rødgrød")
print(wordVector)

[ 0.090582 -0.002726 -0.233673 -0.242148  0.039417 -0.03588  -0.193565
  0.181421 -0.281298 -0.002239 -0.054147  0.770873 -1.02673  -0.196059
  0.204133 -0.082985 -0.183909  0.185923 -0.243544  0.414395 -0.23342
  0.558042 -0.388158 -0.141617 -0.714511 -0.353422  0.127615  0.474835
  0.308668  1.072249 -0.237765  0.263269 -0.419263  0.325531 -0.677848
  0.201058 -0.278782  0.230346  0.110748 -0.238589  0.314232  0.045913
 -0.557342 -0.336894 -0.519073  0.659179 -0.100333 -0.355933  0.147805
  0.139832 -0.349028 -0.104093 -0.23965  -0.105693 -0.041038  0.052655
 -0.156822 -0.060792 -0.679549 -0.72804   0.269587  0.523499  0.021881
  0.063605  0.346054  0.390464  0.022117 -0.566829  0.696968 -0.01182
 -0.900374 -0.660137 -0.150423 -0.058631 -0.381839  0.332458 -0.251802
 -0.587417 -0.673894 -0.576689  0.127594  0.695196 -0.433831 -0.739833
 -0.174181 -0.116684 -0.189647  0.229381  0.987568 -0.462757 -0.172011
  0.041496 -0.127187  0.106523  0.444665  0.05065   0.552203  0.047396
  0.4476

## Compare two words (vectors)

In [8]:
wordVector1 = wordToVec("måne")
wordVector2 = wordToVec("menneske")
similarity = cosSim(wordVector1, wordVector2)
print(similarity)

0.40724793


## Find most similar word (vector)

In [9]:
def findMostSim(wordInput):
    
    wordInputVector = wordToVec(wordInput)
    
    similarity = 0
    mostSimWord = "Nan"
    
    for word in embeddings_index:
        
        wordVectorDic = wordToVec(word)        
        
        if ((cosSim(wordInputVector,wordVectorDic) > similarity) & (wordInputVector != wordVectorDic).all()):
            similarity = cosSim(wordInputVector,wordVectorDic)
            mostSimWord = word
    
    return mostSimWord, similarity  

In [10]:
# Might take a couple of minutes
sim = findMostSim("smuk")
print(sim)

('vidunderlig', 0.79009515)


## Find top k most similar word
Method for finding the top k most similar word

In [11]:
def findTopKMostSim(wordInput,k):
    
    wordInputVector = wordToVec(wordInput)
    
    similarity = 0
    topKWords = []
    
    for word in embeddings_index:
        
        # Get vector of a given word
        wordVectorDic = wordToVec(word)
        # Output is a tuple with (word, similarity)
        similarity = cosSim(wordInputVector,wordVectorDic)
        
        # If topKWords is not full, append words to it until it becomes full.
        if (len(topKWords) < k):
            topKWords.append((word,similarity))
        
        # Otherwise start comparing similarity and switch/change most sim. words
        elif len(topKWords) == k:
            
            # Keep track of most dissimilar word. 
            minSimWord = 1
            for x,y in topKWords:
                if y < minSimWord:
                    minSimWord = y
            
            # If word is more similar than most dissimilar word in topKwords, swap the words.
            if similarity > minSimWord:
                
                indexMinSimWord = [x[1] for x in topKWords].index(minSimWord)
                del topKWords[indexMinSimWord]
                topKWords.append((word, similarity))
        
        else:
            print("Something is wrong")
    
    return topKWords  

In [12]:
mostSimWords = findTopKMostSim("computer",15)
for word in mostSimWords:
    print(word)

('computer', 1.0)
('pc', 0.855114)
('computeren', 0.8046989)
('pc.', 0.8696126)
("pc'en", 0.81041807)
('mobilenhed', 0.7986685)
('windows-computer', 0.8331629)
('internet-browser', 0.7977846)
('usb-forbindelse', 0.79685915)
('mediaplayer', 0.7961525)
('medieserver', 0.7973152)
('center-extender', 0.7960618)
('apple-enhed', 0.81449527)
('windows-pc.', 0.7993573)
('10-enhed', 0.79847455)
