In [None]:
import numpy as np
import pickle
import torch

# Render Matplotlib figures inline (as static images) in the notebook output
%matplotlib inline
import matplotlib.pyplot as plt
# Default size used for every figure created later in this notebook.
plt.rcParams['figure.figsize'] = [9.5, 6]

In [None]:
class Vocabulary(object):
    """Bidirectional mapping between tokens and consecutive integer indices.

    ``token2idx`` maps a token to its index and ``idx2token`` is the list of
    tokens ordered by index. The special tokens that are not ``None`` are
    registered first, in the order pad, unk, eos.

    Args:
        pad_token: padding token, or ``None`` to disable it.
        unk_token: token whose index is returned for unknown tokens, or
            ``None`` to disable the fallback.
        eos_token: end-of-sequence token, or ``None`` to disable it.
    """

    def __init__(self, pad_token='<pad>', unk_token='<unk>', eos_token='<eos>'):
        self.token2idx = {}
        self.idx2token = []
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.eos_token = eos_token
        # The three index attributes are always defined (None when the token is
        # disabled) so that reading them can never raise AttributeError.
        self.pad_index = self.add_token(pad_token) if pad_token is not None else None
        self.unk_index = self.add_token(unk_token) if unk_token is not None else None
        self.eos_index = self.add_token(eos_token) if eos_token is not None else None

    def add_token(self, token):
        """Register ``token`` if it is new and return its index."""
        if token not in self.token2idx:
            self.idx2token.append(token)
            self.token2idx[token] = len(self.idx2token) - 1
        return self.token2idx[token]

    def get_index(self, token):
        """Return the index of a token, or a list of indices for an iterable of tokens.

        Unknown tokens map to ``unk_index``.

        Raises:
            KeyError: if a token is unknown and the vocabulary has no unk token.
        """
        if isinstance(token, str):
            return self._lookup(token)
        return [self._lookup(t) for t in token]

    def _lookup(self, token):
        """Resolve one token, falling back to the unk index when there is one."""
        index = self.token2idx.get(token)
        if index is not None:
            return index
        if self.unk_index is None:
            raise KeyError(token)
        return self.unk_index

    def __len__(self):
        """Number of registered tokens, special tokens included."""
        return len(self.idx2token)

    def save(self, filename):
        """Pickle the instance attributes to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self, filename):
        """Overwrite the instance attributes with those pickled in ``filename``.

        NOTE: unpickling can execute arbitrary code; only load files that come
        from a trusted source.
        """
        with open(filename, 'rb') as f:
            self.__dict__.update(pickle.load(f))

We will use the vocabulary computed by the CBOW Preprocessing notebook (cbow-preprocessing) and the word vectors computed by the CBOW Training notebook (cbow-vectors).

In [None]:
# Dataset variant; it names the sub-directory used by both roots below and
# is also part of the checkpoint file name built later in the notebook.
DATASET_VERSION = 'ca-100'
# Directory from which the word-count file and the pickled vocabulary are read.
CBOW_VOCABULARY_ROOT = f'../input/cbow-preprocessing/data/{DATASET_VERSION}'
# Directory from which the trained model checkpoint is read.
CBOW_VECTORS_ROOT = f'../input/cbow-training-v-1/data/{DATASET_VERSION}'

In [None]:
# Pickled word-frequency table. The path variable is deliberately not called
# `dict`, which would shadow the builtin for the rest of the kernel session.
counter_path = f'{CBOW_VOCABULARY_ROOT}/ca.wiki.train.tokens.nopunct.dic'
# NOTE: unpickling can execute arbitrary code; only load trusted files.
# The context manager closes the file handle as soon as the table is read.
with open(counter_path, 'rb') as counter_file:
    counter = pickle.load(counter_file)
# Split the 5000 most common (word, count) pairs into two parallel tuples.
words, values = zip(*counter.most_common(5000))
print('Most frequent Catalan words')
print(words[:10])
print(values[:10])

**Zipf's law of words**. Zipf's law was originally formulated in terms of quantitative linguistics: given some corpus of natural language utterances, the frequency of any word is inversely proportional to its rank in the frequency table, i.e. $f(r) \propto 1/r$ for the word of rank $r$.

In [None]:
# Counts of the 50 most frequent words (green) against a 1/x reference curve
# (red) scaled so that both curves start at the same value.
observed_line, reference_line = plt.plot(values[:50], 'g', 2*values[0]/np.arange(2,52), 'r')
plt.legend([observed_line, reference_line],
           ['Observed count', 'Reference: 2 * max count / (position + 2)'])
plt.xlabel('Position in the frequency table (0 = most frequent word)')
plt.ylabel('Count')
plt.title('Counts of the 50 most frequent words')
plt.show()

In [None]:
# Plot against ranks 1..N. With the implicit x = 0..N-1 the most frequent word
# would sit at x = 0, which has no valid position on a logarithmic axis.
word_ranks = np.arange(1, len(values) + 1)
_ = plt.loglog(word_ranks, values)
plt.xlabel('Word rank')
plt.ylabel('Count')
plt.title('Word count versus rank (log-log)')
plt.show()

**Benford's law**, also called the Newcomb–Benford law, the law of anomalous numbers, or the first-digit law, is an observation about the frequency distribution of leading digits in many real-life sets of numerical data.

In [None]:
from collections import Counter
# Histogram of the leading decimal digit of the 5000 largest word counts.
benford = Counter(int(str(item[1])[0]) for item in counter.most_common(5000))
print(benford)
# `np.float` was removed from NumPy (1.24); the builtin `float` is the same dtype.
percentage = np.array(list(benford.values()), dtype=float)
percentage /= percentage.sum()
_ = plt.bar(list(benford.keys()), percentage*100)
plt.xlabel('Leading digit')
plt.ylabel('Share of words (%)')
plt.title('Leading digit of the word counts')
plt.show()

In [None]:
# Checkpoint file: <vectors root>/<dataset version>_c.pt
modelname = f'{CBOW_VECTORS_ROOT}/{DATASET_VERSION}_c.pt'
# map_location remaps the stored tensors to the CPU.
# NOTE(review): torch.load relies on pickle; only load checkpoints from a trusted source.
state_dict = torch.load(modelname, map_location=torch.device('cpu'))

In [None]:
# Show the entry names stored in the checkpoint; the next cell reads
# 'emb.weight' and 'lin.weight' from it.
state_dict.keys()

In [None]:
# Convert the two weight tensors of the checkpoint to NumPy arrays:
# 'emb.weight' becomes the input vectors, 'lin.weight' the output vectors.
input_word_vectors, output_word_vectors = (
    state_dict[weight_name].numpy() for weight_name in ('emb.weight', 'lin.weight')
)

In [None]:
# Create a default Vocabulary, then overwrite its attributes with the ones
# pickled in the vocabulary file.
token_vocab = Vocabulary()
token_vocab.load(f'{CBOW_VOCABULARY_ROOT}/ca.wiki.vocab')

In [None]:
class WordVectors:
    """Cosine-similarity queries over a matrix of word vectors.

    Args:
        vectors: 2-D array; row ``i`` is the vector of the token whose index
            in ``vocabulary.token2idx`` is ``i``.
        vocabulary: object exposing a ``token2idx`` mapping from token to row
            index.
    """

    def __init__(self, vectors, vocabulary):
        self.vocabulary = vocabulary
        self.vectors = vectors

    def most_similar(self, word, topn=10):
        """Return the ``topn`` vocabulary tokens closest to ``word``.

        Args:
            word: a token of the vocabulary, or directly a query vector.
            topn: maximum number of results.

        Returns:
            A list of ``(token, cosine_similarity)`` tuples sorted by
            decreasing similarity. Tokens whose similarity is undefined
            (zero-norm or non-finite vectors, e.g. an all-zero padding row)
            are left out, so the list may hold fewer than ``topn`` items; it
            is empty when the query itself has zero norm or ``topn <= 0``.

        Raises:
            KeyError: if ``word`` is a string that is not in the vocabulary.
        """
        if isinstance(word, str):
            query = self.get_word_embedding(word)
        else:
            query = word
        query = np.asarray(query)

        if topn <= 0:
            return []

        token2idx = self.vocabulary.token2idx
        tokens = list(token2idx)
        rows = np.fromiter((token2idx[t] for t in tokens), dtype=np.intp, count=len(tokens))
        candidates = np.asarray(self.vectors)[rows]

        # Cosine similarity for all tokens at once. A zero (or NaN) denominator
        # means the similarity is undefined; such tokens must not take part in
        # the ranking, otherwise NaN values would corrupt the comparison.
        denominators = np.linalg.norm(candidates, axis=1) * np.linalg.norm(query)
        valid = denominators > 0
        if not valid.any():
            return []

        similarities = np.full(len(tokens), -np.inf)
        similarities[valid] = (candidates[valid] @ query) / denominators[valid]

        # Stable sort: among equal similarities the earlier vocabulary entry wins.
        best = np.argsort(-similarities, kind='stable')[:topn]
        return [(tokens[i], float(similarities[i])) for i in best if valid[i]]

    def analogy(self, x1, x2, y1, topn=5, keep_all=False):
        """Solve "``x1`` is to ``x2`` as ``y1`` is to ?".

        The query vector is ``y1 + (x2 - x1)``.

        Args:
            x1, x2, y1: tokens of the vocabulary.
            topn: maximum number of results.
            keep_all: when False (default) the three input words are removed
                from the results.

        Returns:
            A list of ``(token, cosine_similarity)`` tuples sorted by
            decreasing similarity.

        Raises:
            KeyError: if any of the three words is not in the vocabulary.
        """
        x1_emb = self.get_word_embedding(x1)
        x2_emb = self.get_word_embedding(x2)
        y1_emb = self.get_word_embedding(y1)
        if topn <= 0:
            return []
        analogy_emb = y1_emb + (x2_emb - x1_emb)

        # Ask for three extra candidates because up to three results (the
        # input words) may be filtered out below.
        analogies = self.most_similar(analogy_emb, topn + 3)
        if not keep_all:
            analogies = [(k, v) for k, v in analogies if k not in (x1, x2, y1)]
        return analogies[:topn]

    def get_word_embedding(self, word):
        """Return the vector of ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        word_idx = self.vocabulary.token2idx[word]
        return self.vectors[word_idx]
    

In [None]:
# model1 queries the vectors taken from 'emb.weight', model2 those taken from
# 'lin.weight'; both share the same vocabulary.
model1 = WordVectors(input_word_vectors, token_vocab)
model2 = WordVectors(output_word_vectors, token_vocab)

In [None]:
# Keep only the tokens of the six nearest neighbours, dropping the similarities.
[token for token, _ in model1.most_similar('Joan', topn=6)]

In [None]:
# "França" is to "francès" as "Polònia" is to ...?
# The original call ended in `topn=)`, a syntax error; 5 is the method's default.
model1.analogy('França', 'francès', 'Polònia', topn=5)

In [None]:
# Nearest neighbours of 'Joan' with their similarities (default topn=10).
model1.most_similar('Joan')

In [None]:
# The 100 nearest neighbours of 'lleidatà' in the input vectors.
model1.most_similar('lleidatà', topn=100)

In [None]:
# The 20 nearest neighbours of 'feminisme' in the output vectors.
model2.most_similar('feminisme', topn=20)

In [None]:
# Nearest neighbours of 'justícia' in the input vectors (default topn=10).
model1.most_similar('justícia')