In [8]:
import sys
if "../" not in sys.path: sys.path.insert(0,"../");

## Data handling thingy

In [23]:
import torch; torch.__version__

'1.9.0+cpu'

In [24]:
!which python

/home/iam-kevin/Projects/ml/packages/mary/venv/bin/python


## Structures

In [11]:
class token(str):
    def __init__(self, inp_: str, *args, **kwargs):
        super().__init__()
        self.o = inp_
        
    def __repr__(self):
        return f"t'{self.o}'"
    
class word(token):
    def __repr__(self):
        return f"w'{self.o}'"
    
class morph(token):
    def __repr__(self):
        return f"m'{self.o}'"

In [12]:
token("something"), word("lo"), morph('lo')

(t'something', w'lo', m'lo')

`Vocab` section. Building the vocabulary structure 

In [13]:
# from collections import OrderedDict, defaultdict
from typing import List, Optional, Union, Iterable

# Vocabulary special for words
# -------------------
class Vocab(object):
    def __init__(self, tokens: Iterable[token]):
        self._tokens = list(tokens)
        
        # MAYBE: this might be a problem
        self._tokens.sort()

    def has(self, token: token):
        return token in self._tokens
    
    def get_tokens(self):
        return self._tokens

    def __iter__(self):
        return iter(self._tokens)
    
    def __len__(self):
        return len(self._tokens)
    
    def extra_repr(self):
        if len(self) > 4:
            return ", ".join(self._tokens[:2]) + "..., " + self._tokens[-1]
        
        return ", ".join(self._token)
    
    def __repr__(self):
        return "Vocab([%s], len=%d)" % (self.extra_repr(), len(self))
    
    @property
    def size(self):
        return len(self)
    
    @classmethod
    def from_file(cls, file_path):
        pass

In [14]:
from typing import List, Optional, Union, Any
from collections import defaultdict

class Encoder(object):
    def __init__(self, items: Iterable[str]):
        items = tuple(items)
        self._encode_map = dict(zip(items, range(len(items))))
        self._decode_map = { v: k for k, v in self._encode_map.items()}
    
    def encode(self, item: str):
        if item not in self._encode_map:
            raise KeyError("Item '%s' is not in items" % item)
            
        return self._encode_map[item]
    
    def decode(self, ix: int):
        if ix >= len(self):
            raise ValueError("Index %d is missing" % (ix))

        return self._decode_map[ix]
        
    def __len__(self):
        return len(self._encode_map)
    
OOV_TOKEN = token('<UNK>')

class TokenEncoder(Encoder):
    def __init__(self, vocab: Vocab):
        super().__init__(vocab.get_tokens())
#         self.oov = {} if oov is None else {it:len(self.items)+i for i,it in enumerate(oov) if it not in self.items}
#         self.ix = len(self.oov)
#         self.encode_unk = encode_unk
    def encode(self, item: token):# , enc_unk=True):
        try:
            return super().encode(item)
        except KeyError:
            return -1
#         item = item[0] if isinstance(item, list) else item

#         if not enc_unk or item in self.items:
#             return int(self.items.index(item)) if item in self.items else int(len(self.items)) 

#         if item in self.oov.keys():
#             return self.oov[item]
        
#         if self.encode_unk:
#             self.oov.update({item:int(len(self.items))+self.ix})
#             self.ix+=1

#             return self.oov[item]
        
#         return int(len(self.items))

#     def encode_oov(self, oov):
#         self.oov.update({it:len(self.items)+i for i,it in enumerate(oov) if it not in self.items})


    def decode(self, ix):
        try:
            return super().decode(ix)
        except ValueError:
            return OOV_TOKEN
#         if ix<len(self.items):
#             return self.items[int(ix)]
        
#         return [(k,v) for k,v in self.oov.items() if v==ix][0][0]

## Data (Pytorch)

In [15]:
import numpy as np

from torch.utils.data import Dataset
from typing import List, Iterable

class MorphemeDataset(Dataset):
    def __init__(self, word_morph_list: Iterable[List[str]]):
        self.wml = list(word_morph_list)
        
    def __getitem__(self, index: int):
        return np.array(self.wml[index])
    
    def __len__(self):
        return len(self.wml)

## Modeling

#### Embeddings

In [16]:
import numpy as np

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence


class SEDWordEmbeddingLayer(nn.Module):
    def __init__(self, 
                 token_vocab_size: int, 
                 embedding_dim: int, 
                 hidden_dim: int, 
                 dropout: int = 0.4, 
                 num_attn_layers=None, 
                 d_ff=None, 
                 hidden=None, 
                 out_c=None,
                 kernel=None,
                 padding_idx: int = 0,
                 composition_fn='rnn'):
        super(SEDWordEmbeddingLayer, self).__init__()
        
        self.comp_fn = composition_fn
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.emb_mod = nn.Embedding(token_vocab_size, embedding_dim, padding_idx=padding_idx)
        self.compose, self.comp_linear = self.select_comp(dropout, num_attn_layers, d_ff, hidden, out_c, kernel)
    
    def select_comp(self, dropout, num_attn_layers, d_ff, hidden, out_c, kernel):
        """Helper to select composition function"""
        compose = None
        comp_linear = None
        
        if self.comp_fn == 'rnn':
            compose = nn.GRU(input_size=self.embedding_dim, hidden_size=self.hidden_dim, num_layers=1, batch_first=True, bidirectional=True)
            comp_linear = nn.Sequential(
                nn.Linear(self.hidden_dim*2, self.embedding_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            )
            
        return compose, comp_linear

    def padded_sum(self, data, input_len = None, dim=0):
        """
        summing over padded sequence data
        Args:
            data: of dim (batch, seq_len, hidden_size)
            input_lens: Optional long tensor of dim (batch,) that represents the
                original lengths without padding. Tokens past these lengths will not
                be included in the sum.

        Returns:
            Tensor (batch, hidden_size)

        """
        if input_len is not None:
            return torch.stack([
                torch.sum(data[:, :input_len, :], dim=dim)
            ])
        else:
            return torch.stack([torch.sum(data, dim=dim)])

    def rnn_compose(self, emb_in):
        """
        RNN composition of morpheme vectors into word embeddings
        """
                           
        return self.compose(emb_in)[0]
    
    def get_composition(self, emb_in, in_len, dim):
        """
        Helper function to get word embeddings from morpheme vectors. Uses additive function by default
        if composition function is not specified
        """
        if self.comp_fn == 'rnn':
            if len(in_len)>1 or (len(in_len)==1 and in_len[0] is not None):

                emb_in = pack_padded_sequence(emb_in, in_len, batch_first=True, enforce_sorted=False)
                rnn_out,_ = pad_packed_sequence(self.rnn_compose(emb_in), batch_first=True)

            else:
                rnn_out = self.rnn_compose(emb_in)[0].unsqueeze(0)

            # if self.return_array:

            #     return torch.mean(self.comp_linear(rnn_out),1).clone().detach().cpu().numpy()

            return torch.mean(self.comp_linear(rnn_out),1).squeeze(0).clone().detach().cpu()

        return self.padded_sum(emb_in, in_len, dim=dim)
    
    
    def check_batched(self, inputs):
        """
        check whether data is passed in batches(for models like the language and sentiment analysis)
        or as single inputs and get embeddings accordingly.

        Args:
            inputs - collection containing the label-encoded words as well as their corresponding original
                     lengths if were padded

        Returns:
            tensor of the vector representation of the input sequence
        """
        emb_in = [self.forward(x_in, in_len) for x_in, in_len in zip(inputs[0], inputs[1])]

        if self.comp_fn is not None:
            emb_len = [torch.as_tensor(len(emb)) for emb in emb_in]
            pad_in = pad_sequence(emb_in, batch_first=True)
            packed_out = pack_padded_sequence(pad_in, emb_len, batch_first=True, enforce_sorted=False)
            
            return packed_out
        
        return torch.cat(emb_in, dim=0)

    def forward(self, x_in, in_len, dim=1):
        """
        Get embeddings from morpheme vectors after passing label-encoded vectors through the embedding layer
        Args:
            x_in   - label-encoded vector inputs
            in_len - original lengths of vectors if were padded, else is None

        Returns:
            vector representation (embeddings) of the text sequence 
        """  
#         x_in, in_len = self.check_input(x_in, in_len)
        emb_in = torch.cat([torch.stack([self.emb_mod(x)]) for x in x_in])
        
        return self.get_composition(emb_in, in_len, dim)

# Handy functions
# ---------------------------------------


def morpheme_pad_collate(batch):
    x, x_len = [], []
    
    x_len = [m.shape[0] for m in batch]
    max_len = max(x_len)
    
    x_batch_ = [np.pad(t, (0, max_len - t.shape[0])) if t.shape[0] < max_len else t for t in batch]
    xx = torch.stack([torch.from_numpy(x.reshape(1,-1)).long() for x in x_batch_])

    return xx, torch.as_tensor(x_len).unsqueeze(dim=1)

def build(token_vocab_size, embedding_dim, hidden_dim):
    pass

# ---------------------------------------


class SEDWordEmbeddings(nn.Module):
    def __init__(self, 
                 morpheme_vocab_size: int, 
                 embedding_dim: int, 
                 hidden_dim: int,
                 use_cuda: bool = True
                ):
        super(SEDWordEmbeddings, self).__init__()
        self.swe_layer = SEDWordEmbeddingLayer(morpheme_vocab_size, embedding_dim, hidden_dim)
        self.swe_layer.eval() # freeze
        
        self.embedding_dim = embedding_dim
        
        # device to train
        self.device = torch.device("cuda:0" if use_cuda else "cpu")
        
    def embed(self, t_morphs_of_word: torch.Tensor):
        return self.swe_layer(t_morphs_of_word, [None])

    def embed_morphemes(self, morphemes_word_list: Iterable[List[int]]) -> torch.Tensor: 
#         str_enc = torch.as_tensor([0])
        
#         tok, tok_len, str_enc = lm_pad_collate([(morpheme_indices, str_enc[0])])
#         tok, tok_len = lm_pad_collate([morphemes_word_list])
        
        tok, tok_len = morphemes_word_list
       
        return pad_packed_sequence(
            self.swe_layer.check_batched(
                (tok, tok_len)
            ), batch_first=True)[0].squeeze().clone().detach().cpu().numpy()
    
    def fit(self, dataset: MorphemeDataset, batch_size: int = 256):
        dl = DataLoader(dataset, batch_size=batch_size, collate_fn=morpheme_pad_collate)
        
        _tok_emb = [self.embed_morphemes(d) for d in tqdm(dl)]
        
        # returns the embeddings
        embeddings = np.concatenate(_tok_emb)
        return embeddings

#         self.embeddings = embeddings
#         self.compose_embeddings.eval()

#         if emb_store_path is not None or weight_store_path is not None:
#             self.save_embeddings(weight_store_path, emb_store_path)
            

#     def save_to_path(self, embed_path: str):
#         pass
    
#     @classmethod
#     def load_from(self, embed_path: str):
#         pass


#### Models

In [17]:
from typing import Tuple
class SEDLookupLayer(nn.Module):
    def __init__(self, word_count: int, embedding_dim: int, comp_fn: str = None, rnn_dim: int = 32):
        super(SEDLookupLayer, self).__init__()
        self.comp_fn = comp_fn
        self.birnn = nn.GRU(
            input_size=embedding_dim, 
            hidden_size=rnn_dim, 
            num_layers=1, 
            batch_first=True,
            bidirectional=True
        )
        self.linear = nn.Sequential(
            nn.Linear(rnn_dim*2, embedding_dim), 
            nn.ReLU(),
            nn.Dropout(0.2),
            )
        self.classifier = nn.Linear(embedding_dim, word_count)

    def forward(self, emb_in):    
#         output, _ = self.birnn(emb_in) if self.comp_fn is None else pad_packed_sequence(self.birnn(emb_in)[0], batch_first=True)
        output, _ = self.birnn(emb_in)
        output = self.linear(torch.mean(output, 1))  
        
        return self.classifier(output)
        
class SEDLanguageModel(nn.Module):
    def __init__(self, 
                 sed_word_embeddings_layer: SEDWordEmbeddingLayer,
                 word_count: int, 
                 rnn_dim: int = 32):
        super(SEDLanguageModel, self).__init__()
        self.swe_layer = sed_word_embeddings_layer
        self.look_up = SEDLookupLayer(word_count, self.swe_layer.embedding_dim, self.swe_layer.comp_fn, rnn_dim)
#         self.birnn = nn.GRU(
#             input_size=self.swe.embedding_dim, 
#             hidden_size=rnn_dim, 
#             num_layers=1, 
#             batch_first=True,
#             bidirectional=True
#         )
#         self.linear = nn.Sequential(
#             nn.Linear(rnn_dim*2, self.swe.embedding_dim), 
#             nn.ReLU(),
#             nn.Dropout(0.2),
#             )
#         self.classifier = nn.Linear(self.swe.embedding_dim, word_count)

    def forward(self, inputs: List[Tuple[int]], input_len: List[int]):
        emb_in = self.swe_layer(inputs, input_len)
        
        return self.look_up(emb_in.unsqueeze(dim=0))

#         output,_ = self.birnn(emb_in) if self.swe.comp_fn is None else pad_packed_sequence(self.birnn(emb_in)[0], batch_first=True)
#         output = self.linear(torch.mean(output, 1))  
        
#         return self.classifier(output)

### Playground

## Building it all together

...

In [18]:
# Initializing data source

from typing import List
from pathlib import Path

# Data
resources_folder = Path("../resources")
sample_file = resources_folder.joinpath("./train.txt")

In [20]:
from experimental.sed import MorphologyAnalyzer
# from marynlp import funcutils as f
from typing import List, Tuple

class WordBreaker(object):
    def __init__(self, ma: MorphologyAnalyzer):
        self.ma_ = ma
        
    def break_word(self, word: word) -> Tuple[morph]:
        return tuple([ morph(su) for su in ma.break_text([word])[word]])
        
morpheme_path = resources_folder.joinpath("./build/df_morphs.txt")
ma = MorphologyAnalyzer(morph_path=morpheme_path)

wb = WordBreaker(ma)

# Sanity check
wb.break_word("vita")

FileNotFoundError: [Errno 2] No such file or directory: '../resources/build/df_morphs.txt'

In [21]:
from marynlp import funcutils as f
from tqdm.notebook import tqdm

from typing import List, Union
import os
import re

def split_by_space(text: str) -> List[str]:
    """Split a text to word strings
    
    Example
    "Lorem ipsum" -> [ 'Lorem', 'ipsum' ] 
    """
    return re.split(r"\s+", text)

def read_file(file_path) -> List[str]:
    with open(file_path, "r", encoding="utf-8") as rb:
        return rb.readlines()

@f.forEach(word, type_=set)
@f.filterBy(lambda s: len(s.strip()) > 0)
@f.flowBy(split_by_space)
@f.forEach(lambda s: s.strip())
def get_unique_words_from_file(file_path: os.PathLike):
    return tqdm(read_file(file_path))

unique_words = get_unique_words_from_file(sample_file)

FileNotFoundError: [Errno 2] No such file or directory: '../resources/train.txt'

In [22]:
from typing import Iterable
from tqdm.notebook import tqdm

def get_unique_morphemes(unique_words: Iterable[word], wb: WordBreaker) -> List[List[morph]]:
    return set([m for morp in [wb.break_word(word) for word in tqdm(unique_words)] for m in morp])

unique_morphemes = get_unique_morphemes(unique_words, wb)

NameError: name 'unique_words' is not defined

In [21]:
# Create vocabularies for words and morphemes
# -------

word_vocab = Vocab(unique_words)
morpheme_vocab = Vocab(unique_morphemes)

In [22]:
# Sanity check
word_vocab.has("alikuja"), word_vocab.has("asipokuja")

(True, False)

In [23]:
word_encoder = TokenEncoder(word_vocab)
morpheme_encoder = TokenEncoder(morpheme_vocab)

In [24]:
# Sanity check
word_encoder.encode('alikuja'), word_encoder.encode('asipokuja')

(974, -1)

In [25]:
# SAnity check
morpheme_encoder.encode('ali'), morpheme_encoder.encode('zzz')

(157, -1)

In [26]:
from typing import Iterable
from marynlp import funcutils as f

# @f.apply(MorphemeDataset)
def get_dataset_from_words(unique_words: Iterable[word], wb: WordBreaker) -> MorphemeDataset:
    return ([ wb.break_word(word) for word in tqdm(unique_words)])

morphs_word_list = get_dataset_from_words(word_vocab.get_tokens(), wb)

  0%|          | 0/28244 [00:00<?, ?it/s]

In [27]:
from torch.utils.data import DataLoader
md = MorphemeDataset([tuple([morpheme_encoder.encode(m) for m in morph]) for morph in morphs_word_list ])
dl = DataLoader(md, batch_size=256, collate_fn=morpheme_pad_collate)

# index = 6

for index in [6, 19, 8]:
    t = md[index]
    word= morphs_word_list[index]
    print(word, t, t.shape)
# max_len = 5
# np.pad(t, (0, max_len-t.shape[0]))

(m'aa', m'mbi', m'wa') [   4 2608 4761] (3,)
(m'a', m'b', m'b', m'a', m's') [   3  387  387    3 3883] (5,)
(m'a', m'ae') [ 3 47] (2,)


In [154]:
look_up = SEDLookupLayer(word_vocab.size, embedding_dim=100, comp_fn=swe.swe_layer.comp_fn)

In [36]:
def lm_pad_collate(batch):
    x, x_len = [], []
    
    x_len = [m.shape[0] for m in batch]
    max_len = max(x_len)
    
    x_batch_ = [np.pad(t, (0, max_len - t.shape[0])) if t.shape[0] < max_len else t for t in batch]
    xx = torch.stack([torch.from_numpy(x).long() for x in x_batch_])

    return xx, torch.as_tensor(x_len)

In [58]:
swe = SEDWordEmbeddings(
    morpheme_vocab.size,
    embedding_dim=100,
    hidden_dim=32
)
swl = SEDLanguageModel(swe.swe_layer, word_count=word_vocab.size)

In [59]:
# Sanity fitting
# --------------------------
# embeddings = swe.fit(md)
# embeddings.shape

In [63]:
from marynlp import funcutils as f
import numpy as np

# deal with word
sentence = "walimu wanasema tusome"

fn_get_tokenize = f.forEach(morpheme_encoder.encode)(wb.break_word)

@f.forEach(fn_get_tokenize)
def split_by_space(text: str):
    return text.split(" ")

out = wb.break_word("ataenda")
array_split_by_space = f.forEach(np.array)(split_by_space)

x, xl = lm_pad_collate(array_split_by_space(sentence))
x, xl

cc = swl(x, xl)

# emb_in = swe.swe_layer(x, xl)
# o, _ = look_up.birnn(emb_in.unsqueeze(dim=0))
# lo = look_up.linear(torch.mean(o, 1))


# cc = look_up.classifier(lo)
print(cc)
am = torch.argmax(cc)
word_encoder.decode(am.item())



tensor([[-0.1330,  0.0159, -0.0515,  ...,  0.0107,  0.0908, -0.0821]],
       grad_fn=<AddmmBackward>)


w'wanaoomba'

In [71]:
import re
from typing import Iterable, List
from collections.abc import Callable

class LMDataset(Dataset):
    def __init__(self, 
                 list_of_sentences: List[str],
                 word_tokenize: Callable):
        self.ls = list_of_sentences
        self.word_tokenize = word_tokenize

    def split_words(self, sentence: str) -> Iterable[str]:
        return re.split(r"\s+", sentence)

    def __len__(self):
        return len(self.ls)
    
    def __getitem__(self, index: int):
        return [self.word_tokenize(w) for w in self.split_words(self.ls[index])]

    
fn_get_tokenize = f.forEach(morpheme_encoder.encode)(wb.break_word)
lm_dataset = LMDataset(["walimu wanasema tusome", "njoo shule uimbe"], fn_get_tokenize)
lm_dataset[1]

train_dataloader = DataLoader(lm_dataset, batch_size=3, collate_fn=)

[[3158, 3345], [1354, 2313], [4497, -1]]

In [None]:
# set to train state
swl.train()
        
start_time = time()
data_len = len(self.train_loader.dataset)

for batch_idx, _data in enumerate(self.train_loader):
    x, x_len, y = _data 

    y = [y_.to(ModelsConfig.device) for y_ in [y]]
    y = [y_.reshape(y_.shape[0],) for y_ in y]

    self.optimizer.zero_grad()

    output = self.model((x, x_len))
    loss = self.criterion(output, y[0])
    loss.backward()

    self.optimizer.step()
    if batch_idx % verbose == 0 or batch_idx == data_len:
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tPerplexity: {:5.2f}\telapsed: {:.2f} mins'.format(
            epoch, batch_idx * len(x), data_len,
            100. * batch_idx / len(self.train_loader), loss.item(), np.exp(loss.item()), (time()-start_time)/60))#, loss2.item()))\tap_Loss: {:.6f}
                

## Testing for word relations

In [164]:
from sklearn.metrics.pairwise import cosine_similarity

class WordRelations(object):
    def __init__(self, 
                 swe: SEDWordEmbeddings, 
                 embeddings: np.ndarray,
                 word_encoder: TokenEncoder,
                 morpheme_encoder: TokenEncoder,
                 word_breaker: WordBreaker
                ):
        self.swe = swe
        self.word_encoder = word_encoder
        self.morpheme_encoder = morpheme_encoder
        self.wb = word_breaker
        self.embeddings = embeddings
        
    def get_word_embedding(self, word: str) -> torch.Tensor:
        morphs_of_word = [ self.morpheme_encoder.encode(m) for m in self.wb.break_word(word)]
        tmw = torch.tensor([morphs_of_word])

        return self.swe.embed(tmw)

    def get_most_similar(self, string: str, sim_dict: int, threshold: int):
        """
        get most similar word(s) from collection of related words using the cosine similarity measure

        Args:
            string    - string whose most siilar words are to be obtained
            sim_dict  - dictionary of similar words
            threshold - minimum cosine similarity value for words to be considered most similar to string
                        if None then only word with highest cosine similarity is returned
        
        Returns:
            collection of most similar words as determined by their cosine similarity to the string being considered
        """

        cos_sim = [sim[1] for sim in sim_dict[string]]
        max_sim = max(cos_sim)

        if threshold is not None:
            assert max_sim>=threshold, 'threshold set too high, no similar words found'

            return [v for v in sim_dict[string] if v[1]>=threshold]

        return [v for v in sim_dict[string] if v[1]==max_sim]

    def get_similar_words(self, string, k_dim=0, threshold=None):
        """
        get collection of closely related words usnig the cosine similarity of their embedding vectors

        Args:
            string          - string whose related words are to be obtained
            embeddings_dict - dictionary of word embeddings. If embedder already trained uses existing embeddings.
            threshold       - minimum cosine similarity for word to be considered similar to given word

        Returns:
            dictionary of similar words and their similarity as measure by the cosine similarity between their embedding vectors
            and that of the string
        """
#         self.check_embeddings()
        val = self.get_word_embedding(string)
        
        sim_dict = {}
        sim_dict[string] = [(txt, cosine_similarity(val.reshape(1,-1), vec.reshape(1,-1)).reshape(1)[0]) for txt,vec in enumerate(self.embeddings) if txt!=string or not (vec==val).all()]
        
        most_similar = self.get_most_similar(string, sim_dict, threshold)
        sim_dict[string] = sorted(most_similar,key=lambda x: x[1], reverse=True)

        return sim_dict

    def get_best_analogy(self, sim_list, string_b, return_cos_similarity):
        """
        get most relevant analogy from collection of analogous words. uses cosine similarity measure to determine 
        the best analogy

        Args:
            sim_list - list of words similar to the given word
            string_b - word whose analogy is to be determined
            return_cosine_similarity - whether or not output should include the analogy's cosine similarity

        Returns:
            analogy of the given word
        """
        sorted_sim = sorted([sim for sim in sim_list if sim[1]>0], key=lambda x:x[1], reverse=True)
        max_sim = sorted([sim for sim in sim_list if sim[1]>0], key=lambda x:x[1], reverse=True)[0][0]
        
        if not return_cos_similarity:
            sorted_sim = [sim[0] for sim in sorted_sim]

        if max_sim == self.word_encoder.encode(string_b):
            return self.word_encoder.decode(sorted_sim[1])
        
        return self.word_encoder.decode(sorted_sim[0])

    def _3_cos_add(self, a, _a, b, string_b, k_dim, return_cos_similarity):
        """
        determine the analogy of the given word based on an additive function of cosine similarities

        Args:
            a,_a     - vector representation of the example of a word and its corresponding analogy
            b        - vecor representation of the string whose analogy is to be determined
            string_b - string whose analogy is to be determined

        Returns:
            analogy of the string based on given example and determined using cosine similarity
        """
        _b = b - a + _a

        sim_list = [(txt, cosine_similarity(vec.reshape(1,-1),_b).reshape(1)[0]) for txt,vec in enumerate(self.embeddings)]
  
        return self.get_best_analogy(sim_list, string_b, return_cos_similarity)

    def _3_cos_mul(self, a, _a, b, string_b, k_dim, return_cos_similarity, eps=0.001):
        """
        determine the analogy of the given word based on a multiplicative function of cosine similarities

        Args:
            a,_a     - vector representation of the example of a word and its corresponding analogy
            b        - vecor representation of the string whose analogy is to be determined
            string_b - string whose analogy is to be determined

        Returns:
            analogy of the string based on given example and determined using cosine similarity
        """
        
        sim_list = [(txt, (cosine_similarity(vec.reshape(1,-1),b).reshape(1)[0]*cosine_similarity(vec.reshape(1,-1),_a).reshape(1)[0])/(cosine_similarity(vec.reshape(1,-1),a).reshape(1)[0]+eps))\
                    for txt,vec in enumerate(self.embeddings)]
        return self.get_best_analogy(sim_list, string_b, return_cos_similarity)

    def pair_direction(self, a, _a, b, string_b, k_dim, return_cos_similarity):
        """
        determine the analogy of the given word based on an additive function of cosine similarities that maintains
        the ...

        Args:
            a,_a     - vector representation of the example of a word and its corresponding analogy
            b        - vecor representation of the string whose analogy is to be determined
            string_b - string whose analogy is to be determined

        Returns:
            analogy of given string based on given example and determined using cosine similarity
        """
        _b = _a - a

        sim_list = [(txt, cosine_similarity(vec.reshape(1,-1)-b,_b).reshape(1)[0]) for txt,vec in enumerate(self.embeddings)]

        return self.get_best_analogy(sim_list, string_b, return_cos_similarity)

    def get_analogy(self, string_a, analogy_a, string_b, k_dim=0, return_cos_similarity=False):
        """
        get analogous words using 3COSADD, PAIRDIRECTION, or 3COSMUL which make use of the cosine similarity of the embedding vectors.        
        adapted from: https://www.aclweb.org/anthology/W14-1618

        Args:
            string_a, analogy_a - example of a string and its analogy
            string_b - string whose analogy is to be determined
            embeddings_dict - dictionary of embeddings. uses existing embeddings if was pretrained
            return_cosine_similarity - whether or not output should include the analogy's cosine similarity
        
        Returns:
            analogy of given string based on given example and determined using cosine similarity
        """
        a, _a, b = (self.get_word_embedding(string).reshape(1,-1) for string in [string_a, analogy_a, string_b])
        
#         if self.compose_embeddings.comp_fn is None:
#             return self._3_cos_add(a, _a, b, string_b, k_dim, return_cos_similarity)
            
        return self._3_cos_mul(a, _a, b, string_b, k_dim, return_cos_similarity) 


TypeError: __init__() missing 1 required positional argument: 'word_count'

In [253]:
# this might take a minute
wrl = WordRelations(swe, embeddings, word_encoder, morpheme_encoder, wb)
wrl.get_analogy("ataenda", "alienda", "atakimbia")

w'tabasamu'

In [None]:
# this.... minutes
for word in ['ataenda', 'atacheza', 'ataanza', 'atabadilika']:    
    print(wrl.get_analogy('ataondoka', 'aliondoka', word))

alienda


In [None]:
## Language model