In [2]:
import sys
# Put the parent directory at the front of the import path, once.
if "../" not in sys.path:
    sys.path.insert(0, "../")

In [12]:
## Set up everything needed by the rest of the notebook
from pathlib import Path
from typing import Union, List, Iterable
import os

from marynlp.text.processors.formatters import lowercase, remove_punctuations, white_space_cleaning
from marynlp import funcutils as f

# Corpus locations, relative to the directory the notebook runs in.
data_path = Path("../resources/data")
helsinki_na_path = data_path / "hcs-na-v2"

# File to test out the concept
sample_file = helsinki_na_path / "new-mat" / "bunge" / "han1-2004.shu"

def lowercase(text):
    """Return ``text`` converted to lower case.

    NOTE(review): this definition replaces the ``lowercase`` imported from
    ``marynlp.text.processors.formatters`` earlier in the same cell.
    """
    lowered = text.lower()
    return lowered

def ignore_rules(text: str):
    """Tell whether ``text`` is free of ``<text`` / ``</text>`` markup.

    Returns:
        False when ``text`` contains an opening ``<text`` or a closing
        ``</text>`` tag, True otherwise.
    """
    has_opening_tag = "<text" in text
    has_closing_tag = "</text>" in text
    return not (has_opening_tag or has_closing_tag)

def should_be_longer_that_20(text: str):
    """Return True when ``text`` has more than 20 characters."""
    min_length_exclusive = 20
    return min_length_exclusive < len(text)

@f.forEach(lowercase)
@f.filterBy(f.rules(should_be_longer_that_20, ignore_rules))
def load_file(file):
    """Read ``file`` in text mode and return its lines.

    The undecorated function returns the list produced by ``readlines()``.
    The two decorators come from ``marynlp.funcutils``; presumably they filter
    the returned lines with the two rules and then lowercase each kept line --
    confirm against that module.
    """
    # Named `handle`, not `f`, so the module alias `f` is not shadowed in here.
    with open(file, "r") as handle:
        lines = handle.readlines()
    return lines
    
def save_to_file(file, content: Iterable[str]):
    """Write every string in ``content`` to ``file``, replacing what was there.

    Args:
        file: path of the file to open in ``"w"`` mode.
        content: strings to write in order; nothing is added between them, so
            each item must carry its own line ending.
    """
    with open(file, "w") as out:
        out.writelines(content)
    
# Wrap `load_file` with the length / markup rules once more before saving.
filtered_fn = f.filterBy(f.rules(should_be_longer_that_20, ignore_rules))(load_file)

# Create the output folder with pathlib rather than `!mkdir -p`, so the cell
# does not depend on a POSIX shell and stays valid plain Python.
folder_path = "../resources/operate_on"
Path(folder_path).mkdir(parents=True, exist_ok=True)
save_to_file(f'{folder_path}/dummy.txt', filtered_fn(sample_file))

In [3]:
# Setting up the bucket to load the data
# -----------------------------------------

## Using the SED Morpheme template
from marynlp.utils.storage import download as dl
from marynlp.utils import storage

# setup the download bucket
bucket = storage.get_bucket("../resources/mary_africa_credentials_key.json", "marynlp-private")
morpheme_template_file = dl.file_from_google_to_store("models/sed_morpheme_template.txt", bucket)

# Last expression of the cell: displays the value assigned above.
morpheme_template_file

PosixPath('/home/iam-kevin/.marynlp/store/models/sed_morpheme_template.txt')

## Regular

In [5]:
from experimental.sed import MorphologyAnalyzer
from typing import List, Tuple

analyzer = MorphologyAnalyzer(morpheme_template_file)
# Bare expression kept from the original; with default notebook settings only
# a cell's final expression is displayed, so this one shows nothing.
analyzer

def break_word(word: str) -> Tuple[str, ...]:
    """Split ``word`` into its sub-units using the module-level ``analyzer``.

    ``analyzer.break_text`` is called with a one-word list, and the entry it
    returns for ``word`` is converted to a tuple.

    Args:
        word: the word to break up.

    Returns:
        The sub-units of ``word`` as a tuple of any length.
    """
    return tuple(analyzer.break_text([word])[word])

# /text/tokenize[?type=sed] (default: sed)
break_word("nilienda")

('nili', 'e', 'nda')

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

class WordRelations(object):
    """Word-similarity and analogy queries over a fixed embedding matrix.

    Candidates are the rows of ``embeddings``; a row's position is treated as
    its word id and is turned back into text with ``word_encoder.decode``.
    Query vectors are not looked up in the matrix: they are composed from the
    query word's morphemes through ``swe.embed``.

    Type annotations are written as strings so that defining the class does not
    require the embedding / encoder types to be imported first.
    """

    def __init__(self,
                 swe: "SEDWordEmbeddings",
                 embeddings: "np.ndarray",
                 word_encoder: "TokenEncoder",
                 morpheme_encoder: "TokenEncoder",
                 word_breaker: "WordBreaker"
                ):
        """Store the collaborators used by the query methods.

        Args:
            swe              - object whose ``embed`` turns morpheme ids into a word vector
            embeddings       - candidate vectors, iterated row by row
            word_encoder     - maps words to row indices (``encode``) and back (``decode``)
            morpheme_encoder - maps a morpheme to its id (``encode``)
            word_breaker     - object whose ``break_word`` splits a word into morphemes
        """
        self.swe = swe
        self.word_encoder = word_encoder
        self.morpheme_encoder = morpheme_encoder
        self.wb = word_breaker
        self.embeddings = embeddings

    @staticmethod
    def _cosine(u, v):
        """Cosine similarity of two vectors as a scalar; each is flattened to a single row."""
        return cosine_similarity(u.reshape(1, -1), v.reshape(1, -1)).reshape(1)[0]

    def get_word_embedding(self, word: str) -> "torch.Tensor":
        """
        embed a word from its morphemes

        The word is split with ``self.wb.break_word``, every morpheme is encoded with
        ``self.morpheme_encoder`` and the ids are handed to ``self.swe.embed`` as a
        batch that holds this single word.

        Args:
            word - word to embed

        Returns:
            whatever ``self.swe.embed`` returns for the word
        """
        morpheme_ids = [self.morpheme_encoder.encode(m) for m in self.wb.break_word(word)]
        return self.swe.embed(torch.tensor([morpheme_ids]))

    def get_most_similar(self, string: str, sim_dict: dict, threshold: "Optional[float]"):
        """
        get most similar word(s) from collection of related words using the cosine similarity measure

        Args:
            string    - string whose most similar words are to be obtained
            sim_dict  - dictionary mapping a string to a list of (candidate, cosine similarity) entries
            threshold - minimum cosine similarity value for words to be considered most similar to string
                        if None then only the entries with the highest cosine similarity are returned

        Returns:
            list of the entries of sim_dict[string] that qualify, in their original order;
            an empty list when there are no candidates at all

        Raises:
            KeyError       - if string is not a key of sim_dict
            AssertionError - if a threshold is given and no candidate reaches it
        """
        candidates = sim_dict[string]
        if not candidates:
            # max() of an empty sequence raises ValueError; there is nothing to rank.
            return []

        max_sim = max(entry[1] for entry in candidates)

        if threshold is not None:
            # Raised explicitly instead of via `assert` so the check survives `python -O`;
            # the exception type and message are unchanged.
            if not max_sim >= threshold:
                raise AssertionError('threshold set too high, no similar words found')

            return [entry for entry in candidates if entry[1] >= threshold]

        return [entry for entry in candidates if entry[1] == max_sim]

    def get_similar_words(self, string, k_dim=0, threshold=None):
        """
        get collection of closely related words using the cosine similarity of their embedding vectors

        Args:
            string    - string whose related words are to be obtained
            k_dim     - accepted for interface compatibility; not used
            threshold - minimum cosine similarity for word to be considered similar to given word;
                        if None only the best-scoring entries are kept

        Returns:
            dictionary with the single key ``string`` whose value is a list of
            (row index, cosine similarity) tuples sorted by decreasing similarity.
            Row indices can be turned into words with ``self.word_encoder.decode``.
        """
        val = self.get_word_embedding(string)

        def is_query_itself(idx, vec):
            # The original compared the integer row index with the query string, which
            # is always unequal, so the query was never excluded. Compare the decoded
            # word instead (assumes decode(idx) yields the word as a str -- TODO confirm).
            return self.word_encoder.decode(idx) == string and (vec == val).all()

        scored = [
            (idx, self._cosine(val, vec))
            for idx, vec in enumerate(self.embeddings)
            if not is_query_itself(idx, vec)
        ]

        sim_dict = {string: scored}
        most_similar = self.get_most_similar(string, sim_dict, threshold)
        sim_dict[string] = sorted(most_similar, key=lambda x: x[1], reverse=True)

        return sim_dict

    def get_best_analogy(self, sim_list, string_b, return_cos_similarity):
        """
        get most relevant analogy from collection of analogous words. uses cosine similarity measure to determine
        the best analogy

        Only candidates with a strictly positive score are considered, and the row that
        belongs to string_b itself is never returned.

        Args:
            sim_list - list of (row index, score) entries for the candidate words
            string_b - word whose analogy is to be determined
            return_cos_similarity - whether or not output should include the analogy's score

        Returns:
            the decoded analogy word, or a (word, score) tuple when return_cos_similarity is true;
            None when no candidate other than string_b has a positive score
        """
        ranked = sorted((entry for entry in sim_list if entry[1] > 0),
                        key=lambda x: x[1], reverse=True)
        if not ranked:
            return None

        query_idx = self.word_encoder.encode(string_b)
        # Row indices are unique, so this is "the top entry, or the runner-up when
        # the top entry is string_b itself".
        best = next((entry for entry in ranked if not entry[0] == query_idx), None)
        if best is None:
            return None

        word = self.word_encoder.decode(best[0])
        if return_cos_similarity:
            return word, best[1]
        return word

    def _3_cos_add(self, a, _a, b, string_b, k_dim, return_cos_similarity):
        """
        determine the analogy of the given word based on an additive function of cosine similarities

        Every row of the embedding matrix is scored by its cosine similarity to ``b - a + _a``.

        Args:
            a,_a     - vector representation of the example of a word and its corresponding analogy
            b        - vector representation of the string whose analogy is to be determined
            string_b - string whose analogy is to be determined
            k_dim    - not used
            return_cos_similarity - forwarded to get_best_analogy

        Returns:
            analogy of the string based on given example and determined using cosine similarity
        """
        target = b - a + _a

        sim_list = [(idx, self._cosine(vec, target)) for idx, vec in enumerate(self.embeddings)]

        return self.get_best_analogy(sim_list, string_b, return_cos_similarity)

    def _3_cos_mul(self, a, _a, b, string_b, k_dim, return_cos_similarity, eps=0.001):
        """
        determine the analogy of the given word based on a multiplicative function of cosine similarities

        Every row ``v`` is scored as ``cos(v, b) * cos(v, _a) / (cos(v, a) + eps)``.

        NOTE(review): the paper referenced in get_analogy appears to define this score on
        non-negative similarities (cosines shifted to [0, 1]). Raw cosines are used here,
        so the denominator can reach or cross zero -- confirm this is intended.

        Args:
            a,_a     - vector representation of the example of a word and its corresponding analogy
            b        - vector representation of the string whose analogy is to be determined
            string_b - string whose analogy is to be determined
            k_dim    - not used
            return_cos_similarity - forwarded to get_best_analogy
            eps      - small constant added to the denominator

        Returns:
            analogy of the string based on given example and determined using cosine similarity
        """
        sim_list = []
        for idx, vec in enumerate(self.embeddings):
            numerator = self._cosine(vec, b) * self._cosine(vec, _a)
            denominator = self._cosine(vec, a) + eps
            sim_list.append((idx, numerator / denominator))

        return self.get_best_analogy(sim_list, string_b, return_cos_similarity)

    def pair_direction(self, a, _a, b, string_b, k_dim, return_cos_similarity):
        """
        determine the analogy of the given word based on the direction of the example pair

        Every row ``v`` is scored by the cosine similarity between ``v - b`` and ``_a - a``.

        Args:
            a,_a     - vector representation of the example of a word and its corresponding analogy
            b        - vector representation of the string whose analogy is to be determined
            string_b - string whose analogy is to be determined
            k_dim    - not used
            return_cos_similarity - forwarded to get_best_analogy

        Returns:
            analogy of given string based on given example and determined using cosine similarity
        """
        direction = _a - a

        sim_list = [(idx, self._cosine(vec.reshape(1, -1) - b, direction))
                    for idx, vec in enumerate(self.embeddings)]

        return self.get_best_analogy(sim_list, string_b, return_cos_similarity)

    def get_analogy(self, string_a, analogy_a, string_b, k_dim=0, return_cos_similarity=False):
        """
        get the word that is to string_b what analogy_a is to string_a.

        The class implements 3COSADD, PAIRDIRECTION and 3COSMUL; this entry point always uses 3COSMUL.
        adapted from: https://www.aclweb.org/anthology/W14-1618

        Args:
            string_a, analogy_a - example of a string and its analogy
            string_b - string whose analogy is to be determined
            k_dim    - forwarded to the scoring method, which does not use it
            return_cos_similarity - whether or not output should include the analogy's score

        Returns:
            analogy of given string based on given example, as returned by get_best_analogy
        """
        a, _a, b = (self.get_word_embedding(string).reshape(1, -1)
                    for string in [string_a, analogy_a, string_b])

        return self._3_cos_mul(a, _a, b, string_b, k_dim, return_cos_similarity)

    
# swe = 
# /text/analogies?a=wanacheza&b=atacheza[&type=sed] (default: sed)


## Voice