In [1]:
import sys
if "../" not in sys.path: sys.path.insert(0,"../");

In [4]:
from pathlib import Path
from typing import Union, List
import os

from marynlp.text.processors.formatters import lowercase, remove_punctuations, white_space_cleaning
from marynlp import funcutils as f

data_path = Path("../resources/data")
helsinki_na_path = data_path / Path("./hcs-na-v2")

# File to test out the concept
sample_file = helsinki_na_path / Path("./new-mat/bunge/han1-2004.shu")

def lowercase(text):
    return text.lower()

def ignore_rules(text: str):
    return not (text.find("<text") > -1 or text.find("</text>") > -1)

def should_be_longer_that_20(text: str):
    return len(text) > 20

@f.forEach(lowercase)
@f.filterBy(f.rules(should_be_longer_that_20, ignore_rules))
def load_file(file):
    with open(file, "r") as f:
        return f.readlines()
    
filtered_fn = f.filterBy(f.rules(should_be_longer_that_20, ignore_rules))(load_file)
# filtered_fn(sample_file)

# KaDemo

In [3]:
from pathlib import Path

data_path = Path("../resources/data")
helsinki_na_path = data_path / Path("./hcs-na-v2")

# File to test out the concept
sample_file = helsinki_na_path / Path("./new-mat/bunge/han1-2004.shu")

In [10]:
def not_have_html(text: str) -> bool:
    return not text.find('<text') > -1


def read_file(file):
    with open(file, "r") as f:
        return f.readlines()
    
# read_file(sample_file)
not_have_html("<text filename=\"Helsinki Corpus of Swahili/new-mat/bunge/han1-2004.shu\" title=\"Majadiliano ya Bunge\" year=\"200\"")

False

## Building for the data module

## Processors

##### Testing

In [5]:
from pathlib import Path
from typing import Union, List
import os

from marynlp.text.processors.formatters import lowercase, remove_punctuations, white_space_cleaning
from marynlp.funcutils import forEach, filterBy, yield_forEach, apply, calls, rules

data_path = Path("../resources/data")
helsinki_na_path = data_path / Path("./hcs-na-v2")

# File to test out the concept
sample_file = helsinki_na_path / Path("./new-mat/bunge/han1-2004.shu")

@forEach(lowercase)  # performs the reformatting for each line
# @filterBy(shu_file_line_rule)
# @filterBy(helsinki_shufile_line_rule)
def load_lines_from_file(file_path: Union[str, os.PathLike]) -> List[str]:
    file_path = Path(file_path)
    assert file_path.exists(), "The file doesn't exits"
    
    with open(file_path, mode='r', encoding="utf8") as rb:
        return rb.readlines()

@yield_forEach(lowercase)  # performs the reformatting for each line
# @filterBy(rules(shu_file_line_rule, helsinki_shufile_line_rule))
def gen_lines_from_file(file_path: Union[str, os.PathLike]) -> List[str]:
    file_path = Path(file_path)
    assert file_path.exists(), "The file doesn't exits"
    
    with open(file_path, mode='r', encoding="utf8") as rb:
        for line in rb:
            yield line

@apply(calls(lowercase, remove_punctuations))
def load_text_from_file(file_path: Union[str, os.PathLike]) -> str:
    file_path = Path(file_path)
    assert file_path.exists(), "The file doesn't exits"
    
    with open(file_path, mode='r', encoding="utf8") as rb:
        return rb.read()

# load_lines_from_file(sample_file)
# load_text_from_file(sample_file)
    
# for i in gen_lines_from_file(sample_file):
#     print(i)

## Flow the data

In [1]:
import re
from collections.abc import Callable
from typing import Union

from marynlp.text.data.objects import mask_token, token

def replace_text_to_token(input_: Union[str, token, mask_token], is_text: Callable[str], m_token: mask_token) -> Union[token, mask_token]:    
    # Check if the input is test
    if isinstance(input_, str):
        if is_text(input_):
            return m_token

    return input_
# str_.upper()
# replace_number("I have 3,000,000 million dollars") # fails
# replace_number("The distance is 23.45 kilometers") # fails

ModuleNotFoundError: No module named 'marynlp'

## Figuring out tokenization

Identifying how to tokenize a sentence. Doing so such that:

Original sentnece:
```
mwanafunzi anaenda shule
```

*Word level masking:*
```
mwanafunzi anaenda [MASK]
```
`[MASK]` - Masking that happens here is for 2 token (subword): `shule` -> `shu`, `le`

*Subword level masking:*
```
mwanafunzu ana<MASK> shule
```

In [7]:
import os
from typing import List, Union, Iterable, Optional
from marynlp.text.data.objects import mask_token, token

# objects
# ------------------------------------------

class Vocab(object):
    """
    This should represent the different information about the Vocabulary
    """
    UNK_TOKEN = mask_token('unk')
    NUM_TOKEN = mask_token('num')
    
    def __init__(self, list_tokens: Optional[List[token]] = None):
        if list_tokens is None:
            list_tokens = []

        self.tokens = list_tokens
        
    def add_token_list(self, token_list: List[token]):
        raise NotImplementedError()
    
    def has(self, token_: Union[str, token]) -> bool:
        """Checks if the vocab object has the token"""
        if isinstance(token_, str):
            token_ = token(token_)
            
        for tok in self.tokens:
            if tok.get() == token_.get():
                return True
            
        return False
    
    def get_tokens(self):
        return list(self.tokens)
    
    def __len__(self):
        return len(self.tokens)
    
    @classmethod
    def from_list(cls, list_str: List[str]):
        raise NotImplementedError()

    @classmethod
    def from_file(cls, file_path: Union[str, os.PathLike]):
        raise NotImplementedError()
    
    def extra_repr(self):
        # print the values
        if len(self) > 5:
            t = self.get_tokens()
            return "{}, ..., {}".format(", ".join(t[:2]), t[-1])
        
        return ", ".join(t)
    
    def __repr__(self):
        return 'Vocab(%s, count=%d)' % (self.extra_repr(), len(self))

In [8]:
from __future__ import annotations
from collections import defaultdict, OrderedDict
from typing import List, Tuple, Any, Union

class Mapper(object):
    def __init__(self, od: OrderedDict):
        self._od = od
        
    @property
    def ordered_dict():
        """Get the ordered dict"""
        return self._od
    
    def map_(self, key: str):
        if key not in self._od:
            raise KeyError("Mapping key '%s' doesn't exist in the Mapper" % key)

        return self._od[key]
    
    def add(self, key: str, value: Any):
        assert key not in self._od, "Mapping key '%s' already exists" % key
        self._od[key] = value
        pass
    
    @classmethod
    def from_list_tuple(cls, list_o_tuple: Union[zip, List[Tuple[str, Any]]]):
        return cls(OrderedDict(list_o_tuple))
    
    @classmethod
    def from_dict(cls, dict_: Dict[str, Any]):
        return cls(OrderedDict(dict_))
    
    def extra_repr(self):
        return list(self._od.items())
    
    def __repr__(self):
        return "Mapper({})".format(self.extra_repr())

class Encoder(Mapper):    
    @classmethod
    def from_decoder(cls, decoder: Decoder):
        return Encoder(decoder.ordered_dict)

class Decoder(Mapper):
    @classmethod
    def from_encoder(cls, encoder: Encoder):
        return Decoder(encoder.ordered_dict)
        pass

items = sorted(list(set('anaenda'))); items
mapper = Mapper.from_list_tuple(zip(items, range(len(items))))
mapper.add('r', 6)

mapper

Mapper([('a', 0), ('d', 1), ('e', 2), ('n', 3), ('r', 6)])

In [9]:
class Tokenizer():
    pass

## Building pipeline



In [7]:
from marynlp.text.data.objects import mask_token, token

def tokenize_input(input_: str, apply_rule: Callable) -> token:
    if apply_rule(input_):
        return token(input_)
    
    return input_

def mask_input(input_: str, apply_rule: Callable, mt_: mask_token) -> mask_token:
    if apply_rule(input_):
        return mt_
    
    return input_

## Regular expresison


In [42]:
from marynlp.text.data.objects import sentence, mask_token, word, token
from marynlp import funcutils as f

from typing import Any, Union
from functools import partial

from collections.abc import Callable

import re

# Regular Expression
common_flags =  re.UNICODE | re.MULTILINE | re.DOTALL

# ------------------------------------
# CHECKING FOR NUMBER
# ------------------------------------

# regex_for_numbers = r'(?<!\S)(?=.)(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?!\S)|(\d+)'
number_re_ = re.compile(r'(\d+)', common_flags)

def is_number(input_: str): return number_re_.match(input_) is not None

# ------------------------------------
# CHECKING FOR SWAHILI WORD
# ------------------------------------

base_characters = 'abcdefghijklmnoprstuvwyz'
base_numbers = '0123456789'
base_word_non_letter_chars = '\'-'

r_sw_word = r'([{}{}{}{}]+)'.format(base_characters, base_characters.upper(), base_numbers, base_word_non_letter_chars)
word_re_ = re.compile(r_sw_word, common_flags)

def is_swahili_word(input_: str): return word_re_.match(input_) is not None


# ------------------------------------
# CHECKING FOR PUNCTUATION
# ------------------------------------
punct_re_ = re.compile(r"\W+", common_flags)
def is_punctuation(input_: str): return punct_re_.match(input_) is not None


In [16]:
from marynlp.text.processors.formatters import white_space_cleaning

def is_mask_token(input_: Union[Any, mask_token]) -> bool: return isinstance(input_, mask_token)

@f.forEach(partial(tokenize_input, apply_rule=is_swahili_word), type_=sentence)
@f.filterBy(lambda x: len(x) > 0)
@f.forEach(white_space_cleaning)
def construct_token_from_text(text: str) -> sentence:
    return word_re_.split(text)
    

is_ignore_rule = f.rules(is_mask_token, is_punctuation)
@f.apply(sentence)
# @f.forEach(partial(mask_input, apply_rule=(lambda w: not is_swahili_word(w)), mt_=Vocab.UNK_TOKEN), skip_rule=is_ignore_rule)  # mask invalid
@f.forEach(partial(mask_input, apply_rule=is_number, mt_=Vocab.NUM_TOKEN))  # mask numbers
@f.filterBy(lambda x: len(x) > 0)
@f.forEach(white_space_cleaning)
def mask_construct_token_from_text(text: str) -> sentence:
    return word_re_.split(text)

NameError: name 'Vocab' is not defined

In [40]:
??f.rules

[0;31mSignature:[0m [0mf[0m[0;34m.[0m[0mrules[0m[0;34m([0m[0;34m*[0m[0mfns[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mcollections[0m[0;34m.[0m[0mabc[0m[0;34m.[0m[0mCallable[0m[0;34m][0m[0;34m,[0m [0mop_[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'and'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mrules[0m[0;34m([0m[0;34m*[0m[0mfns[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mCallable[0m[0;34m][0m[0;34m,[0m [0mop_[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0mRules[0m[0;34m.[0m[0mAND[0m[0;34m)[0m[0;34m:[0m    [0;34m[0m
[0;34m[0m    [0;34m"""This is useful if the `fns` are 'selectors'"""[0m   [0;34m[0m
[0;34m[0m        [0;34m[0m
[0;34m[0m    [0;32mif[0m [0mop_[0m [0;32mis[0m [0;32mNone[0m [0;32mor[0m [0mop_[0m [0;32mis[0m [0mRules[0m[0;34m.[0m[0mAND[0m[0;34m:[0m    [0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0mpartial[0m[0;34m([0m[0m_andSelector[0m[0;3

In [4]:
is_ignore_rule = f.rules(is_mask_token, is_punctuation, op_=f.Rules.OR)
is_ignore_rule(".")

NameError: name 'is_mask_token' is not defined

In [42]:
construct_token_from_text("kwa mujibu wa ng'ombe ya 25 na 26 ya mkataba wa stockholm, mkataba huu ni lazima uridhiwe na nchi 50 ndipo utekelezaji wa")

sentence('kwa mujibu wa ng'ombe ya 25 na 26 ya mkataba wa stockholm , mkataba huu ni lazima uridhiwe na nchi 50 ndipo utekelezaji wa', l=24)

In [43]:
mask_construct_token_from_text("kwa mujibu wa ng'ombe ya 25 na 26 ya mkataba wa stockholm, mkataba huu ni lazima uridhiwe na nchi 50 ndipo utekelezaji wa")

sentence('kwa mujibu wa ng'ombe ya <num> na <num> ya mkataba wa stockholm , mkataba huu ni lazima uridhiwe na nchi <num> ndipo utekelezaji wa', l=24)

In [None]:
## Help

In [8]:
from collections.abc import Mapping
from collections import OrderedDict

class FrozenDict(Mapping):
    def __init__(self, *args, **kwargs):
        self._d = OrderedDict(*args, **kwargs)
        self._hash = None # to memoize hash
        
    def as_dict(self):
        return dict(**self._d)

    def __iter__(self):
        return iter(self._d)

    def __len__(self):
        return len(self._d)

    def __getitem__(self, key):
        return self._d[key]

    def __setitem__(self, *args, **kwargs):
        raise ValueError("Forbidden action. This is a frozen object")

    def __hash__(self):
        if self._hash is None:
            hash_ = 0
            for pair in self.items():
                hash_ ^= hash(pair)
            self._hash = hash_
        return self._hash

## Vocabulary

This is preferably an immutable object that contains the linguistic information about the data that is being dealt with.

This knows information about:
- The `token`s that are valid for a language.
- The `mask_token`s that are used in the language (if any)
- The `compoundToken`s that are used in the language
- The `separator` for spliting up words
- The rules? to deal with sentences/words/tokens in the lanugage

In [9]:
from __future__ import annotations
"""
According to swahili
-------------------
"""
from typing import Iterable, Dict, Tuple, Optional, List

DEFAULT_SEPARATOR = ""

def _repr_textarize(str_list: List[str], limit:int=3):
    if len(str_list) > limit:
        return "%s, ..., %s" % (", ".join(str_list[:limit]), str_list[-1])
    
    return ", ".join(str_list[:limit])


class Vocabulary(object):
    """
    This must be a frozen object
    """
    def __init__(self, tokens: Iterable[str], mask_tokens: Optional[Dict[str, to.mask_token]] = None):
        self.separator = DEFAULT_SEPARATOR # This is the separator for the word
        self._tk_ls = tuple(tokens)
        
        self._mt_dict = FrozenDict()
        if mask_tokens is not None:
            self._mt_dict = FrozenDict(**mask_tokens)
            
        # get the shape of vocabulary
        self.shape = (len(self._tk_ls), len(self._mt_dict))

        # For hashing
        self._hash = None
        
        # For representation
        self._repr_tokens = None

    @property
    def tokens(self):
        return set(self._tk_ls)
    
    @property
    def masks(self):
        return self._mt_dict
    
    def has(self, token_: str):
        return self.has_token(to.token(token_))
    
    def has_token(self, token: to.token):
        return token in self._tk_ls
    
    def has_mask(self, key: str):
        return key in self._mt_dict
    
    def mask(self, key: str):
        try:
            return self._mt_dict[key]
        except KeyError:
            raise KeyError("Invalid '{0}' mask identifier. Was the mask='{0}' registered?".format(key))
    
    def mutate_tokens(self, tokens: Iterable[str]) -> Vocabulary:
        """.add alternative"""
        token_set = set(self._tk_ls).union(set(map(to.token, tokens)))
        mask_token_dict = self._mt_dict.as_dict()
        return Vocabulary(token_set, mask_token_dict)
    
    def mutate_mask(self, mask_tokens: Dict[str, to.mask_token]) -> Vocabulary:
        """.add_mask alternative"""
        token_set = set(self._tk_ls) 
        mask_token_dict = self._mt_dict.as_dict()
        mask_token_dict.update(mask_tokens)
        return Vocabulary(token_set, mask_token_dict)
    
    def save(self, vocab_file_name: str, base_path: str = "./"):
        """Saves the data"""
        pass
    
    @classmethod
    def load_from_file(cls, vocab_file_path: str):
        """Load the file"""
        pass
    
    def extra_repr(self) -> Tuple[str, tuple]:
        if self._repr_tokens is None:
            tokens_ = sorted(self._tk_ls)
            masks_ = list(self._mt_dict.as_dict().values())
            self._repr_tokens = "tokens={%s}, masks={%s}" % (_repr_textarize(tuple(map(str, tokens_))), _repr_textarize(tuple(map(str, masks_))))
        
        return self._repr_tokens, str(self.shape)
    
    def __repr__(self):
        r_str_, shape = self.extra_repr()
        return "Vocabulary(%s, shape=%s)" % (r_str_, shape)
    
    def __hash__(self):
        if self._hash is None:
            tk = tuple(sorted(self._tk_ls))
            mk = tuple(self._mt_dict)
            self._hash = hash((tk, mk))
            
        return self._hash

In [10]:
from marynlp.text.data.objects import sentence, mask_token, word, token
from marynlp import funcutils as f

from typing import Any, Union
from functools import partial

from collections.abc import Callable

import re

# Regular Expression
common_flags =  re.UNICODE | re.MULTILINE | re.DOTALL

# ------------------------------------
# CHECKING FOR NUMBER
# ------------------------------------

# regex_for_numbers = r'(?<!\S)(?=.)(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?!\S)|(\d+)'
number_re_ = re.compile(r'(\d+)', common_flags)

def is_number(input_: str): return number_re_.match(input_) is not None

In [12]:
import re
from functools import partial

from typing import List
from marynlp.text.data import objects as to
from marynlp.text.processors import formatters as fmt
from marynlp import funcutils as f
# selectors: filters
# -----------------------

def shu_file_line_rule(text: str) -> bool:

    # Check if there is a text that has <text
    if text.find("<text") >= 0: return False

    # Check if there is a text that has </text>
    if text.find("</text>") >= 0: return False
    
    return True

# Making selection of data
def content_width_line_rule(text: str) -> bool:
    """
    Selectors to choose the lines that work for downstream processing
    """
    
    # if line is less than 20, done select for processing
    if len(text) < 20: return False

    return True

def break_text_to_sentences(text: str, max_length: int = 120):
    assert isinstance(text, str), "text should be string"
    _l = len(text)
    sentences = []
    
    ix, end = 0, 0
    for i in range(_l):
        if (i + 1) % max_length == 0:
            ix, end = end, i
            sentences.append(text[ix:end])

    # pass last sentence
    ix, end = end, _l
    sentences.append(text[ix:end])
    
    return sentences


def is_mask_token(input_: Union[Any, to.mask_token]) -> bool: 
    """Checks if the input is a mask token"""
    return isinstance(input_, to.mask_token)

def mask_input(input_: str, apply_rule: Callable, mt_: to.mask_token) -> to.mask_token:
    if apply_rule(input_):
        return mt_
    
    return input_

@f.forEach(to.token, skip_rule=is_mask_token)
@f.forEach(partial(mask_input, apply_rule=is_number, mt_=to.mask_token("num")))
def split_by_space(text: str) -> List[Union[to.token, to.mask_token]] :
    """Breaks a long text into tokens"""    
    # function to clean the text
    clean_text_fn = fmt.white_space_cleaning
    return re.split(r"\s+", clean_text_fn(text))

In [13]:
split_by_space("lowercase, remove_punctuations, white_space_cleaning 45")

# vocab = Vocabulary()

[t'lowercase,', t'remove_punctuations,', t'white_space_cleaning', <num>]

## Tokenizer

This is the took that uses the information in the `ocabulary` to transform the texts accordingly. All transformations are done by the tokenizer

In [15]:
from __future__ import annotations

from collections import Callable
from typing import Optional, Iterable, Union

from marynlp.text.data import objects as t
from marynlp.text.data.objects import sentence, mask_token, word, token
from marynlp import funcutils as f

from typing import Any, Union
from functools import partial, wraps

import re

# Regular Expression
common_flags =  re.UNICODE | re.MULTILINE | re.DOTALL

# ------------------------------------
# CHECKING FOR NUMBER
# ------------------------------------

# regex_for_numbers = r'(?<!\S)(?=.)(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?!\S)|(\d+)'
number_re_ = re.compile(r'(\d+)', common_flags)

def is_number(input_: str): return number_re_.match(input_) is not None

# processors        
def text_tokenize(text: str, formatter_fn: Optional[Callable] = None) -> List[str]:
    """Breaks a long text into tokens"""    
    # function to clean the text
    if formatter_fn is not None:
        text = formatter_fn(text)

    # import when used
    import re
    return re.split(r"\s+", text)


class Tokenizer(object):
    """Tokenizer, uses vocab for reference"""
    def __init__(self, vocab: Vocabulary):
        self.vocab = vocab

        # check that there is mask for unknown words
        assert self.vocab.has_mask('unk'), "Mask for unknown token missing"
            
        
    @classmethod
    def token_for_masking(cls, token_: str) -> Union[Dict[str, to.mask_token], str]:
        """overridable"""
        if is_number(token_):
            return { "number": to.mask_token('num') }
        
        return token_
    
    def t(self, transform_fn):
        """overridable"""
        return f.forEach(partial(mask_input, apply_rule=is_number, mt_=self.vocab.mask('number')))(transform_fn)
    

    def _is_not_in_vocab(self, input_: str) -> bool:
        print(input_)
        return not self.vocab.has_token(to.token(input_))
    
    def transform(self, transform_fn: Callable):
        @wraps(transform_fn)
        def wrapper(*args, **kwargs):
            f_ = self.t(transform_fn)
            # final transform if missing
            f_ = f.forEach(partial(mask_input, apply_rule=self._is_not_in_vocab, mt_=self.vocab.mask('unk')), skip_rule=is_mask_token)(f_)
            
            return f_(*args, **kwargs)
        return wrapper
    
    # final
    @classmethod
    def initialize(cls, token_iterable_: Iterable[str], unknown_token: str = 'unk') -> Tokenizer:
        w = set()
        mk = dict()
    
        for token_ in token_iterable_:
            # check to mask as number
            out = cls.token_for_masking(token_)

            if isinstance(out, str):
                w.add(to.token(token_))
            else:
                mk.update(out)

        vocab = Vocabulary(w, mk).mutate_mask({ "unk": to.mask_token(unknown_token) })
        return cls(vocab), vocab

# or

def tokenizer(vocab: Vocabulary, text_splitter: Callable):
    def text_tokenize(text: str):
        return re.split(r"\s+", text)
    return text_tokenize


## Defining pipeline

In [16]:
from typing import Union, List
import os

from marynlp.text.processors import formatters as fmt # lowercase, remove_punctuations, white_space_cleaning
from marynlp import funcutils as f


@f.forEach(f.calls(fmt.lowercase, fmt.white_space_cleaning))  # performs the reformatting for each line
@f.filterBy(f.rules(shu_file_line_rule, content_width_line_rule))
def load_lines_from_file(file_path: Union[str, os.PathLike]) -> List[str]:
    file_path = Path(file_path)
    assert file_path.exists(), "The file doesn't exits"
    
    with open(file_path, mode='r', encoding="utf8") as rb:
        return rb.readlines()
    
    # tokenize the texts
load_tokens_from_file = f.flowBy(text_tokenize)(load_lines_from_file)

In [17]:
from pathlib import Path

data_path = Path("../resources/data")
helsinki_na_path = data_path / Path("./hcs-na-v2")

# File to test out the concept
sample_file = helsinki_na_path / Path("./new-mat/bunge/han1-2004.shu")

In [18]:
from marynlp import funcutils as f
from marynlp.text.data import objects as to

tokenizer, vocab = Tokenizer.initialize(load_tokens_from_file(sample_file))
vocab

Vocabulary(tokens={'kutakuwa, 'mnyamwezi', 'mzigo, ..., zuri.}, masks={<num>, <unk>}, shape=(5419, 2))

In [19]:
tkrz = tokenizer.transform(text_tokenize)
tkrz("asdasd 4545 32423 kevin mzigo")

asdasd
kevin
mzigo


[<unk>, <num>, <num>, <unk>, 'mzigo']