In [2]:
import sys
if "../" not in sys.path: sys.path.insert(0,"../");

## Building for the data module

## Processors

##### Testing

In [4]:
from pathlib import Path
from typing import Union, List
import os

from marynlp.text.processors.formatters import lowercase, remove_punctuations, white_space_cleaning
from marynlp.text.funcutils import forEach, filterBy, yield_forEach, apply, calls, rules

data_path = Path("../resources/data")
helsinki_na_path = data_path / Path("./hcs-na-v2")

# File to test out the concept
sample_file = helsinki_na_path / Path("./new-mat/bunge/han1-2004.shu")

@forEach(lowercase)  # performs the reformatting for each line
@filterBy(shu_file_line_rule)
@filterBy(helsinki_shufile_line_rule)
def load_lines_from_file(file_path: Union[str, os.PathLike]) -> List[str]:
    file_path = Path(file_path)
    assert file_path.exists(), "The file doesn't exits"
    
    with open(file_path, mode='r', encoding="utf8") as rb:
        return rb.readlines()

@yield_forEach(lowercase)  # performs the reformatting for each line
@filterBy(rules(shu_file_line_rule, helsinki_shufile_line_rule))
def gen_lines_from_file(file_path: Union[str, os.PathLike]) -> List[str]:
    file_path = Path(file_path)
    assert file_path.exists(), "The file doesn't exits"
    
    with open(file_path, mode='r', encoding="utf8") as rb:
        for line in rb:
            yield line

@apply(calls(lowercase, remove_punctuations))
def load_text_from_file(file_path: Union[str, os.PathLike]) -> str:
    file_path = Path(file_path)
    assert file_path.exists(), "The file doesn't exits"
    
    with open(file_path, mode='r', encoding="utf8") as rb:
        return rb.read()

# load_lines_from_file(sample_file)
# load_text_from_file(sample_file)
    
# for i in gen_lines_from_file(sample_file):
#     print(i)

## Flow the data

In [6]:
import re
from collections.abc import Callable
from typing import Union

from marynlp.text.data.objects import mask_token, token
from marynlp.text.processors.formatters import white_space_cleaning

def replace_text_to_token(input_: Union[str, token, mask_token], is_text: Callable[str], m_token: mask_token) -> Union[token, mask_token]:    
    # Check if the input is test
    if isinstance(input_, str):
        if is_text(input_):
            return m_token

    return input_
# str_.upper()
# replace_number("I have 3,000,000 million dollars") # fails
# replace_number("The distance is 23.45 kilometers") # fails

## Figuring out tokenization

Identifying how to tokenize a sentence. Doing so such that:

Original sentnece:
```
mwanafunzi anaenda shule
```

*Word level masking:*
```
mwanafunzi anaenda [MASK]
```
`[MASK]` - Masking that happens here is for 2 token (subword): `shule` -> `shu`, `le`

*Subword level masking:*
```
mwanafunzu ana<MASK> shule
```

In [19]:
import os
from typing import List, Union, Iterable, Optional
from marynlp.text.data.objects import mask_token, token

# objects
# ------------------------------------------

class Vocab(object):
    """
    This should represent the different information about the Vocabulary
    """
    UNK_TOKEN = mask_token('unk')
    NUM_TOKEN = mask_token('num')
    
    def __init__(self, list_tokens: Optional[List[token]] = None):
        if list_tokens is None:
            list_tokens = []

        self.tokens = list_tokens
        
    def add_token_list(self, token_list: List[token]):
        raise NotImplementedError()
    
    def has(self, token_: Union[str, token]) -> bool:
        """Checks if the vocab object has the token"""
        if isinstance(token_, str):
            token_ = token(token_)
            
        for tok in self.tokens:
            if tok.get() == token_.get():
                return True
            
        return False
    
    def get_tokens(self):
        return list(self.tokens)
    
    def __len__(self):
        return len(self.tokens)
    
    @classmethod
    def from_list(cls, list_str: List[str]):
        raise NotImplementedError()

    @classmethod
    def from_file(cls, file_path: Union[str, os.PathLike]):
        raise NotImplementedError()
    
    def extra_repr(self):
        # print the values
        if len(self) > 5:
            t = self.get_tokens()
            return "{}, ..., {}".format(", ".join(t[:2]), t[-1])
        
        return ", ".join(t)
    
    def __repr__(self):
        return 'Vocab(%s, count=%d)' % (self.extra_repr(), len(self))

In [21]:
from __future__ import annotations
from collections import defaultdict, OrderedDict
from typing import List, Tuple, Any, Union

class Mapper(object):
    def __init__(self, od: OrderedDict):
        self._od = od
        
    @property
    def ordered_dict():
        """Get the ordered dict"""
        return self._od
    
    def map_(self, key: str):
        if key not in self._od:
            raise KeyError("Mapping key '%s' doesn't exist in the Mapper" % key)

        return self._od[key]
    
    def add(self, key: str, value: Any):
        assert key not in self._od, "Mapping key '%s' already exists" % key
        self._od[key] = value
        pass
    
    @classmethod
    def from_list_tuple(cls, list_o_tuple: Union[zip, List[Tuple[str, Any]]]):
        return cls(OrderedDict(list_o_tuple))
    
    @classmethod
    def from_dict(cls, dict_: Dict[str, Any]):
        return cls(OrderedDict(dict_))
    
    def extra_repr(self):
        return list(self._od.items())
    
    def __repr__(self):
        return "Mapper({})".format(self.extra_repr())

class Encoder(Mapper):    
    @classmethod
    def from_decoder(cls, decoder: Decoder):
        return Encoder(decoder.ordered_dict)

class Decoder(Mapper):
    @classmethod
    def from_encoder(cls, encoder: Encoder):
        return Decoder(encoder.ordered_dict)
        pass

items = sorted(list(set('anaenda'))); items
mapper = Mapper.from_list_tuple(zip(items, range(len(items))))
mapper.add('r', 6)

mapper

Mapper([('a', 0), ('d', 1), ('e', 2), ('n', 3), ('r', 6)])

In [22]:
class Tokenizer():
    pass

## Building pipeline

In [23]:
# selectors: filters
# -----------------------

def shu_file_line_rule(text: str) -> bool:

    # Check if there is a text that has <text
    if text.find("<text") >= 0: return False

    # Check if there is a text that has </text>
    if text.find("</text>") >= 0: return False
    
    return True

# Making selection of data
def content_width_line_rule(text: str) -> bool:
    """
    Selectors to choose the lines that work for downstream processing
    """
    
    # if line is less than 20, done select for processing
    if len(text) < 20: return False

    return True

def break_text_to_sentences(text: str, max_length: int = 120):
    assert isinstance(text, str), "text should be string"
    _l = len(text)
    sentences = []
    
    ix, end = 0, 0
    for i in range(_l):
        if (i + 1) % max_length == 0:
            ix, end = end, i
            sentences.append(text[ix:end])

    # pass last sentence
    ix, end = end, _l
    sentences.append(text[ix:end])
    
    return sentences

In [24]:
from typing import Union, List
import os

from marynlp.text.processors.formatters import lowercase, remove_punctuations, white_space_cleaning
from marynlp.text import funcutils as f

@f.flowBy(break_text_to_sentences)
@f.forEach(lowercase)  # performs the reformatting for each line
@f.filterBy(f.rules(shu_file_line_rule, content_width_line_rule))
def load_lines_from_file(file_path: Union[str, os.PathLike]) -> List[str]:
    file_path = Path(file_path)
    assert file_path.exists(), "The file doesn't exits"
    
    with open(file_path, mode='r', encoding="utf8") as rb:
        return rb.readlines()

## Regular expresison


In [54]:
from marynlp.text.data.objects import sentence, mask_token, word
from marynlp.text import funcutils as f

from typing import Any, Union
from functools import partial

from collections.abc import Callable

import re

# Regular Expression
common_flags =  re.UNICODE | re.MULTILINE | re.DOTALL

# ------------------------------------
# CHECKING FOR NUMBER
# ------------------------------------

# regex_for_numbers = r'(?<!\S)(?=.)(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?!\S)|(\d+)'
number_re_ = re.compile(r'(\d+)', common_flags)

def is_number(input_: str): return number_re_.match(input_) is not None

# ------------------------------------
# CHECKING FOR SWAHILI WORD
# ------------------------------------

base_characters = 'abcdefghijklmnoprstuvwyz'
base_numbers = '0123456789'
base_word_non_letter_chars = '\'-'

r_sw_word = r'([{}{}{}{}]+)'.format(base_characters, base_characters.upper(), base_numbers, base_word_non_letter_chars)
word_re_ = re.compile(r_sw_word, common_flags)

def is_swahili_word(input_: str): return word_re_.match(input_) is not None

def tokenize_input(input_: str, apply_rule: Callable) -> token:
    if apply_rule(input_):
        return token(input_)
    
    return input_



# @f.apply(sentence)
@f.forEach(partial(tokenize_input, apply_rule=is_swahili_word))
@f.forEach(white_space_cleaning)
@f.filterBy(lambda x: len(x) > 0)
def extract_word(text: str) -> List[Union[str, token]]:
    return word_re_.split(text)

In [55]:
extract_word("kwa mujibu wa ibara ya 25 na 26 ya mkataba wa stockholm, mkataba huu ni lazima uridhiwe na nchi 50 ndipo utekelezaji wa")

[t'kwa',
 '',
 t'mujibu',
 '',
 t'wa',
 '',
 t'ibara',
 '',
 t'ya',
 '',
 t'25',
 '',
 t'na',
 '',
 t'26',
 '',
 t'ya',
 '',
 t'mkataba',
 '',
 t'wa',
 '',
 t'stockholm',
 ',',
 t'mkataba',
 '',
 t'huu',
 '',
 t'ni',
 '',
 t'lazima',
 '',
 t'uridhiwe',
 '',
 t'na',
 '',
 t'nchi',
 '',
 t'50',
 '',
 t'ndipo',
 '',
 t'utekelezaji',
 '',
 t'wa']

In [None]:
def is_mask_token(input_: Union[Any, mask_token]) -> bool: return isinstance(input_, mask_token)

# extract

def construct_token_from_text(text: str) -> sentence:
    pass
    
def tokenize_from_text(text: str):
    pass

In [27]:
from pathlib import Path

data_path = Path("../resources/data")
helsinki_na_path = data_path / Path("./hcs-na-v2")

# File to test out the concept
sample_file = helsinki_na_path / Path("./new-mat/bunge/han1-2004.shu")

lines_iter = load_lines_from_file(sample_file); list(lines_iter)

['miswada saba kati ya hiyo imekwishapata kibali cha mheshimiwa rais na sasa ni sheria za nchi.\n',
 'kwa mujibu wa ibara ya 25 na 26 ya mkataba wa stockholm, mkataba huu ni lazima uridhiwe na nchi 50 ndipo utekelezaji wa',
 'ke uanze.\n',
 'tuangalie vile vile zile athari za kemikali za zile nchi jirani ambazo tunapakana nazo na maji ya nchi zile yanaingia k',
 'atika taifa letu;\n',
 'viwanda sasa hivi ni vingi, biashara huria - mifuko hii ni mingi na bahati mbaya sasa hivi inawezekana imetusaidia kwa ',
 'sababu wakulima wengi sio wenye teknolojia ya hali ya juu, lakini huko mbele ya safari tunaamini kwamba tutawekeza wakul',
 'ima wa kimataifa ambao wataingiza zaidi kemikali kwa ajili ya mashamba yao.\n',
 'lakini mahali pa kuanzia ni katika mkataba wenyewe ukishaanza kufanya kazi.\n',
 'kemikali zote hizo zinafanyika kwa makusudi mengi.\n',
 'ndiyo maana kuanzia kilimo, kuanzia kutumia mbolea, kutumia dawa za kuua wadudu, kutumia dawa za kutunza mazao, zote ni',
 ' kemikali hizo.\