In [1]:
import sys
# Make the parent directory importable (inserted once, at the front of sys.path).
if "../" not in sys.path:
    sys.path.insert(0, "../")

In [2]:
# Training for the SED
from marynlp.modules.source import save_model_from_google_bucket
from marynlp.utils.storage import download, get_bucket


# Download the morpheme template file
# get_bucket receives a local credentials-key path and the bucket name.
# NOTE(review): save_model_from_google_bucket presumably downloads the object
# and returns its local path -- confirm against marynlp.modules.source.
bucket = get_bucket("../resources/mary_africa_credentials_key.json", "marynlp-private")
morph_template_file = save_model_from_google_bucket("models/sed_morpheme_template.txt", bucket)


In [3]:
# Display the value returned by save_model_from_google_bucket.
morph_template_file

PosixPath('/home/iam-kevin/.marynlp/store/models/sed_morpheme_template.txt')

In [4]:
# Prepare the words
from marynlp import funcutils as f
from tqdm import tqdm

from typing import List, Union
import os
import re

def split_by_space(text: str) -> List[str]:
    """Split a text into its whitespace-separated words.

    Runs of whitespace count as one separator, and leading/trailing
    whitespace never produces empty strings. An empty or blank ``text``
    therefore yields an empty list.

    Example
    "Lorem ipsum" -> [ 'Lorem', 'ipsum' ]
    "  Lorem   ipsum  " -> [ 'Lorem', 'ipsum' ]
    """
    # str.split() without arguments drops the empty strings that
    # re.split(r"\s+", ...) would emit at the edges of the text.
    return text.split()

def ignore_rules(text: str) -> bool:
    """Return True when ``text`` contains neither ``<text`` nor ``</text>``."""
    markers = ("<text", "</text>")
    return all(marker not in text for marker in markers)

def should_be_longer_than_20(text: str) -> bool:
    """Return True when ``text`` has more than 20 characters."""
    exclusive_minimum = 20
    return exclusive_minimum < len(text)

@f.filterBy(f.rules(ignore_rules, should_be_longer_than_20))
def read_file(file_path) -> List[str]:
    """Read all lines of the UTF-8 text file at ``file_path``.

    The undecorated body returns the raw lines (line endings included).
    NOTE(review): the ``f.filterBy`` decorator presumably keeps only the lines
    accepted by both ``ignore_rules`` and ``should_be_longer_than_20`` --
    confirm against ``marynlp.funcutils``.
    """
    with open(file_path, mode="r", encoding="utf-8") as text_file:
        lines = text_file.readlines()
    return lines

@f.forEach(str, type_=set)
@f.filterBy(lambda s: len(s.strip()) > 0)
@f.flowBy(split_by_space)
@f.forEach(lambda s: s.strip())
def get_unique_words_from_shu_file(file_path: os.PathLike):
    """Return the lines of ``file_path`` (via ``read_file``) wrapped in a progress bar.

    NOTE(review): the decorator stack presumably strips every line, splits it
    into words, drops blank words and collects the rest into a set -- confirm
    the semantics of ``f.forEach`` / ``f.flowBy`` / ``f.filterBy``.
    """
    lines = read_file(file_path)
    return tqdm(lines)


In [5]:
from torch.utils.data import Dataset
from typing import Iterable, Tuple

from experimental.sed import MorphologyAnalyzer
from marynlp import funcutils as f 

from collections.abc import Callable

def break_word(word: str, analyzer: MorphologyAnalyzer) -> Tuple[str]:
    """Return, as a tuple, the entry ``analyzer.break_text`` produces for ``word``.

    ``break_text`` is called with a one-element list and its result is indexed
    by ``word``; the entry is presumably the word's morphemes -- confirm
    against ``MorphologyAnalyzer``.
    """
    breakdown = analyzer.break_text([word])
    return tuple(breakdown[word])

# Might not want to add this since it's pytorch specific?
class MorphemeDataset(Dataset):
    """Dataset whose items are the stored words passed through ``break_word``."""

    def __init__(self, word_iter: Iterable[str], break_word: Callable):
        """Keep the callable and materialise ``word_iter`` into a list."""
        self.break_word = break_word
        # A list is needed so the words can be indexed and counted.
        self.wml = [*word_iter]

    def __len__(self):
        """Number of stored words."""
        return len(self.wml)

    def __getitem__(self, index: int):
        """Return ``break_word`` applied to the word at ``index``."""
        word = self.wml[index]
        return self.break_word(word)

In [6]:
# unique_morphemes

In [7]:
from marynlp.text.data import Vocab, morph
from marynlp.text import formatter as fmt
from itertools import chain

from tqdm import tqdm
# from marynlp.utils import process as p

# Unique words of one .shu corpus file.
unique_words = get_unique_words_from_shu_file("../resources/data/hcs-na-v2/new-mat/news/alasiri-2009.shu")
# Analyzer built from the downloaded morpheme template file.
analyzer = MorphologyAnalyzer(morph_template_file)

# start with remove punctuations
# NOTE(review): f.calls presumably chains the formatters in the order given
# (punctuation removal, whitespace cleaning, lowercasing) -- confirm.
clean_text = f.calls(fmt.remove_punctuations, fmt.white_space_cleaning, fmt.lowercase)

@f.apply(f.calls(chain.from_iterable, set))
def get_unique_morphemes(words: Iterable[str]):
    """Yield, for every word, the result of cleaning it and breaking it up.

    Uses the module-level ``clean_text`` and ``analyzer``. A word that cannot be
    broken is reported on stdout and the exception is re-raised.
    NOTE(review): the decorator presumably flattens the yielded tuples and
    collects them into a set -- confirm ``f.apply`` / ``f.calls``.
    """
    to_morphemes = f.calls(clean_text, f.partial(break_word, analyzer=analyzer))
    for word in tqdm(words):
        try:
            yield to_morphemes(word)
        except Exception as error:
            print("Unable to break word: '%s'" % word)
            raise error

            
# unique_morphemes = get_unique_morphemes(unique_words)
# morpheme_vocab = Vocab(unique_morphemes)
# word_vocab = Vocab(map(clean_text, unique_words))
# dataset = MorphemeDataset(unique_words, break_word=f.partial(break_word, analyzer=analyzer))
# clean_text("sisi?,!")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 763155.75it/s]


In [8]:
#  = f.partial(break_word, analyzer=analyzer)
# get_unique_morphemes("kevin james")

In [9]:
# NOTE: This must be updated
#  in accordance with the data that is used to train the model

# word_vocab, morpheme_vocab # Vocab([ , 0..., zwa], len=569)

In [10]:
from experimental.sed.nn import SEDLanguageModel
from experimental.sed.modules.embeddings import SEDWordEmbeddings

# swe = SEDWordEmbeddings(
#     morpheme_vocab.size,
#     embedding_dim=100,
#     hidden_dim=32
# )

def lm_pad_collate(batch):
    """Right-pad the arrays of ``batch`` with zeros and stack them.

    Returns ``(stacked, lengths)``: a long tensor with one row per item, each
    padded to the largest first dimension in the batch, and a tensor with the
    original first-dimension sizes.

    NOTE: a later cell of this notebook defines another ``lm_pad_collate`` that
    expects ``(sequence, next_token)`` pairs and replaces this one.
    """
    lengths = [item.shape[0] for item in batch]
    longest = max(lengths)

    rows = []
    for item in batch:
        missing = longest - item.shape[0]
        if missing > 0:
            item = np.pad(item, (0, missing))
        rows.append(torch.from_numpy(item).long())

    return torch.stack(rows), torch.as_tensor(lengths)

In [11]:
from torch.utils.data import DataLoader
import re
import numpy as np
import torch

from marynlp.text.data import token
from marynlp.text.processor import TokenEncoder

from marynlp.text import formatter as fmt
from itertools import chain

from typing import List, Any


def split_text_by_space(text: str) -> Tuple[str]:
    """Clean ``text`` with ``fmt.white_space_cleaning`` and split it on whitespace runs.

    The value returned is the list produced by ``re.split``.
    """
    cleaned = fmt.white_space_cleaning(text)
    return re.split(r'\s+', cleaned)


@f.apply(list)
def text_pad_sequence(word_sequence: Iterable[Any], padding_length: int, pad_marker: Any):
    """Extend ``word_sequence`` with ``pad_marker`` up to ``padding_length`` items.

    A sequence that already has ``padding_length`` items or more is returned
    unchanged. ``word_sequence`` must support ``len()``.
    NOTE(review): ``f.apply(list)`` presumably converts the result to a list.
    """
    missing = padding_length - len(word_sequence)
    if missing <= 0:
        return word_sequence

    return chain(word_sequence, [pad_marker] * missing)

@f.apply(tuple)
@f.apply(lambda o: zip(*o))
def width_pad_sequence(var_sequences: Iterable[List[Any]], padding_length: int, pad_marker: Any):
    """Yield ``(padded_tuple, original_length)`` for every sequence.

    Each sequence is right-padded with ``pad_marker`` up to ``padding_length``;
    longer sequences are left as they are.
    NOTE(review): the decorators presumably transpose the yielded pairs into
    ``(all_padded_tuples, all_lengths)`` -- confirm ``f.apply``.
    """
    for var_seq in var_sequences:
        items = tuple(var_seq)
        # A negative repeat count gives an empty filler, so nothing is truncated.
        filler = (pad_marker,) * (padding_length - len(var_seq))
        yield items + filler, len(items)

        
# PAD_TOKEN = token('<PAD>')


# morph_encoder = TokenEncoder(morpheme_vocab)
# word_to_morpheme = f.partial(break_word, analyzer=analyzer)
# encode_morpheme = f.apply(np.array)(f.forEach(morph_encoder.encode)(word_to_morpheme))


# def pad_collateV2(batch):
#     """NOTE: This is the modified version from the package"""
#     x, x_len = [], []
    
#     x_len = [m.shape[0] for m in batch]
#     max_len = max(x_len)
    
#     x_batch_ = [np.pad(t, (0, max_len - t.shape[0])) if t.shape[0] < max_len else t for t in batch]
#     xx = torch.stack([torch.from_numpy(x).long() for x in x_batch_])

#     return xx, torch.as_tensor(x_len)



# @f.apply(list)
def morph_sentence_pad_collate(sentences: List[str], padding_idx: int = 0):
    """Turn a batch of sentences into padded morpheme-id grids.

    Every sentence is split into words, every word is broken up with the
    module-level ``word_to_morpheme`` and each resulting unit is encoded with
    ``morph_encoder.encode``. Words are padded to the largest unit count in
    the batch, and sentences to the largest word count, using ``padding_idx``.

    NOTE(review): ``morph_encoder`` is only assigned in commented-out code in
    this notebook; it must be defined before this function is called.

    Args:
        sentences: raw sentences of the batch (must not be empty).
        padding_idx: id used for every padded position.

    Returns:
        Three numpy arrays:
        - ids, shaped (sentences, max words, max units per word);
        - unit count of every word, 0 for padding words;
        - word count of every sentence.
    """
    word_sequences = list(map(split_text_by_space, sentences))

    # One list of broken-up words per sentence.
    out_list = [list(map(word_to_morpheme, words)) for words in word_sequences]

    # Largest number of units in a single word, and of words in a sentence.
    to_pad_morph_length = max(len(units) for broken_words in out_list for units in broken_words)
    to_pad_word_length = max(map(len, out_list))

    morph_padd_function = f.partial(width_pad_sequence, padding_length=to_pad_morph_length, pad_marker=padding_idx)

    word_morphemes_count = []  # units per word, per sentence
    words_tensors = []         # padded id grid, per sentence
    words_c = []               # number of words in a sentence

    for broken_words in out_list:
        word_list = [list(map(morph_encoder.encode, units)) for units in broken_words]
        padded_morph_sequence, lengths = morph_padd_function(word_list)

        # grid padding: add all-padding rows for the words this sentence lacks
        word_count = len(word_list)
        padd_words_count = to_pad_word_length - word_count
        filler_rows = (
            tuple([padding_idx] * to_pad_morph_length) for _ in range(padd_words_count)
        )

        words_tensors.append(np.array(list(chain(padded_morph_sequence, filler_rows))))
        # Padding words contain no units, hence a length of 0.
        word_morphemes_count.append(tuple(chain(lengths, [0] * padd_words_count)))
        words_c.append(word_count)

    return np.array(words_tensors), np.array(word_morphemes_count), np.array(words_c)

# def padded_input(pad_idx: int, padding_length: int):
#     return np.array([pad_idx] * padding_length)


# t, l = morpheme_pad_collate(list(MorphemeDataset(["mama", "anakuja"], encode_morpheme))); t

# split sentence
# lm = DataLoader()
# sentences = ["mama anakuja na mama", "mwalimu alikuwa anafundisha"]
# t, mcs, wc = morph_sentence_pad_collate(sentences); 
# t.shape, mcs.shape, wc.shape

# # set_pad = 
# # print(set_pad)
# t[1], mcs[1], wc[1]

In [12]:
# Display the analyzer instance currently bound in the kernel.
analyzer 

<experimental.sed.morphology.MorphologyAnalyzer at 0x7fa50898fd10>

### Showing Nelson how it's done

In [13]:
from marynlp import funcutils as f
from typing import Tuple

import os

from tqdm import tqdm
from itertools import chain

def split_text_by_space(text: str) -> Tuple[str]:
    """Clean ``text`` with ``fmt.white_space_cleaning`` and split it on whitespace runs.

    Re-declaration of the function of the same name defined in an earlier cell;
    the value returned is the list produced by ``re.split``.
    """
    cleaned = fmt.white_space_cleaning(text)
    return re.split(r'\s+', cleaned)

@f.forEach(f.calls(lambda s: s.strip(), split_text_by_space))
def get_word_sequences_from_txt_file(file_path: os.PathLike):
    """Return the lines of ``file_path`` (via ``read_file``) wrapped in a progress bar.

    NOTE(review): the decorator presumably strips every line and splits it into
    a word sequence -- confirm ``f.forEach`` / ``f.calls``.
    """
    lines = read_file(file_path)
    return tqdm(lines)

@f.forEach(str, type_=set)
def get_unique_words_from_txt_file(file_path: os.PathLike):
    """Flatten the word sequences of ``file_path`` into one stream of words.

    NOTE(review): the decorator presumably converts every word with ``str`` and
    collects the result into a set -- confirm ``f.forEach(..., type_=set)``.
    """
    word_sequences = get_word_sequences_from_txt_file(file_path)
    return chain.from_iterable(word_sequences)

In [82]:
from marynlp.text.data import Vocab

from itertools import chain

# Vocabulary source: union of the unique words of the training and validation files.
unique_words = get_unique_words_from_txt_file("../resources/nelson/train.txt") | get_unique_words_from_txt_file("../resources/nelson/valid.txt")
# unique_words = set(list(chain.from_iterable(list(map(split_text_by_space, sentences)))))
# print(unique_words)
# Slow step: the recorded progress bar of this cell shows roughly 78 seconds.
unique_morphemes = get_unique_morphemes(unique_words)

# print(unique_morphemes)
word_vocab = Vocab(unique_words)
morpheme_vocab = Vocab(unique_morphemes)


# Display both vocabularies.
word_vocab, morpheme_vocab

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41515/41515 [00:00<00:00, 93767.90it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3347/3347 [00:00<00:00, 97418.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29343/29343 [01:18<00:00, 374.16it/s]


(Vocab([UNK, a..., zurich], len=29343), Vocab([a, aa..., zzy], len=5230))

In [83]:
from marynlp.text.processor import PaddedTokenEncoder

# Token encoders over the word vocabulary and the morpheme vocabulary.
word_tokenizer = PaddedTokenEncoder(word_vocab)
morpheme_tokenizer = PaddedTokenEncoder(morpheme_vocab)

In [84]:
# Display both encoders.
word_tokenizer, morpheme_tokenizer

(TokenEncoder(<PAD>=0, <UNK>=1..., zurich=29344, l=29345),
 TokenEncoder(<PAD>=0, <UNK>=1..., zzy=5231, l=5232))

In [85]:
# Fit the embeddings
import numpy as np
from experimental.sed.modules.embeddings import SEDWordEmbeddings

def break_word(word: str, analyzer: MorphologyAnalyzer) -> Tuple[str]:
    """Return, as a tuple, the entry ``analyzer.break_text`` produces for ``word``.

    Re-declaration of the ``break_word`` defined in an earlier cell of this
    notebook. ``break_text`` is called with a one-element list and its result
    is indexed by ``word``.
    """
    breakdown = analyzer.break_text([word])
    return tuple(breakdown[word])

# Rebuild the analyzer from the downloaded template file.
analyzer = MorphologyAnalyzer(morph_template_file)


# The first positional argument is the size reported by the morpheme tokenizer.
# NOTE(review): the exact meaning of embedding_dim / hidden_dim is defined by
# SEDWordEmbeddings -- confirm there.
swe = SEDWordEmbeddings(
    morpheme_tokenizer.size,
    embedding_dim=100,
    hidden_dim=32
)

# word -> break_word(word, analyzer=analyzer)
word_to_morpheme = f.partial(break_word, analyzer=analyzer)
# NOTE(review): presumably breaks the word, encodes every unit with the
# morpheme tokenizer and wraps the result in np.array -- confirm f.apply / f.forEach.
encode_morpheme = f.apply(np.array)(f.forEach(morpheme_tokenizer.encode)(word_to_morpheme))
# One dataset item per token returned by word_tokenizer.get_tokens().
morpheme_dataset = MorphemeDataset(word_tokenizer.get_tokens(), break_word=encode_morpheme)

In [86]:

# for i in morpheme_dataset:
#     print(i)
# NOTE: `embeddings` is rebound from the value returned by swe.fit to a torch
# tensor, so re-running only the second line converts an existing tensor.
embeddings = swe.fit(morpheme_dataset)
embeddings = torch.tensor(embeddings)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [01:55<00:00,  1.01s/it]


In [88]:
# Compare the number of embedding rows with the size shown by the word tokenizer.
embeddings.shape, word_tokenizer


(torch.Size([29345, 100]),
 TokenEncoder(<PAD>=0, <UNK>=1..., zurich=29344, l=29345))

### Finally, the LM

In [89]:
from typing import Any

def next_token_generate(word_sequence: Iterable[Any]) -> Tuple[List[Any], Any]:
    """Split a sequence into ``(everything but the last item, last item)``.

    The context part is returned as a tuple. An empty sequence raises
    ``IndexError``.
    """
    items = tuple(word_sequence)
    context, target = items[:-1], items[-1]
    return context, target
            
def next_token_sequence_generator(word_sequence: Iterable[Any]) -> Iterable[Tuple[List[Any], Any]]:
    """Return a one-element list holding the ``next_token_generate`` pair of the sequence."""
    pair = next_token_generate(word_sequence)
    return [pair]

from functools import wraps
from itertools import chain

def flow(iterator_fn):
    """Decorator that lazily flattens, by one level, the iterable of iterables returned by ``iterator_fn``."""
    @wraps(iterator_fn)
    def wrapper(*args, **kwargs):
        nested = iterator_fn(*args, **kwargs)
        return chain.from_iterable(nested)
    return wrapper

@f.apply(list)
@flow
def generate_word_next_token_sequence(word_sequences: Iterable[List[str]]):
    """Yield the ``next_token_sequence_generator`` output of every word sequence.

    ``flow`` flattens the yielded lists into one stream of pairs.
    NOTE(review): ``f.apply(list)`` presumably materialises that stream as a list.
    """
    yield from map(next_token_sequence_generator, word_sequences)


In [90]:
from torch.utils.data import Dataset
from collections.abc import Callable

class LMDataset(Dataset):
    """Dataset of ``(word sequence, next token)`` pairs for language modelling."""

    def __init__(self, word_sequences: Iterable[Tuple[List[str], str]], word_encode: Callable = None):
        """Store the pairs and the optional encoder.

        Args:
            word_sequences: any iterable of ``(word_sequence, next_token)`` pairs.
            word_encode: callable applied to every word and to the next token;
                when None the items are returned unencoded.
        """
        # Materialise so that generators/iterators support len() and indexing.
        self.ls = list(word_sequences)
        self.encode = word_encode

    def __getitem__(self, ix: int):
        """Return ``(list of encoded words, encoded next token)`` for pair ``ix``."""
        word_seq, next_token = self.ls[ix]
        if self.encode is None:
            # The declared default (None) means "no encoding", not a crash.
            return list(word_seq), next_token
        return [self.encode(word) for word in word_seq], self.encode(next_token)

    def __len__(self):
        """Number of stored pairs."""
        return len(self.ls)

In [91]:
@f.apply(list)
def set_proper_UNK_in_word_sequence(word_sequence):
    """Wrap every word in ``token``, replacing the literal ``UNK`` with ``<UNK>`` first.

    NOTE(review): ``f.apply(list)`` presumably collects the yielded tokens into a list.
    """
    for word in word_sequence:
        yield token('<UNK>' if word == 'UNK' else word)
            
# NOTE(review): presumably applies set_proper_UNK_in_word_sequence to every
# word sequence read from the file -- confirm f.forEach.
get_better_sequences_from_txt_file = f.forEach(set_proper_UNK_in_word_sequence)(get_word_sequences_from_txt_file)
# word_sequences = get_better_sequences_from_txt_file("../resources/nelson/train.txt")

# Intended pipeline: file path -> token sequences -> (context, next token) pairs
# -> LMDataset encoding with word_tokenizer.encode.
# NOTE(review): the composition order relies on f.apply -- confirm.
generate_word_sequences_from_file = f.apply(generate_word_next_token_sequence)(get_better_sequences_from_txt_file)
build_dataset_from_file = f.apply(f.partial(LMDataset, word_encode=word_tokenizer.encode))(generate_word_sequences_from_file)
                            

In [94]:
# Build the training and validation datasets from their text files.
train_dataset = build_dataset_from_file("../resources/nelson/train.txt") 
val_dataset = build_dataset_from_file("../resources/nelson/valid.txt")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41515/41515 [00:00<00:00, 99548.54it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3347/3347 [00:00<00:00, 90860.66it/s]


In [95]:
def lm_pad_collate(batch: List[
    Tuple[ List[int], int ]
]):
    """Collate ``(id sequence, next id)`` pairs into ``(padded sequences, next ids)``.

    Sequences are right-padded by ``pad_sequence`` (batch first) to the longest
    one in the batch. Replaces the ``lm_pad_collate`` defined in an earlier cell.
    """
    sequences, next_tokens = zip(*batch)
    sequence_tensors = [torch.tensor(sequence) for sequence in sequences]
    padded_sequences = pad_sequence(sequence_tensors, batch_first=True)
    return padded_sequences, torch.tensor(next_tokens)  # word_sequence, next_token

# model.parameters()

In [130]:
import torch.optim as optim
import torch.nn.functional as F

from torch.nn.utils.rnn import pad_sequence

from time import time

# Training configuration shared by the cells below.
BATCH_SIZE = 64
EPOCHS = 10

# Both loaders use the (sequence, next token) collate function; no `shuffle`
# argument is passed, so the DataLoader default applies.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=lm_pad_collate)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=lm_pad_collate)


In [131]:
# NOTE: `word_seqs` is only assigned in a cell further down this notebook,
# so this cell fails on a fresh top-to-bottom run.
word_tokenizer, len(word_seqs)

(TokenEncoder(<PAD>=0, <UNK>=1..., zurich=29344, l=29345), 41515)

In [132]:
import torch
from pathlib import Path
import numpy as np

from time import time

import torch.nn.functional as F

class Trainer():
    """Minimal train/validate loop that checkpoints the model's state dict.

    After every epoch the model is validated. A checkpoint is written whenever
    the validation loss improves, and optionally training stops early after a
    number of epochs without improvement.
    """

    def __init__(self, model, model_name: str, checkpoint_folder_path: str):
        """Setting the configuration for training the model properly

        Args:
            model: module to train; called as ``model(word_sequences)``.
            model_name: prefix of every checkpoint file name.
            checkpoint_folder_path: folder to create for the checkpoints; it
                must not exist yet.
        """
        self.model = model
        self.model_name = model_name
        # Default loss, used when a step receives ``criterion=None``.
        self.criterion = F.cross_entropy

        checkpoint_folder_path = Path(checkpoint_folder_path)
        assert not checkpoint_folder_path.exists(), "The folder exists. Delete the folder or enter a new path"

        # make the folder (and any missing parent)
        checkpoint_folder_path.mkdir(parents=True, exist_ok=True)

        self.checkpoint_folder_path = str(checkpoint_folder_path)

    @classmethod
    def result_from_checkpoint(cls, model):
        """Placeholder: restoring a trainer from a checkpoint is not implemented."""
        return None

    def save_model_checkpoint(self, info: str = "checkpoint"):
        """Save the model's state dict as ``<model_name>.<info>.pth``.

        The file is written inside the checkpoint folder and its path is
        returned as a string. The format matches ``load_model_checkpoint``.
        """
        saved_model_path = str(
            Path(self.checkpoint_folder_path).joinpath("{}.{}.pth".format(self.model_name, info))
        )
        torch.save(self.model.state_dict(), saved_model_path)
        print("Saved:", saved_model_path)
        return saved_model_path

    def load_model_checkpoint(self, model_weights):
        """Load a state dict stored at ``model_weights`` into the model."""
        self.model.load_state_dict(torch.load(Path(model_weights)))

    def _train_epoch_step(self, train_dataloader, criterion, optimizer, epoch=1, verbose=100):
        """Run one optimisation pass over ``train_dataloader``.

        Progress is printed every ``verbose`` batches and on the last batch.
        """
        if criterion is None:
            criterion = self.criterion

        # The validation step leaves the model in eval mode; switch back.
        self.model.train()

        start_time = time()
        t_batch = len(train_dataloader)

        for batch_idx, (word_sequences, y) in enumerate(train_dataloader):
            optimizer.zero_grad()

            output = self.model(word_sequences)
            loss = criterion(output, y)

            loss.backward()
            optimizer.step()

            is_last_batch = batch_idx + 1 == t_batch
            # `verbose` of 0/None disables the periodic report instead of dividing by zero.
            if (verbose and batch_idx % verbose == 0) or is_last_batch:
                print('Train Epoch: {:03d} [{:03d}/{:03d} ({:.0f}%)]\tLoss: {:.6f}\tPerplexity: {:5.2f}\telapsed: {:.2f} mins'.format(
                    epoch, (batch_idx + 1), t_batch,
                    100. * ((batch_idx + 1) / t_batch), loss.item(), np.exp(loss.item()), (time() - start_time)/60)
                )

    def _val_epoch_step(self, val_dataloader, criterion):
        """Return the loss averaged over the batches of ``val_dataloader``, rounded to 4 decimals."""
        if criterion is None:
            criterion = self.criterion

        print("-"*60)

        self.model.eval()
        n_batches = len(val_dataloader)
        test_loss = 0.0

        with torch.no_grad():
            for word_sequences, y in val_dataloader:
                output = self.model(word_sequences)
                test_loss += criterion(output, y).item() / n_batches

        print('Test set: Average loss: {:.4f} | Perplexity: {:8.2f}\n'.format(test_loss, np.exp(test_loss)))
        return round(test_loss, 4)

    def train(self,
                train_dataloader,
                val_dataloader,
                epochs=EPOCHS,
                verbose=100,
                criterion=F.cross_entropy,
                lr=.01,
                early_stop=None
             ):
        """Train for up to ``epochs`` epochs, validating after each one.

        Args:
            train_dataloader: yields ``(word_sequences, y)`` training batches.
            val_dataloader: yields ``(word_sequences, y)`` validation batches.
            epochs: maximum number of epochs.
            verbose: report the training loss every ``verbose`` batches.
            criterion: loss called as ``criterion(output, y)``.
            lr: learning rate of the AdamW optimizer.
            early_stop: stop after this many consecutive epochs without a
                better validation loss; None disables early stopping.
        """
        optimizer = optim.AdamW(self.model.parameters(), lr=lr)
        best_loss = float('inf')
        stop_eps = 0  # consecutive epochs without improvement
        start_exp_time = time()

        for epoch in range(1, epochs + 1):
            # making the training step
            self._train_epoch_step(train_dataloader, criterion, optimizer, epoch=epoch, verbose=verbose)

            test_loss = self._val_epoch_step(val_dataloader, criterion)

            if test_loss < best_loss:
                # Save the model
                self.save_model_checkpoint("{}_loss_{:.02f}".format(epoch, test_loss))
                best_loss = test_loss
                stop_eps = 0
            else:
                stop_eps += 1

            print("-"*60)
            epoch_time_elapsed = (time() - start_exp_time)/60
            print("End of epoch {}/{} | Time elapsed: {} mins | val_loss: {}".format(epoch, epochs, epoch_time_elapsed, test_loss))
            print("-"*60)

            # Early stopping
            if early_stop is not None and stop_eps >= early_stop:
                # Save the model
                self.save_model_checkpoint("{}_early".format(epoch))
                break

## Training Pipeline

In [133]:
# Actual training going on
# Trainer.__init__ asserts that the checkpoint folder does not exist yet,
# hence the cleanup before constructing it.
!rm -rf ./sed_checkpoint

model = SEDLanguageModel(embeddings)
trainer = Trainer(model, "sed_language_model", "./sed_checkpoint")

In [134]:
# NOTE(review): the traceback recorded for this cell mentions `test_loader`,
# an attribute the Trainer class shown in this notebook never reads; the output
# presumably comes from an earlier version of the class -- re-run to confirm.
trainer.train(train_dataloader, val_dataloader, epochs=10, verbose=100)

------------------------------------------------------------


AttributeError: 'Trainer' object has no attribute 'test_loader'

In [151]:
# Predict the token that follows a short sample text.
sample_sentence = "jumapili"
# NOTE(review): presumably splits the text on whitespace and encodes every word -- confirm f.forEach.
encoder_sentence = f.forEach(word_tokenizer.encode)(split_by_space)

model.eval()

with torch.no_grad():
    # unsqueeze adds the batch dimension in front
    inp_seq = torch.tensor(encoder_sentence(sample_sentence)).unsqueeze(dim=0)
    out = model(inp_seq)
    out = F.softmax(out, dim=1)

    print(out.shape)    
    # argmax without `dim` works on the flattened tensor; the printed shape has a single row
    out = torch.argmax(out)
    word = word_tokenizer.decode(out.item())
    
    print(word)

torch.Size([1, 29345])
<UNK>


In [154]:
# Token sequences of the training file, used for the frequency analysis below.
word_seqs = get_better_sequences_from_txt_file("../resources/nelson/train.txt")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41515/41515 [00:00<00:00, 42115.63it/s]


In [210]:
from collections import Counter
from itertools import chain

# Every token of every sequence, flattened lazily.
all_words_iter = chain.from_iterable(word_seqs)
# Last token of every sequence, i.e. the item next_token_generate returns as the target.
words_occur_last_iter = tuple(word_seq[-1] for word_seq in word_seqs)

# Occurrences over all positions vs. occurrences in last position only.
main_counter = Counter(all_words_iter)
last_token_occurance_counter = Counter(words_occur_last_iter)

In [None]:
# Sum of all last-token counts.
total_word_count = sum(last_token_occurance_counter.values())

print("Token\t|\t%")
print("-"*25)
# The second column is the fraction count / total; it is not scaled to 100.
for i, c in last_token_occurance_counter.most_common(8):
    share = c / total_word_count
    print("{}\t|\t{:.06f}".format(i, share))

In [None]:
# Full ranking of last tokens (most_common is called without a limit).
last_token_occurance_counter.most_common()