In [251]:
import sys
if "../" not in sys.path: sys.path.insert(0,"../");

In [252]:
# Training for the SED
from marynlp.modules.source import save_model_from_google_bucket
from marynlp.utils.storage import download, get_bucket


# Download the morphmeme file
bucket = get_bucket("../resources/mary_africa_credentials_key.json", "marynlp-private")
morph_template_file = save_model_from_google_bucket("models/sed_morpheme_template.txt", bucket)


In [253]:
morph_template_file

PosixPath('/home/iam-kevin/.marynlp/store/models/sed_morpheme_template.txt')

In [254]:
# Prepare the words
from marynlp import funcutils as f
from tqdm import tqdm

from typing import List, Union
import os
import re

def split_by_space(text: str) -> List[str]:
    """Split a text to word strings
    
    Example
    "Lorem ipsum" -> [ 'Lorem', 'ipsum' ] 
    """
    return re.split(r"\s+", text)

def ignore_rules(text: str) -> bool:
    return not (text.find("<text") > -1 or text.find("</text>") > -1)

def should_be_longer_than_20(text: str) -> bool:
    return len(text) > 20

@f.filterBy(f.rules(ignore_rules, should_be_longer_than_20))
def read_file(file_path) -> List[str]:
    with open(file_path, "r", encoding="utf-8") as rb:
        return rb.readlines()

@f.forEach(str, type_=set)
@f.filterBy(lambda s: len(s.strip()) > 0)
@f.flowBy(split_by_space)
@f.forEach(lambda s: s.strip())
def get_unique_words_from_shu_file(file_path: os.PathLike):
    return tqdm(read_file(file_path))


In [268]:
from torch.utils.data import Dataset
from typing import Iterable, Tuple

from experimental.sed import MorphologyAnalyzer
from marynlp import funcutils as f 

from collections.abc import Callable

def break_word(word: str, analyzer: MorphologyAnalyzer) -> Tuple[str]:
    return tuple([ su for su in analyzer.break_text([word])[word]])

# Might not want to add this since it's pytorch specific?
class MorphemeDataset(Dataset):
    def __init__(self, word_iter: Iterable[str], break_word: Callable):
        self.break_word = break_word
        self.wml = list(word_iter)
        
    def __getitem__(self, index: int):
        return self.break_word(self.wml[index])
    
    def __len__(self):
        return len(self.wml)

In [256]:
# unique_morphemes

In [273]:
from marynlp.text.data import Vocab, morph
from marynlp.text import formatter as fmt
from itertools import chain
# from marynlp.utils import process as p

unique_words = get_unique_words_from_shu_file("../resources/data/hcs-na-v2/new-mat/news/alasiri-2009.shu")
analyzer = MorphologyAnalyzer(morph_template_file)

# start with remove punctuations
clean_text = f.calls(fmt.remove_punctuations, fmt.white_space_cleaning, fmt.lowercase)

@f.apply(f.calls(chain.from_iterable, set))
def get_unique_morphemes(words: Iterable[str]):
    breaker = f.calls(clean_text, f.partial(break_word, analyzer=analyzer))
    for w in words:
        try:
            yield breaker(w)
        except Exception as e:
            print("Unable to break word: '%s'" % w)
            raise e

            
# unique_morphemes = get_unique_morphemes(unique_words)
# morpheme_vocab = Vocab(unique_morphemes)
# word_vocab = Vocab(map(clean_text, unique_words))
# dataset = MorphemeDataset(unique_words, break_word=f.partial(break_word, analyzer=analyzer))
# clean_text("sisi?,!")

100%|███████████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 269003.59it/s]


In [270]:
analyzer_break_word = f.partial(break_word, analyzer=analyzer)

m_l = analyzer_break_word("kevin james")
m_l, list(map(morph_encoder.encode, m_l))

(('e', 'vin', 'jam', 'e', 's'), [101, -1, 217, 101, 434])

In [271]:
#  = f.partial(break_word, analyzer=analyzer)
# get_unique_morphemes("kevin james")

('e', 'vin', 'jam', 'e', 's')

In [272]:
# NOTE: This must be updated the data
#  in accordance to the data that is used to train the data

word_vocab, morpheme_vocab # Vocab([ , 0..., zwa], len=569)

(Vocab([alikuwa, anafundisha..., na], len=6), Vocab([ , 0..., zwa], len=569))

In [262]:
from experimental.sed.nn import SEDLanguageModel
from experimental.sed.modules.embeddings import SEDWordEmbeddings

swe = SEDWordEmbeddings(
    morpheme_vocab.size,
    embedding_dim=100,
    hidden_dim=32
)

def lm_pad_collate(batch):
    x, x_len = [], []
    
    x_len = [m.shape[0] for m in batch]
    max_len = max(x_len)
    
    x_batch_ = [np.pad(t, (0, max_len - t.shape[0])) if t.shape[0] < max_len else t for t in batch]
    xx = torch.stack([torch.from_numpy(x).long() for x in x_batch_])

    return xx, torch.as_tensor(x_len)

In [339]:
from torch.utils.data import DataLoader
import re
import numpy as np
import torch

from marynlp.text.data import token
from marynlp.text.processor import TokenEncoder

from marynlp.text import formatter as fmt
from itertools import chain

from typing import List, Any


def split_text_by_space(text: str) -> Tuple[str]:
    return re.split(r'\s+', fmt.white_space_cleaning(text)) 


@f.apply(list)
def text_pad_sequence(word_sequence: Iterable[Any], padding_length: int, pad_marker: Any):
    if len(word_sequence) >= padding_length:
        return word_sequence
    
    return chain.from_iterable((word_sequence, [pad_marker] * (padding_length - len(word_sequence))))

@f.apply(tuple)
@f.apply(lambda o: zip(*o))
def width_pad_sequence(var_sequences: Iterable[List[Any]], padding_length: int, pad_marker: Any):
    for var_seq in var_sequences:
        out = tuple(var_seq)
        yield tuple(chain.from_iterable((out, [pad_marker] * (padding_length - len(var_seq))))), len(out)

        
PAD_TOKEN = token('<PAD>')


morph_encoder = TokenEncoder(morpheme_vocab)
word_to_morpheme = f.partial(break_word, analyzer=analyzer)
encode_morpheme = f.apply(np.array)(f.forEach(morph_encoder.encode)(word_to_morpheme))


# def pad_collateV2(batch):
#     """NOTE: This is the modified version from the package"""
#     x, x_len = [], []
    
#     x_len = [m.shape[0] for m in batch]
#     max_len = max(x_len)
    
#     x_batch_ = [np.pad(t, (0, max_len - t.shape[0])) if t.shape[0] < max_len else t for t in batch]
#     xx = torch.stack([torch.from_numpy(x).long() for x in x_batch_])

#     return xx, torch.as_tensor(x_len)



# @f.apply(list)
def morph_sentence_pad_collate(sentences: List[str], padding_idx: int = 0):
    word_sequences = list(map(split_text_by_space, sentences))
#     print(word_sequences)
    
    # padd each sequence
    out_list = [list(map(word_to_morpheme, b)) for b in word_sequences]
    print(out_list)

    # get the longest character sequence in the entire batch
    to_pad_morph_length = len(max(list(map(f.partial(max, key=len), out_list)), key=len))
    to_pad_word_length = max(map(len, out_list))

    morph_padd_function = f.partial(width_pad_sequence, padding_length=to_pad_morph_length, pad_marker=padding_idx)
    
#         print("m_idx_seq", padd_function(m_idx_seq))
    
    # hold the number of words and morphme pair count
    word_morphemes_count = []
    words_tensors = []
    words_c = []  # number of words in a sentence

    for b in out_list:
        word_list = [ list(map(morph_encoder.encode, b_)) for b_ in b ]
#         print(word_list)
        
        padded_morph_sequence, lengths = morph_padd_function(word_list)
        
#         c, _ = morpheme_pad_collate(list(map(encode_morpheme, b)))
#         word_count, longest_token_len = c.shape[0], c.shape[-1]

        # grid padding
        word_count = len(word_list)
        padd_words_count = to_pad_word_length - word_count
        out = (tuple([0] * to_pad_morph_length) for _ in range(padd_words_count))    
        
        words_tensors.append(np.array(list(chain(padded_morph_sequence, out))))
#         word_morphemes_count.append(np.array(tuple(chain(lengths, [0] * padd_words_count))))
        word_morphemes_count.append(
            tuple(chain(
                lengths, 
                [0] * padd_words_count # padding
            )))
    
        words_c.append(word_count)

    padded_length_tensors = np.array(word_morphemes_count)
#         # get the remaining shape
# #         print(c)
#     print(out)
                             
    # padd the sentence objects
#     padded_length_tensors = pad_sequence(word_morphemes_count, batch_first=True)

    return np.array(words_tensors), np.array(padded_length_tensors), np.array(words_c)

# def padded_input(pad_idx: int, padding_length: int):
#     return np.array([pad_idx] * padding_length)


# t, l = morpheme_pad_collate(list(MorphemeDataset(["mama", "anakuja"], encode_morpheme))); t

# split sentence
# lm = DataLoader()
sentences = ["mama anakuja na mama", "mwalimu alikuwa anafundisha"]
t, mcs, wc = morph_sentence_pad_collate(sentences); 
t.shape, mcs.shape, wc.shape

# set_pad = 
# print(set_pad)
t[1], mcs[1], wc[1]

[[('ma', 'ma'), ('anak',), ('n', 'a'), ('ma', 'ma')], [('m', 'wali', 'mu'), ('ali', 'wa'), ('ana', 'fu', 'ndisha')]]


(array([[ 5, 11,  7],
        [ 1, 10,  0],
        [ 2,  4,  9],
        [ 0,  0,  0]]),
 array([3, 2, 3, 0]),
 3)

In [292]:
morpheme_pad_collateV2([np.array([3, 4, 5]), np.array([3, 2])])

(tensor([[3, 4, 5],
         [3, 2, 0]]),
 tensor([3, 2]))

### Showing Nelson how its done

In [320]:
from marynlp.text.processor import TokenEncoder
from marynlp.text.data import Vocab

from itertools import chain

sentences = ["mama anakuja na mama", "mwalimu alikuwa anafundisha"]

unique_words = set(list(chain.from_iterable(list(map(split_text_by_space, sentences)))))
print(unique_words)
unique_morphemes = get_unique_morphemes(unique_words)

print(unique_morphemes)
word_vocab = Vocab(unique_words)
morpheme_vocab = Vocab(unique_morphemes)

word_tokenizer = TokenEncoder(word_vocab)
morpheme_tokenizer = TokenEncoder(morpheme_vocab)


{'alikuwa', 'mwalimu', 'anafundisha', 'na', 'anakuja', 'mama'}
{'ndisha', 'ali', 'ana', 'a', 'wali', 'n', 'mu', 'm', 'anak', 'wa', 'ma', 'fu'}


In [321]:
# Fit the embeddings
from experimental.sed.modules.embeddings import SEDWordEmbeddings
swe = SEDWordEmbeddings(
    morpheme_vocab.size,
    embedding_dim=100,
    hidden_dim=32
)

word_to_morpheme = f.partial(break_word, analyzer=analyzer)
encode_morpheme = f.apply(np.array)(f.forEach(morph_encoder.encode)(word_to_morpheme))
morpheme_dataset = MorphemeDataset(word_vocab.get_tokens(), break_word=encode_morpheme)

In [353]:

# for i in morpheme_dataset:
#     print(i)
embeddings = swe.fit(morpheme_dataset)
embeddings = torch.tensor(embeddings)

100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 45.85it/s]


In [354]:
embeddings.shape, word_vocab


(torch.Size([6, 100]), Vocab([alikuwa, anafundisha..., na], len=6))

In [96]:
from torch.nn.utils.rnn import pad_sequence


pad_sequence(
    [
        torch.tensor([
            [307, 307],
            [ 40,   0],
            [342,  11],
            [307, 307]
        ]), 
        torch.tensor([[2, 1, 2, 2]]),        
    ]
)

RuntimeError: The size of tensor a (2) must match the size of tensor b (4) at non-singleton dimension 1

In [438]:
# Training the LM like a boss
# TODO: padding_idx is 0 and first token index is 0: FIX THIS!!
from torch.nn.utils.rnn import pad_sequence
word_encoded = [torch.tensor(list(map(word_tokenizer.encode, words))) for words in map(split_by_space, sentences)]; word_encoded
# word_encoded = [np.array(list(map(word_tokenizer.encode, words))) for words in map(split_by_space, sentences)]; word_encoded

print(word_encoded)
# pad_sequence(word_encoded, batch_first=True)
seq = pad_sequence(word_encoded, batch_first=True)

[tensor([3, 2, 5, 3]), tensor([4, 0, 1])]


tensor([[3, 2, 5, 3],
        [4, 0, 1, 0]])

In [427]:
embeddings.shape[-1]

100

In [424]:
seq_layer = SEDSequenceLayer(
                word_vocab.size, 
                embedding_dim=embeddings.shape[-1], 
                comp_fn=None
            )

batched_emb_sequence = torch.matmul(torch.nn.functional.one_hot(seq).to(torch.float), embeddings)
batched_emb_sequence.shape

out = seq_layer.predict_proba(batched_emb_sequence)
torch.argmax(out, dim=1), out

tensor([[3, 2, 5, 3],
        [4, 0, 1, 0]])


(tensor([1, 1]),
 tensor([[0.1621, 0.1968, 0.1646, 0.1567, 0.1579, 0.1618],
         [0.1711, 0.1890, 0.1596, 0.1528, 0.1631, 0.1644]],
        grad_fn=<SoftmaxBackward>))

In [391]:

# one_hot = torch.nn.functional.one_hot(torch.tensor([3, 2, 5, 3])); one_hot
one_hot = torch.nn.functional.one_hot(seq); 
one_hot.shape, embeddings.unsqueeze(dim=0).shape

(torch.Size([2, 4, 6]), torch.Size([1, 6, 100]))

In [405]:
# %timeit torch.matmul(torch.nn.functional.one_hot(seq).to(torch.float), embeddings).shape
v2 = torch.matmul(torch.nn.functional.one_hot(seq).to(torch.float), embeddings)

In [406]:
# %timeit torch.tensor(np.array([np.array([embeddings[b].numpy() for b in g]) for g in seq])).shape
v1 = torch.tensor(np.array([np.array([embeddings[b].numpy() for b in g]) for g in seq]))

tensor([[[True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True],
         [True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, T

In [71]:
## Install Pytorch Lightning kama bado
# !pip install pytorch-lightning
import torch

# from pytorch_lightning import Trainer
# from experimental.sed.nn import SEDWordEmbeddingLayer

# NOTE: I think When training and changing the model to training mode (`.train()`),
# all the models weights are unfrozen (so it )
sed_lang_model = SEDLanguageModel(
                    # This is important to get the same model shape
                    swe.swe_layer,
                    word_count=word_vocab.size
                 )


sed_lang_model(
    torch.tensor([[[307, 307],
        [ 40,   0],
        [342,  11],
        [307, 307]]]),
    torch.tensor([[4, 2]])
)


torch.Size([1, 4, 2]) torch.Size([1, 2])


IndexError: index_select(): Index is supposed to be a vector

## Training Pipeline

In [None]:
class Trainer():
    def __init__(self):
        """Setting the configuration for training the model properly"""
        pass
    
    @classmethod
    def result_from_checkpoint(cls, model):
        """Reset from the checking"""
        pass
    
    def train(self, model, train_dataloader, val_dataloader, test_dataloader):
        pass


class Pipeline(object):
    def __init__(self, train_data, test_data, model, save_path=None, resume_from_checkpoint=False, model_weights=None):
        
        self.model = model            
        self.resume_train = resume_from_checkpoint
        self.model_weights = model_weights

        if self.resume_train:
            assert model_weights is not None, "specify model weights save location"
            self.load_model_checkpoint(self.model_weights)

        if ModelsConfig.lang_mod:
            DataConfig.dataloader_params['collate_fn'] = lambda x: lm_pad_collate(
                x, 
                lang_mod=True,
                frozen_embeddings=EmbeddingsConfig.freeze_embeddings,
                )
        
        self.train_loader = DataLoader(train_data, **DataConfig.dataloader_params)
        self.test_loader = DataLoader(test_data, **DataConfig.dataloader_params)

        self.criterion = F.cross_entropy
        self.optimizer = optim.AdamW(self.model.parameters(), lr=ModelsConfig.learning_rate)
        
        self.iter_meter = IterMeter()
        self.save_path = save_path
        self.model_checkpoints = []

    def train(self, epoch, verbose):
        self.model.train()
        
        start_time = time()
        data_len = len(self.train_loader.dataset)

        for batch_idx, _data in enumerate(self.train_loader):
            x, x_len, y = _data 

            y = [y_.to(ModelsConfig.device) for y_ in [y]]
            y = [y_.reshape(y_.shape[0],) for y_ in y]

            self.optimizer.zero_grad()

            output = self.model((x, x_len))
            loss = self.criterion(output, y[0])
            loss.backward()

            self.optimizer.step()
            if batch_idx % verbose == 0 or batch_idx == data_len:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tPerplexity: {:5.2f}\telapsed: {:.2f} mins'.format(
                    epoch, batch_idx * len(x), data_len,
                    100. * batch_idx / len(self.train_loader), loss.item(), np.exp(loss.item()), (time()-start_time)/60))#, loss2.item()))\tap_Loss: {:.6f}
                
    def test(self, epoch):
        print('\nevaluating…')
        self.model.eval()
        test_loss = 0
        with torch.no_grad():
            for I, _data in enumerate(self.test_loader):
                x, x_len, y = _data  

                y = [y_.to(ModelsConfig.device) for y_ in [y]]
                y = [y_.reshape(y_.shape[0],) for y_ in y]

                output = self.model((x, x_len))
                loss = torch.sum(torch.stack([self.criterion(out, y) for out,y in zip([output], y)]))
                test_loss += loss.item() / len(self.test_loader)

        print('Test set: Average loss: {:.4f} | Perplexity: {:8.2f}\n'.format(test_loss, np.exp(test_loss)))

        return round(test_loss, 4), self.model.state_dict()
        
    def train_model(self, early_stop=3, verbose=10):
        
        accum_loss = torch.tensor(float('inf')) if not self.resume_train else torch.tensor(float(re.findall("\d+\.\d+", self.model_weights)[0]))
        stop_eps = 0 
        try:
            for epoch in range(1, ModelsConfig.EPOCHS + 1):
                self.train(epoch, verbose)             
                test_loss, w8 = self.test(epoch)
                
                if test_loss < accum_loss:
                    self.model_checkpoints.append((w8, test_loss))
                    accum_loss = test_loss
                    stop_eps = 0
                else:
                    stop_eps += 1

                if stop_eps >= early_stop:
                    self.save_model_checkpoint()
                    break

        except KeyboardInterrupt:
            self.save_model_checkpoint() if self.model_checkpoints else print("first epoch not completed, model checkpoint will not be saved")

    def save_model_checkpoint(self):
        best_model, accum_loss = self.model_checkpoints[-1]

        if self.save_path is not None and not ModelsConfig.lang_mod:
            torch.save(best_model, Path(self.save_path).joinpath(f'{accum_loss}_SAM.pth'))
        
        if self.save_path is not None and ModelsConfig.lang_mod:
            torch.save(best_model, Path(self.save_path).joinpath(f'{accum_loss}_lm.pth'))

    def load_model_checkpoint(self, model_weights):
        self.model.load_state_dict(torch.load(Path(model_weights)))     

In [None]:
# Training pipeline
from 

swl = SEDLanguageModel(swe.swe_layer, word_count=word_vocab.size)
swl.train()
        
start_time = time()
data_len = len(self.train_loader.dataset)

for batch_idx, _data in enumerate(self.train_loader):
    x, x_len, y = _data 

    y = [y_.to(ModelsConfig.device) for y_ in [y]]
    y = [y_.reshape(y_.shape[0],) for y_ in y]

    self.optimizer.zero_grad()

    output = self.model((x, x_len))
    loss = self.criterion(output, y[0])
    loss.backward()

    self.optimizer.step()
    if batch_idx % verbose == 0 or batch_idx == data_len:
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tPerplexity: {:5.2f}\telapsed: {:.2f} mins'.format(
            epoch, batch_idx * len(x), data_len,
            100. * batch_idx / len(self.train_loader), loss.item(), np.exp(loss.item()), (time()-start_time)/60))#, loss2.item()))\tap_Loss: {:.6f}
                