<a href="https://colab.research.google.com/github/lkarjun/malayalam-language-model/blob/main/Malayalam-Language-Model/malayalam-language-model_awd_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Malayalam Language Model AWD-LSTM

### Imports

In [4]:
!pip install -qq fastai==2.5.3 sentencepiece transformers datasets

In [2]:
from fastai.text.all import *
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset

In [3]:
# Load the "lkarjun/Malayalam-Articles" dataset; `get_sample` below reads its
# 'train' split from this notebook-global `dset`.
dset = load_dataset("lkarjun/Malayalam-Articles")

### Load datasets

In [40]:
def get_sample(seed = None, size = 4000):
    """Return a random sample of rows from the training split as a DataFrame.

    Parameters
    ----------
    seed : int or None
        Forwarded to pandas as ``random_state``. ``None`` (the default) draws
        a different sample on every call; any integer, including ``0``,
        makes the sample reproducible.
    size : int
        Number of rows drawn. Rows containing missing values are dropped
        *after* sampling, so the result may hold fewer than ``size`` rows.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of ``dset['train']`` with NaN rows removed.
    """
    # `DataFrame.sample` takes `random_state` (there is no `random_seed`
    # keyword), and `random_state=None` is the unseeded behaviour, so one call
    # covers both cases and a seed of 0 is no longer mistaken for "no seed".
    return dset['train'].to_pandas().sample(n=size, random_state=seed).dropna()


In [7]:
# Draw an unseeded sample with the default size and display its shape as a
# quick sanity check. `get_dls` draws its own sample and does not use this one.
train = get_sample()
train.shape

### Tokenizer

In [10]:
# Load the pretrained fast tokenizer identified by "lkarjun/malayalam-language-model".
# `tokenize`, `MlTokenizer` and the `vocab` list below are all built on it.
pre_tokenizer = PreTrainedTokenizerFast.from_pretrained("lkarjun/malayalam-language-model")

In [11]:
def tokenize(text):
  """Turn one string into a tensor of token ids.

  The text is split into sub-word tokens with the notebook-global
  `pre_tokenizer`, wrapped in '[CLS]' ... '[SEP]' markers, mapped to
  vocabulary ids and returned via `tensor`.

  Raises AssertionError ("check the input type") if `text` is not exactly
  a `str`.
  """
  assert type(text) is str, "check the input type"
  pieces = ['[CLS]', *pre_tokenizer.tokenize(text), '[SEP]']
  return tensor(pre_tokenizer.convert_tokens_to_ids(pieces))

In [12]:
class MlTokenizer(Transform):
    """fastai `Transform` that maps raw text to token-id tensors and back.

    `encodes` passes tensors through untouched (so pre-tokenized items are
    not tokenized twice) and sends anything else to the module-level
    `tokenize` function. `decodes` turns ids back into text with the wrapped
    tokenizer.

    NOTE: the methods are deliberately left without type annotations,
    because fastai's `Transform` uses parameter annotations for type dispatch.
    """

    def __init__(self, tokenizer):
        # Wrapped tokenizer; only `decodes` reads it.
        self.tokenizer = tokenizer

    def encodes(self, x):
        if isinstance(x, Tensor):
            return x
        return tokenize(x)

    def decodes(self, x):
        # Move the ids to CPU and hand them to the tokenizer as a numpy array.
        token_ids = x.cpu().numpy()
        return TitledStr(self.tokenizer.decode(token_ids))

In [13]:
# Transform instance handed to TfmdLists in `get_dls`.
tokenizer = MlTokenizer(pre_tokenizer)
# Token strings ordered by their id in the tokenizer's vocabulary.
vocab = [
    token
    for token, token_id in sorted(pre_tokenizer.vocab.items(), key=lambda pair: pair[1])
]

### DataLoaders

In [48]:
def get_dls(seed = None, size = 4000, bs=72, seq_len = 120):
    """Sample articles, tokenize them and build language-model DataLoaders.

    Parameters
    ----------
    seed : forwarded to `get_sample` to control which rows are drawn.
    size : forwarded to `get_sample` as the number of rows to draw.
    bs : batch size passed to `dataloaders`.
    seq_len : sequence length passed to `dataloaders`.

    Returns
    -------
    The DataLoaders built by `TfmdLists.dataloaders`, with `.vocab` set to the
    notebook-global `vocab` list.
    """
    sample_df = get_sample(seed, size)
    # Tokenize up front (with a progress bar); the `tokenizer` transform then
    # passes these tensors straight through in `encodes`.
    token_tensors = [tokenize(text) for text in progress_bar(sample_df['content'])]
    # Train/validation split with fixed arguments (.2, 7), so only the sampled
    # rows vary between calls, not the splitter's own configuration.
    train_valid_idxs = RandomSplitter(.2, 7)(sample_df)
    dls = TfmdLists(
        token_tensors,
        tokenizer,
        splits=train_valid_idxs,
        dl_type=LMDataLoader,
    ).dataloaders(bs=bs, seq_len=seq_len)
    dls.vocab = vocab
    return dls

In [19]:
# Build DataLoaders with get_dls' defaults (unseeded sample, size=4000, bs=72, seq_len=120).
mldls = get_dls()

In [21]:
# Eyeball a few decoded examples (max_n=4) to check that tokenization round-trips sensibly.
mldls.show_batch(max_n = 4)

### Training the Language Model

In [22]:
# Pull a single batch and display the shapes of its two tensors (inputs `x`, targets `y`).
x, y = mldls.one_batch()
x.shape, y.shape

In [23]:
# Interactive help lookup for `language_model_learner`; it has no effect on the
# training state and can be skipped or removed in a finished notebook.
doc(language_model_learner)

In [24]:
# AWD-LSTM language model trained from scratch (pretrained=False) on `mldls`,
# with a 0.3 dropout multiplier and perplexity reported as the metric.
learn = language_model_learner(
    mldls, AWD_LSTM, drop_mult=.3, pretrained=False, metrics=Perplexity()
)
# Switch the learner to fp16 (mixed-precision) training.
learn = learn.to_fp16()

In [54]:
# Run the learning-rate finder and show all four suggestion heuristics side by side.
learn.lr_find(suggest_funcs=[minimum, steep, valley, slide])

In [None]:
# Train in rounds: each round draws a fresh, unseeded 10,000-row sample,
# re-runs the LR finder on it, then fits several times at the `valley`
# suggestion.
# NOTE(review): because every round resamples and re-splits, a document used
# for validation in one round may have been trained on in an earlier one, so
# validation perplexity is not strictly comparable across rounds.
N_ROUNDS = 5          # number of fresh data samples to train on
FITS_PER_ROUND = 4    # `learn.fit` calls per sample
EPOCHS_PER_FIT = 2    # epochs per `learn.fit` call

# Named loop variables are used instead of `_` / `__`, which IPython reserves
# for its output history; assigning to them hides the previous cell outputs.
for round_idx in range(N_ROUNDS):
    learn.dls = get_dls(size=10000, bs = 100, seq_len = 125)
    rst = learn.lr_find(suggest_funcs=[minimum, steep, valley, slide])
    for fit_idx in range(FITS_PER_ROUND):
        learn.fit(EPOCHS_PER_FIT, rst.valley)