<a href="https://colab.research.google.com/github/lkarjun/malayalam-language-model/blob/main/Malayalam-Language-Model/malayalam-language-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dataset Downloading

In [1]:
!pip install -qq dvc[gdrive]

!dvc get https://github.com/lkarjun/malayalam-language-model \
Datasets/

!unzip -q 'Datasets/*.zip' -d Datasets/

[K     |████████████████████████████████| 401 kB 28.5 MB/s 
[K     |████████████████████████████████| 44 kB 3.6 MB/s 
[K     |████████████████████████████████| 280 kB 63.0 MB/s 
[K     |████████████████████████████████| 49 kB 7.6 MB/s 
[K     |████████████████████████████████| 109 kB 69.8 MB/s 
[K     |████████████████████████████████| 48 kB 6.8 MB/s 
[K     |████████████████████████████████| 41 kB 54 kB/s 
[K     |████████████████████████████████| 217 kB 73.3 MB/s 
[K     |████████████████████████████████| 133 kB 63.5 MB/s 
[K     |████████████████████████████████| 180 kB 72.3 MB/s 
[K     |████████████████████████████████| 548 kB 62.6 MB/s 
[K     |████████████████████████████████| 287 kB 76.3 MB/s 
[K     |████████████████████████████████| 4.5 MB 60.1 MB/s 
[K     |████████████████████████████████| 1.1 MB 67.0 MB/s 
[K     |████████████████████████████████| 3.6 MB 63.4 MB/s 
[K     |████████████████████████████████| 63 kB 2.4 MB/s 
[K     |██████████████████████████

## Malayalam Language Model

### Imports

In [2]:
!pip install -qq fastai==2.5.3 transformers tokenizers ohmeow-blurr

[K     |████████████████████████████████| 189 kB 31.4 MB/s 
[K     |████████████████████████████████| 3.5 MB 61.0 MB/s 
[K     |████████████████████████████████| 6.8 MB 60.0 MB/s 
[K     |████████████████████████████████| 91 kB 14.0 MB/s 
[K     |████████████████████████████████| 56 kB 5.9 MB/s 
[K     |████████████████████████████████| 895 kB 57.6 MB/s 
[K     |████████████████████████████████| 596 kB 66.1 MB/s 
[K     |████████████████████████████████| 67 kB 6.7 MB/s 
[K     |████████████████████████████████| 1.2 MB 59.3 MB/s 
[K     |████████████████████████████████| 43 kB 2.7 MB/s 
[K     |████████████████████████████████| 311 kB 63.6 MB/s 
[K     |████████████████████████████████| 243 kB 68.9 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
# Log in to the Hugging Face Hub from the shell. The tokenizer is later loaded with
# `use_auth_token=True`, which presumably relies on the credentials stored by this command.
!huggingface-cli login

In [5]:
from fastai.text.all import *

from transformers import (BertLMHeadModel,
                          BertConfig,
                          PreTrainedTokenizerFast)

from blurr.modeling.language_modeling import (HF_LMBeforeBatchTransform, 
                                              HF_CausalLMInput, 
                                              CausalLMStrategy)

from blurr.modeling.core import (HF_BaseModelWrapper, 
                                 HF_TextBlock)

import pandas as pd
from tqdm import tqdm

# Directory holding the CSV index files read by this notebook.
# NOTE(review): absolute path under /content — presumably assumes a Colab runtime; confirm
# before running elsewhere.
DATASET = Path("/content/Datasets/")

# Enable tqdm's pandas integration with a red progress bar.
tqdm.pandas(colour = 'red')

In [42]:
# Load the project's fast tokenizer by repository id; `use_auth_token=True`
# asks transformers to send the locally stored Hub credentials with the request.
MalayalamTokenizer = PreTrainedTokenizerFast.from_pretrained(
    "lkarjun/malayalam-language-model",
    use_auth_token=True,
)

### Dataset Loading

In [26]:
def open_text_files(f: str):
  """Return the full contents of the file at path `f`, decoded as UTF-8."""
  with Path(f).open(mode="r", encoding="utf-8") as handle:
    return handle.read()

In [8]:
# CSV index files under DATASET, stacked into one frame with a fresh 0..n-1 index.
files = ['article_files.csv', 'magazine_files.csv', 'wikitext_files.csv']
df = pd.concat(
    (pd.read_csv(DATASET / csv_name) for csv_name in files),
    ignore_index=True,
)

In [164]:
# Keep only the first 1000 rows as a small working subset;
# the bare expression below displays its row count.
sample = df.head(1000)
len(sample)

1000

### Model Config

Default Bert Config 

`BertConfig(vocab_size=30522, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act='gelu', hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, position_embedding_type='absolute', use_cache=True, classifier_dropout=None, **kwargs)`

In [126]:
# BERT configuration for the Malayalam causal language model.
# - vocab_size: `len(tokenizer)` counts the base vocabulary plus any added tokens, whereas
#   `tokenizer.vocab_size` excludes added tokens. Sizing the embedding table from the full
#   length keeps every id the tokenizer can emit inside the table.
# - pad_token_id: taken from the tokenizer rather than left at BertConfig's default of 0.
# - is_decoder=True: configures the model as a decoder, as used with BertLMHeadModel.
CONFIG = BertConfig(
            vocab_size=len(MalayalamTokenizer),
            pad_token_id=MalayalamTokenizer.pad_token_id,
            is_decoder=True,
            name_or_path = "lkarjun/malayalam-language-model",
            )


In [127]:
# Build the LM-head model directly from CONFIG; no `from_pretrained` call is made here,
# so no checkpoint weights are loaded by this cell.
LModel = BertLMHeadModel(CONFIG)

### Dataloader

In [128]:
# Batch sizes (`*_bs`) and sequence lengths (`*_sl`), presumably for the
# training and validation loaders respectively.
train_bs = 200
val_bs = 256
train_sl = 250
val_sl = 300

In [129]:
# Seeded random split of the full `df` with 10% held out for validation; the bare
# `splits` expression displays the resulting pair of index lists.
# NOTE(review): the DataBlock defined later builds its own RandomSplitter, so this value
# looks unused in the visible cells — confirm.
splits = RandomSplitter(valid_pct=.1, seed=7)(df)
splits

((#31941) [3766,31980,26266,28238,1806,26070,5042,5295,130,25751...],
 (#3549) [26955,5036,13403,35141,30641,11352,31865,12251,2872,31166...])

In [155]:
# Before-batch transform wired to the config, tokenizer and model objects of this
# notebook, using the causal-LM strategy.
# NOTE(review): hf_arch is passed as None — confirm blurr does not need an
# architecture name here.
before_batch_tfm = HF_LMBeforeBatchTransform(
    hf_arch=None,
    hf_config=CONFIG,
    hf_tokenizer=MalayalamTokenizer,
    hf_model=LModel,
    lm_strategy_cls=CausalLMStrategy,
)

In [157]:
# Text block that applies `before_batch_tfm` and returns inputs typed as HF_CausalLMInput.
block = HF_TextBlock(
    before_batch_tfm=before_batch_tfm,
    input_return_type=HF_CausalLMInput,
)
          

In [160]:
# DataBlock for the language-model data:
# - get_items reads the `file_path` column of the source frame,
# - get_x loads the text of each path via `open_text_files`,
# - the splitter holds out 10% of items with a fixed seed.
Mdblock = DataBlock(
    blocks=block,
    get_items=ColReader("file_path"),
    get_x=open_text_files,
    splitter=RandomSplitter(valid_pct=.1, seed=7),
)

In [165]:
# Build the DataLoaders from the 1000-row `sample` frame.
# NOTE(review): bs=2 / seq_len=32 are used here rather than the train_bs / val_bs /
# train_sl / val_sl values defined earlier — presumably a quick smoke-test setting; confirm.
dls = Mdblock.dataloaders(sample, bs=2, seq_len = 32)

In [166]:
# Pull a single batch to inspect the tensors produced by the pipeline.
dls.one_batch()

({'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
  'input_ids': tensor([[    1,  8668, 60382,  ...,  7554,    19,     2],
          [    1, 12567, 47071,  ...,     3,     3,     3]], device='cuda:0'),
  'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')},)