<a href="https://colab.research.google.com/github/marutdevsharma/Applied-Data-Science-For-Beginners/blob/main/tamil_word_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# RESPONSIBLE AI SOURCE CODE LICENSE
# https://www.licenses.ai/source-code-license

Demo notebook for the accompanying Medium article.

## Install Pre-requisites

In [None]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install sentencepiece
!pip install gdown
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.2 MB/s[0m eta [36m0:00:0

### Imports

In [None]:
import torch
from transformers import (
    LlamaForCausalLM, LlamaConfig, LlamaTokenizer,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from datasets import load_dataset
import sentencepiece as spm
import os
import logging
import json
import sys
import argparse

# Configure the root logger so messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)


### Language Tokens

We settle for a character-level tokenizer here. LLM training typically uses subword tokens instead.

In [None]:
# Special tokens plus the period that terminates every name in the dataset.
USER_DEFINED_SYMBOLS = ["<pad>", "<s>", "</s>", "<mask>", "."]

# Independent vowels, followed by the aytham sign.
vowels = [
    "அ", "ஆ", "இ", "ஈ", "உ", "ஊ", "எ", "ஏ", "ஐ", "ஒ", "ஓ", "ஔ", "ஃ"
]
# Consonants, including Grantha letters and two multi-codepoint conjuncts.
consonants = [
    "க", "ங", "ச", "ஞ", "ட", "ண", "த", "ந", "ப", "ம", "ய", "ர", "ற",
    "ன", "ல", "ள", "ழ", "வ", "ஷ", "ஸ", "ஹ", "க்ஷ", "ஜ", "ஶ", "ஸ்ரீ"
]
# Dependent vowel signs; the last entry is the pulli (virama).
dependent_vowels = [
    "ா", "ி", "ீ", "ு", "ூ", "ெ", "ே", "ை", "ொ", "ோ", "ௌ", "்"
]

# Build a NEW list instead of aliasing USER_DEFINED_SYMBOLS: extending an alias
# would mutate the constant and append duplicates every time the cell is re-run.
symbols = [*USER_DEFINED_SYMBOLS, *vowels, *consonants, *dependent_vowels]
print(symbols)

['<pad>', '<s>', '</s>', '<mask>', '.', 'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ', 'ஃ', 'க', 'ங', 'ச', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ப', 'ம', 'ய', 'ர', 'ற', 'ன', 'ல', 'ள', 'ழ', 'வ', 'ஷ', 'ஸ', 'ஹ', 'க்ஷ', 'ஜ', 'ஶ', 'ஸ்ரீ', 'ா', 'ி', 'ீ', 'ு', 'ூ', 'ெ', 'ே', 'ை', 'ொ', 'ோ', 'ௌ', '்']


Download the names dataset from my Google Drive into the local `data/` folder.

In [None]:
# Make sure the local data folder exists (no error if it is already there).
os.makedirs("data", exist_ok=True)
# Work inside ./data so the download lands in that folder.
%cd ./data
# Fetch the names file from Google Drive by its file ID.
!gdown 1MKv9Ne3SXPPyvJGwg3dCjMJQ_gAxmged
# List the folder and preview the first 10 lines of the downloaded file.
%ls
!head -10 baby_names.txt
# Go back to the parent directory and show the current working directory.
%cd ..
%pwd

/content/data
Downloading...
From: https://drive.google.com/uc?id=1MKv9Ne3SXPPyvJGwg3dCjMJQ_gAxmged
To: /content/data/baby_names.txt
100% 329k/329k [00:00<00:00, 85.5MB/s]
baby_names.txt
பெண்,அகரயாழினி.
பெண்,அகநகை.
பெண்,அகல்.
பெண்,அகல்நிலா.
பெண்,அகல்விழி.
பெண்,அகவழகி.
பெண்,அங்கவை.
பெண்,அங்கயற்கண்ணி.
பெண்,அஞ்சம்மாள்.
பெண்,அஞ்சலை.
/content


'/content'

Generate the tokenizer model and the model's `config.json`. After this cell runs, a new folder named "names" will contain `config.json` and `tokenizer.model`. That folder is our model template.

In [None]:
def train_tokenizer(language, input_path, model_prefix, vocab_size):
    """Train a SentencePiece BPE tokenizer.

    Args:
        language: Symbols forwarded to the trainer as ``user_defined_symbols``.
        input_path: Training text file, forwarded as ``input``.
        model_prefix: Forwarded as ``model_prefix`` (the notebook later moves
            ``tokenizer.model`` after training with the prefix ``tokenizer``).
        vocab_size: Forwarded as ``vocab_size``.
    """
    trainer_options = {
        "input": input_path,
        "model_prefix": model_prefix,
        "vocab_size": vocab_size,
        "user_defined_symbols": language,
        "model_type": "BPE",
    }
    spm.SentencePieceTrainer.train(**trainer_options)

def move_tokenizer_to_folder(source, destination_folder):
    """Move a trained tokenizer file into a folder as ``tokenizer.model``.

    Args:
        source: Path of the existing tokenizer model file.
        destination_folder: Folder that should receive ``tokenizer.model``;
            it is created if it does not exist yet.

    Raises:
        OSError: If ``source`` does not exist or the move fails.
    """
    # Create the target folder so the helper also works on a fresh checkout.
    os.makedirs(destination_folder, exist_ok=True)
    # os.replace overwrites an existing target on every platform, so re-running
    # the cell is safe (os.rename raises on Windows when the target exists).
    os.replace(source, os.path.join(destination_folder, "tokenizer.model"))

def create_config_file(folder_path, content):
    """Write ``content`` as 4-space-indented JSON to ``<folder_path>/config.json``.

    Args:
        folder_path: Existing folder that will hold ``config.json``.
        content: JSON-serializable object to store.
    """
    target_path = os.path.join(folder_path, "config.json")
    with open(target_path, "w") as handle:
        handle.write(json.dumps(content, indent=4))

# Configuration for a tiny LLaMA-architecture causal language model.
config_content = {
    "_name_or_path": "./names_1m",
    "architectures": [
        "LlamaForCausalLM"
    ],
    # Special-token ids must match the trained tokenizer: decoding [1, 2, 3]
    # with it yields '<s></s><pad>', and encoded text starts with id 1.
    "bos_token_id": 1,
    "eos_token_id": 2,
    "hidden_act": "silu",
    "hidden_size": 64,
    "initializer_range": 0.02,
    "intermediate_size": 180,
    # Also used as the block size when tokenizing the training data.
    "max_position_embeddings": 32,
    "model_type": "llama",
    "num_attention_heads": 16,
    "num_hidden_layers": 8,
    "num_key_value_heads": 16,
    "pad_token_id": 3,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-06,
    "rope_scaling": None,
    "tie_word_embeddings": False,
    "torch_dtype": "float32",
    "transformers_version": "4.28.1",
    "use_cache": False,
    # Must equal the vocabulary size the tokenizer is trained with.
    "vocab_size": 58
}

# Folder that holds the model template: config.json plus tokenizer.model.
out_folder_path = "names"
os.makedirs(out_folder_path, exist_ok=True)
create_config_file(out_folder_path, config_content)
# Take the vocabulary size from the model config so the tokenizer and the
# model cannot silently drift apart when one of them is changed.
train_tokenizer(symbols, './data/baby_names.txt', 'tokenizer', config_content["vocab_size"])
move_tokenizer_to_folder("tokenizer.model", out_folder_path)

tokenizer = LlamaTokenizer.from_pretrained(out_folder_path)
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Quick check: tokenize a sample sentence padded/truncated to 16 tokens.
sample_sentence = "தமிழ் வாழ்க"
tokens = tokenizer(
                sample_sentence, truncation=True,
                padding='max_length', max_length=16)
print(f"Original Sentence: {sample_sentence}\nTokenized Sentence: {tokens}")



You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Original Sentence: தமிழ் வாழ்க
Tokenized Sentence: {'input_ids': [1, 56, 25, 28, 45, 35, 55, 56, 36, 44, 35, 55, 19, 2, 2, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}


In [None]:
# Decode token ids 1 through 10 to inspect which pieces they map to.
tokenizer.decode(list(range(1, 11)))

'<s></s><pad><mask>.அஆஇஈஉ'

Create a new LLaMA-architecture model from the config file, split the names dataset into train and test splits, and set up the Transformers `Trainer`.

In [None]:
def create_config_model(path):
    """Build a freshly initialised LLaMA causal LM from a saved config.

    Args:
        path: Location passed to ``LlamaConfig.from_pretrained``.

    Returns:
        The ``LlamaForCausalLM`` instance after it has been moved to the
        selected device. The parameter count is printed as a side effect.
    """
    model = LlamaForCausalLM(LlamaConfig.from_pretrained(path))

    # Device preference: MPS first, then CUDA, otherwise CPU.
    if torch.backends.mps.is_available():
        device_name = "mps"
    elif torch.cuda.is_available():
        device_name = "cuda"
    else:
        device_name = "cpu"
    model.to(torch.device(device_name))

    param_count = 0
    for tensor in model.parameters():
        param_count += tensor.numel()
    print(f"GPT Model size: {param_count/1000**2:.1f}M parameters")
    return model


def create_tokenized_dataset_splits(path, tokenizer, block_size, test_size=0.2, seed=42):
    """Load a text file, split it reproducibly and tokenize both splits.

    Args:
        path: Text file(s) passed to ``load_dataset('text', data_files=...)``.
        tokenizer: Callable tokenizer applied to each batch of ``text`` values.
        block_size: Length every example is padded/truncated to.
        test_size: Share of the data held out for the test split.
        seed: Seed used for both the shuffle and the train/test split, so the
            same split (and the same set of training names) is produced on
            every run.

    Returns:
        A tuple ``(tokenized_train, tokenized_test, train_names)`` where
        ``train_names`` is the set of training lines cut at the first ".".
    """
    dataset = load_dataset('text', data_files=path)
    shuffled_dataset = dataset['train'].shuffle(seed=seed)
    # Without an explicit seed the split differs on every run, which makes
    # both the evaluation loss and the "already seen" name set irreproducible.
    split_datasets = shuffled_dataset.train_test_split(test_size=test_size, seed=seed)

    def tokenize_batch(examples):
        # Pad/truncate every line to exactly block_size tokens.
        return tokenizer(
            examples['text'], truncation=True,
            padding='max_length', max_length=block_size
        )

    def tokenize_dataset(dataset):
        return dataset.map(tokenize_batch, batched=True)

    def unique_name_set(dataset):
        # Keep everything before the first "." of each line.
        return {example['text'].split(".")[0] for example in dataset}

    return (
        tokenize_dataset(split_datasets['train']),
        tokenize_dataset(split_datasets['test']),
        unique_name_set(split_datasets['train']),
    )

def train_model(model, tokenizer, train_dataset, test_dataset, out_folder_path):
    """Train ``model`` with the Hugging Face ``Trainer`` and save it.

    Args:
        model: Model to train; also the object whose ``save_pretrained`` is
            called at the end.
        tokenizer: Tokenizer handed to the language-modeling data collator.
        train_dataset: Dataset used for training.
        test_dataset: Dataset used for evaluation during training.
        out_folder_path: Output directory for checkpoints, logs (under
            ``logs``) and the final saved model.
    """
    # mlm=False: the collator is configured without masked-language-modeling.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    argument_values = {
        "output_dir": out_folder_path,
        "overwrite_output_dir": True,
        "num_train_epochs": 100,
        "per_device_train_batch_size": 8,
        "save_steps": 10000,
        "logging_steps": 10,
        "eval_steps": 1000,
        "logging_dir": f'{out_folder_path}/logs',
        "evaluation_strategy": "steps",
        "load_best_model_at_end": True,
        # Lower loss is better.
        "metric_for_best_model": "loss",
        "greater_is_better": False,
    }
    training_args = TrainingArguments(**argument_values)

    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=2,
        early_stopping_threshold=0.001,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        callbacks=[early_stopping],
    )
    trainer.train()

    model.save_pretrained(out_folder_path)



Start the training

In [None]:

# Build the model from the config stored in `out_folder_path`.
model = create_config_model(out_folder_path)

# Tokenize the train/test splits with a block size of 32 and keep the set of
# names that appear in the training split.
dataset_splits = create_tokenized_dataset_splits(
    'data/baby_names.txt', tokenizer, block_size=32
)
train_dataset, test_dataset, unique_names = dataset_splits

train_model(model, tokenizer, train_dataset, test_dataset, out_folder_path)

print("Training completed.")

GPT Model size: 0.4M parameters


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/6528 [00:00<?, ? examples/s]

Map:   0%|          | 0/1633 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
1000,2.1215,2.128783
2000,1.8012,1.776574
3000,1.6247,1.607673
4000,1.5328,1.495318
5000,1.4057,1.434254
6000,1.3508,1.387201
7000,1.2843,1.355279
8000,1.2315,1.327689
9000,1.2357,1.316398
10000,1.1302,1.308074


Training completed.


Use the trained model to generate 20 unique names in each category (male and female). The names are unique in the sense that none of them appears in the training split.

In [None]:
def generate_names(model, tokenizer, prompt, num_names=20, known_names=None, max_attempts=1000):
    """Sample and print novel names that start with ``prompt``.

    A sampled name is printed only if it has not been printed before during
    this call and is not contained in ``known_names``.

    Args:
        model: Causal LM exposing ``device`` and ``generate``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and
            ``eos_token_id``.
        prompt: Text prefix every generated sequence starts from.
        num_names: Number of distinct new names to print.
        known_names: Collection of names to reject. When ``None`` the
            notebook-level ``unique_names`` set is used, as before.
        max_attempts: Upper bound on sampling attempts, so the loop terminates
            even if the model cannot produce ``num_names`` new names.
    """
    if known_names is None:
        # Backward-compatible fallback to the set built from the training split.
        known_names = unique_names

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    attention_mask = torch.ones_like(input_ids).to(model.device)
    generated_names = set()
    attempts = 0

    with torch.no_grad():
        while len(generated_names) < num_names and attempts < max_attempts:
            attempts += 1
            output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=32,
                early_stopping=True,
                temperature=0.6,
                top_p=0.8,
                top_k=50,
                do_sample=True,
                output_scores=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.4,
                eos_token_id=tokenizer.eos_token_id
            )
            # Keep only the text before the first ".".
            output_str = tokenizer.decode(output[0], skip_special_tokens=True).split(".")[0]
            if output_str not in generated_names and output_str not in known_names:
                print(output_str)
                generated_names.add(output_str)

    if len(generated_names) < num_names:
        logging.warning(
            "Only %d of %d requested names were generated in %d attempts.",
            len(generated_names), num_names, attempts,
        )

# Gender prefixes used as generation prompts (male and female, each followed
# by the comma that separates the prefix from the name in the dataset).
male_names_prompt = "ஆண்,"
female_names_prompt = "பெண்,"

# Put the model into evaluation mode before sampling.
model.eval()
for gender_prompt in (male_names_prompt, female_names_prompt):
    generate_names(model, tokenizer, gender_prompt)



ஆண்,செந்தில்கொடையன்
ஆண்,கலைச்சிவன்
ஆண்,செந்தில்முரையன்
ஆண்,தமிழ்க்குரியன்
ஆண்,தமிழ்க்குரி
ஆண்,கலைத்தாயன்
ஆண்,தமிழ்க்குன்றல்
ஆண்,அருள்செல்வன்
ஆண்,பாலகுமாரன்
ஆண்,கலையருளன்
ஆண்,தமிழருவேலன்
ஆண்,திருவேலன்
ஆண்,செம்மணியன்
ஆண்,திருமாவளன்
ஆண்,தமிழ்க்குமரன்
ஆண்,செந்தில்மணி
ஆண்,திருவாயகன்
ஆண்,அம்புச்செல்வன்
ஆண்,அருளி
ஆண்,அமுதி
பெண்,தேனிசைமாமகள்
பெண்,காருதிலா
பெண்,திருவாயி
பெண்,காவிரியம்மை
பெண்,தேனிசைமா
பெண்,முத்துக்கோவி
பெண்,காரியம்மை
பெண்,தமிழரசு
பெண்,முத்தாயகி
பெண்,அருள்நேயம்
பெண்,செம்மலர்க்குமதி
பெண்,திருமதி
பெண்,திருவாய்மதி
பெண்,கார்முத்து
பெண்,சித்திரைநேயம்
பெண்,தேவமுரசு
பெண்,முத்தாயிர்
பெண்,அமினா
பெண்,காமிகா
பெண்,மாலிகா
