In [None]:
import os
import sys
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer.utils import preprocess_batch, postprocess_batch
from IndicTransTokenizer.tokenizer import IndicTransTokenizer

# Hugging Face checkpoint ids for each translation direction; the trailing
# comment on each line names the smaller distilled alternative.
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B" # ai4bharat/indictrans2-en-indic-dist-200M
indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B" # ai4bharat/indictrans2-indic-en-dist-200M
indic_indic_ckpt_dir = "ai4bharat/indictrans2-indic-indic-1B"   # ai4bharat/indictrans2-indic-indic-dist-320M
BATCH_SIZE = 4  # number of sentences sent to the model per generate() call
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# The quantization mode comes from the first command-line argument, but only
# when it is one of the recognised modes. Inside a Jupyter kernel sys.argv[1]
# is typically a launcher flag (e.g. "-f"), which must not be mistaken for a
# mode, so every unrecognised value falls back to "" (no quantization).
_SUPPORTED_QUANTIZATION = ("4-bit", "8-bit")
if len(sys.argv) > 1 and sys.argv[1] in _SUPPORTED_QUANTIZATION:
    quantization = sys.argv[1]
else:
    quantization = ""

In [2]:
def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    """Load a seq2seq checkpoint and the IndicTrans tokenizer for one direction.

    Args:
        ckpt_dir: Checkpoint id or path handed to
            ``AutoModelForSeq2SeqLM.from_pretrained``.
        direction: Direction string handed to ``IndicTransTokenizer``
            (this notebook uses ``"en-indic"``).
        quantization: ``"4-bit"`` or ``"8-bit"`` to load through a
            ``BitsAndBytesConfig``; any other value loads unquantized weights.

    Returns:
        ``(tokenizer, model)`` with the model switched to eval mode.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # BitsAndBytesConfig only defines double-quant / compute-dtype options
        # for 4-bit loading; the former bnb_8bit_* keywords were not real
        # options, so only the flag that actually selects 8-bit is passed.
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    tokenizer = IndicTransTokenizer(direction=direction)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig
    )

    # Only the unquantized model is moved and cast here; the quantized paths
    # are left exactly as from_pretrained returned them.
    if qconfig is None:
        model = model.to(DEVICE)
        # Cast to fp16 only on GPU: half-precision support on CPU is
        # incomplete in some torch versions and can fail during generate().
        if DEVICE == "cuda":
            model = model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer):
    """Translate a list of sentences in slices of ``BATCH_SIZE``.

    Each slice is run through ``preprocess_batch``, tokenized, sent to
    ``model.generate`` with sampling enabled, decoded, and finally passed to
    ``postprocess_batch`` together with the entity map from preprocessing.
    Because ``do_sample=True`` the output is not deterministic.

    Args:
        input_sentences: Sentences to translate.
        src_lang: Source language code forwarded to ``preprocess_batch``.
        tgt_lang: Target language code forwarded to ``preprocess_batch`` and
            ``postprocess_batch``.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used for both encoding and ``batch_decode``.

    Returns:
        A list of translated strings in input order (empty for empty input).
    """
    generation_options = dict(
        use_cache=True,
        min_length=0,
        max_length=256,
        do_sample=True,
        temperature=0.2,
        top_p=0.95,
        top_k=50,
        repetition_penalty=1.2,
    )

    results = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        chunk = input_sentences[start : start + BATCH_SIZE]

        # Normalise the text; entity_map is needed again after decoding.
        chunk, entity_map = preprocess_batch(chunk, src_lang=src_lang, tgt_lang=tgt_lang)

        # Encode the source side, padded to the longest sentence of the slice.
        encoded = tokenizer(
            chunk,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        )
        encoded = encoded.to(DEVICE)

        # No gradients are needed for generation.
        with torch.no_grad():
            output_ids = model.generate(**encoded, **generation_options)

        # Decode on the target side of the tokenizer.
        decoded = tokenizer.batch_decode(output_ids.detach().cpu().tolist(), src=False)

        # Restore entities and append this slice's translations.
        results.extend(
            postprocess_batch(decoded, lang=tgt_lang, placeholder_entity_map=entity_map)
        )

        # Drop the input tensors and release cached GPU memory before the next slice.
        del encoded
        torch.cuda.empty_cache()

    return results

In [None]:
# Load the English -> Indic model and tokenizer. The checkpoint id repeats the
# value already assigned in the configuration cell. The empty quantization
# string selects the unquantized branch of initialize_model_and_tokenizer,
# so the module-level `quantization` value is not used here.
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B" # ai4bharat/indictrans2-en-indic-dist-200M
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
    en_indic_ckpt_dir, "en-indic", ""
)


In [4]:
def chunk_and_translate(content):
    """Translate English text to Hindi one sentence at a time.

    The text is cut only where a period is followed by whitespace, and the
    period stays attached to its sentence. Cutting on every ``.`` (the
    previous behaviour) tore apart decimals, URLs and similar tokens
    ("3.14" became "3" and "14") and removed every sentence terminator
    before translation.

    Args:
        content: English text; may contain several sentences.

    Returns:
        The translated sentences joined with single spaces, or ``""`` when
        ``content`` has no non-blank sentence.
    """
    import re  # local import keeps this cell self-contained

    # Lookbehind keeps the "." on the left-hand sentence; \s+ consumes the gap.
    pieces = re.split(r"(?<=\.)\s+", content)
    sentences = [piece.strip() for piece in pieces if piece.strip()]
    if not sentences:
        return ""

    translations = batch_translate(sentences, "eng_Latn", "hin_Deva", en_indic_model, en_indic_tokenizer)
    return " ".join(translations)

In [5]:
from datasets import load_dataset
def preprocess(sample):
    """Replace the ``content`` of every turn in ``sample["messages"]`` with its translation.

    The turn dicts are updated in place, so all other keys of each turn are
    kept as they are.

    Returns:
        ``{"messages": <the updated list of turns>}``.
    """
    turns = sample["messages"]
    for turn in turns:
        turn.update(content=chunk_and_translate(turn["content"]))

    return {"messages": turns}


# Load no_robots and translate every message of every split, one example at a
# time (batched=False). This is the expensive step of the notebook: each
# example triggers model generation through preprocess -> chunk_and_translate.
dataset = load_dataset("HuggingFaceH4/no_robots")
# A bare name in the middle of a cell displays nothing; only the last one does.
dataset
dataset = dataset.map(
    preprocess,
    batched=False
)
dataset
# dataset["train"] = dataset["train_sft"]
# dataset["test"] = dataset["test_sft"]
# del(dataset["train_sft"])
# del(dataset["test_sft"])
# print(dataset)
# print(dataset["train"][0])

Map:   0%|          | 0/9500 [00:00<?, ? examples/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category'],
        num_rows: 9500
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category'],
        num_rows: 500
    })
})

In [6]:
dataset

DatasetDict({
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category'],
        num_rows: 9500
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category'],
        num_rows: 500
    })
})

In [9]:
dataset["train_sft"][0]

{'prompt': 'Please summarize the goals for scientists in this text:\n\nWithin three days, the intertwined cup nest of grasses was complete, featuring a canopy of overhanging grasses to conceal it. And decades later, it served as Rinkert’s portal to the past inside the California Academy of Sciences. Information gleaned from such nests, woven long ago from species in plant communities called transitional habitat, could help restore the shoreline in the future. Transitional habitat has nearly disappeared from the San Francisco Bay, and scientists need a clearer picture of its original species composition—which was never properly documented. With that insight, conservation research groups like the San Francisco Bay Bird Observatory can help guide best practices when restoring the native habitat that has long served as critical refuge for imperiled birds and animals as adjacent marshes flood more with rising sea levels. “We can’t ask restoration ecologists to plant nonnative species or to 

In [10]:
# Upload the translated dataset to the Hub as a private repository.
dataset.push_to_hub("hindi_instruct_v0", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
# original prompt with suffix. Respond in Hindi for tasks summarization, Rewrite, Generation, 
dataset

DatasetDict({
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category'],
        num_rows: 9500
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category'],
        num_rows: 500
    })
})

In [134]:
# Reload the untranslated English dataset; the translated messages are
# attached to it as an extra column in a later cell.
orig_dataset = load_dataset("HuggingFaceH4/no_robots")
orig_dataset

DatasetDict({
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category'],
        num_rows: 9500
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category'],
        num_rows: 500
    })
})

In [135]:
set(orig_dataset["train_sft"]["category"])

{'Brainstorm',
 'Chat',
 'Classify',
 'Closed QA',
 'Coding',
 'Extract',
 'Generation',
 'Open QA',
 'Rewrite',
 'Summarize'}

In [136]:
# Attach the translated conversations as a "hindi_messages" column next to the
# English "messages". Rows are paired purely by position, so `dataset` must
# still have the same row order as `orig_dataset` for each split.
for split in ["train_sft", "test_sft"]:
    orig_dataset[split] = orig_dataset[split].add_column(name="hindi_messages", column=dataset[split]["messages"])

In [137]:
orig_dataset["train_sft"][0]

{'prompt': 'Please summarize the goals for scientists in this text:\n\nWithin three days, the intertwined cup nest of grasses was complete, featuring a canopy of overhanging grasses to conceal it. And decades later, it served as Rinkert’s portal to the past inside the California Academy of Sciences. Information gleaned from such nests, woven long ago from species in plant communities called transitional habitat, could help restore the shoreline in the future. Transitional habitat has nearly disappeared from the San Francisco Bay, and scientists need a clearer picture of its original species composition—which was never properly documented. With that insight, conservation research groups like the San Francisco Bay Bird Observatory can help guide best practices when restoring the native habitat that has long served as critical refuge for imperiled birds and animals as adjacent marshes flood more with rising sea levels. “We can’t ask restoration ecologists to plant nonnative species or to 

In [138]:
# Remove coding examples
# from datasets import concatenate_datasets
# def code_preprocess(sample):
#     for i, message_tuple in enumerate(zip(sample["messages"], sample["hindi_messages"])):
#         english_message, hindi_message = message_tuple
#         if i%2!=0:
#             hindi_message["content"] = english_message["content"]
            
#     return {"hindi_messages": sample["hindi_messages"]}

# def code_sample_handling(dataset):
#     non_codesubset = dataset.filter(lambda example: example["category"]!="Coding")
#     code_subset = dataset.filter(lambda example: example["category"]=="Coding")
#     code_subset = code_subset.map(
#         code_preprocess,
#         batched=False
#     )
#     dataset = concatenate_datasets([non_codesubset, code_subset])
#     return dataset

# Drop every example whose category is "Coding" from both splits (this is used
# instead of the commented-out approach above, which kept them).
for split in ["train_sft", "test_sft"]:
    orig_dataset[split] = orig_dataset[split].filter(lambda example: example["category"]!="Coding")


In [139]:
orig_dataset

DatasetDict({
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'hindi_messages'],
        num_rows: 9166
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'hindi_messages'],
        num_rows: 484
    })
})

In [140]:
import random
def bernoulli_sample(p):
    """Run one Bernoulli trial: return 1 with probability ``p``, otherwise 0."""
    draw = random.random()
    return int(draw < p)

# Demo: one draw with success probability 0.2
p = 0.2
bernoulli_sample(p)

0

In [141]:
# Instruction suffixes appended to user turns.
hindi_reply_prompt = "Hindi mein jawab dena."
english_reply_prompt = "{prefix} in Hindi."
cot_prompt = "First write in English and then translate to Hindi"
cot_hindi_prompt = "Pehle English mein likhna, phir Hindi mein translate karna."

def add_system_prompt(sample):
    """Randomly turn a translated conversation into an instruction-suffixed variant.

    Despite its name, this function adds no system message. It rewrites
    ``sample["hindi_messages"]`` in place in one of three ways:

    * with probability 0.2, every user turn becomes the English user text plus
      a "reply in Hindi" suffix (romanized Hindi or English wording, chosen
      50/50); assistant turns keep their Hindi translation;
    * otherwise, with probability 0.2, every user turn becomes the English
      user text plus a "write in English, then translate" suffix, and every
      assistant turn becomes the English reply followed by its Hindi
      translation;
    * otherwise the conversation is left untouched.

    Turns are selected by their ``role`` rather than by index parity, so a
    conversation that opens with a system turn is handled correctly; turns
    with any other role are left as they are.

    NOTE: a later cell of this notebook defines another function with this
    same name, which replaces this one in the kernel.

    Args:
        sample: Row with ``messages`` (English), ``hindi_messages``
            (translated, same length and order) and ``category``.

    Returns:
        ``{"hindi_messages": <the possibly rewritten list of turns>}``.
    """
    p = 0.2
    language_p = 0.5
    # All four draws are always taken, in this order, regardless of the outcome.
    suffix_hindi_prompt = bernoulli_sample(p)
    suffix_cot_prompt = bernoulli_sample(p)
    use_hindi_reply_prompt = bernoulli_sample(language_p)
    use_hindi_cot_prompt = bernoulli_sample(language_p)

    if suffix_hindi_prompt:
        if use_hindi_reply_prompt:
            user_suffix = hindi_reply_prompt
        else:
            prefix = "Summarize" if sample["category"] == "Summarize" else "Reply"
            user_suffix = english_reply_prompt.format(prefix=prefix)
        rewrite_assistant = False
    elif suffix_cot_prompt:
        user_suffix = cot_hindi_prompt if use_hindi_cot_prompt else cot_prompt
        rewrite_assistant = True
    else:
        return {"hindi_messages": sample["hindi_messages"]}

    for english_turn, hindi_turn in zip(sample["messages"], sample["hindi_messages"]):
        role = hindi_turn["role"]
        if role == "user":
            hindi_turn["content"] = f"{english_turn['content']} {user_suffix}"
        elif role == "assistant" and rewrite_assistant:
            hindi_turn["content"] = f"{english_turn['content']}\nTranslation of the above into Hindi:\n{hindi_turn['content']}"

    return {"hindi_messages": sample["hindi_messages"]}

# Apply add_system_prompt to every example. The function draws random numbers,
# so re-running this cell gives a different result, and it is applied on top
# of whatever "hindi_messages" currently holds.
orig_dataset = orig_dataset.map(
    add_system_prompt,
    batched=False
)
orig_dataset

Map:   0%|          | 0/9166 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

DatasetDict({
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'hindi_messages'],
        num_rows: 9166
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'hindi_messages'],
        num_rows: 484
    })
})

In [142]:
# Count the training rows whose first message contains each instruction suffix.
def _first_turn_has(marker):
    """Build a filter predicate: is ``marker`` inside the first turn's content?"""
    return lambda example: marker in example["hindi_messages"][0]["content"]

for _marker in (
    cot_hindi_prompt,
    cot_prompt,
    hindi_reply_prompt,
    english_reply_prompt.format(prefix="Summarize"),
    english_reply_prompt.format(prefix="Reply"),
):
    print(orig_dataset["train_sft"].filter(_first_turn_has(_marker)))



Filter:   0%|          | 0/9166 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'category', 'hindi_messages'],
    num_rows: 694
})


Filter:   0%|          | 0/9166 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'category', 'hindi_messages'],
    num_rows: 725
})


Filter:   0%|          | 0/9166 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'category', 'hindi_messages'],
    num_rows: 932
})


Filter:   0%|          | 0/9166 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'category', 'hindi_messages'],
    num_rows: 34
})


Filter:   0%|          | 0/9166 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'category', 'hindi_messages'],
    num_rows: 917
})


In [143]:
# Print the first matching conversation for each instruction suffix, each
# followed by a 400-dash separator line.
def _first_turn_has(marker):
    """Build a filter predicate: is ``marker`` inside the first turn's content?"""
    return lambda example: marker in example["hindi_messages"][0]["content"]

_separator = "-" * 400
for _marker in (
    cot_hindi_prompt,
    cot_prompt,
    hindi_reply_prompt,
    english_reply_prompt.format(prefix="Summarize"),
    english_reply_prompt.format(prefix="Reply"),
):
    _matches = orig_dataset["train_sft"].filter(_first_turn_has(_marker))
    print(_matches[0]["hindi_messages"])
    print(_separator)


[{'content': 'Write a funny, short story about someone who will stop at nothing to go green and reduce their carbon footprint. \n Pehle English mein likhna, phir Hindi mein translate karna.', 'role': 'user'}, {'content': 'Garry has a real green thumb, and taking care of the environment is extremely important to him. He takes every possible step to cut his carbon footprint, and it inspires and mystifies everyone around him in equal measure. Each night before he goes to bed, he cuts off the power to his house so that his electronic devices don’t draw idle power. \n\nAlmost all of his appliances are quite old, as Garry wants to get as much use as possible out of them before contributing to new pollution-generating production. The only exceptions are his new, water-efficient dishwasher and his solar-powered hot water tank. \n\nThe Homeowner’s Association hates Garry because he grows corn on his front lawn. All of his food is locally sourced as much as possible, and he commutes to work on a

In [162]:
# Drop the English-side columns so only 'category' and 'hindi_messages' remain.
# This cell is not re-runnable: a second run fails because the columns are gone.
for split in ["train_sft", "test_sft"]:
    orig_dataset[split] = orig_dataset[split].remove_columns(['prompt', 'prompt_id', 'messages'])
orig_dataset  

DatasetDict({
    train_sft: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 9166
    })
    test_sft: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 484
    })
})

In [163]:
# Instruction lines placed before the text in the translation examples.
# Suffix convention: _en = English wording, _hn = romanized Hindi wording,
# _hi = Devanagari wording.
# The "hingligh" spelling in several names is kept on purpose: later cells
# refer to these exact identifiers.
english_to_hinglish_prompt_en = "Translate the following from English into Hinglish."
hingligh_to_english_prompt_en = "Translate the following from Hinglish into English."
english_to_hinglish_prompt_hn = "English se Hinglish mein translate kare."
hingligh_to_english_prompt_hn = "Hinglish se English mein translate kare."
hindi_to_hinglish_prompt_hi = "निम्नलिखित का हिंदी से हिंग्लिश में अनुवाद करें।"
hingligh_to_hindi_prompt_hi = "निम्नलिखित का हिंग्लिश से हिंदी में अनुवाद करें।"
hindi_to_hinglish_prompt_hn = "Hindi se Hinglish mein translate kare."
hingligh_to_hindi_prompt_hn = "Hinglish se Hindi mein translate kare."


In [165]:
# Hinglish (Code-Mixing) and Transliteration datasets
# add the transliteration datasets from Google. 
# https://github.com/google-research-datasets/Hinglish-TOP-Dataset
# https://huggingface.co/datasets/cmu_hinglish_dog
# Hinge
# These names were previously imported only in commented-out lines, so this
# cell raised NameError on a fresh kernel. Import them explicitly.
from datasets import DatasetDict, concatenate_datasets, load_dataset

# Build the Hinge dataset from CSV files in the working directory: train.csv
# and valid.csv are merged into "train", test.csv becomes "test". The csv
# loader puts the rows of a single file under the "train" key.
hinge_ds = DatasetDict()
hinge_ds["train"] = concatenate_datasets(
    [
        load_dataset("csv", data_files="train.csv")["train"],
        load_dataset("csv", data_files="valid.csv")["train"],
    ]
)
hinge_ds["test"] = load_dataset("csv", data_files="test.csv")["train"]
hinge_ds

def format_to_norobots(sample):
    """Turn one Hinge row into a two-turn translation chat.

    One of four directions is picked with equal probability from a single
    uniform draw: English->Hinglish, Hinglish->English, Hindi->Hinglish or
    Hinglish->Hindi. A second 50/50 draw chooses between the romanized-Hindi
    wording of the instruction and the alternative wording (English for the
    English pairs, Devanagari for the Hindi pairs).

    Args:
        sample: Row with ``English``, ``Hinglish`` and ``Hindi`` columns.

    Returns:
        ``{"category": ..., "hindi_messages": [user_turn, assistant_turn]}``.
    """
    direction_draw = random.random()
    romanized_instruction = bernoulli_sample(0.5)

    # (upper bound of the draw, romanized instruction, alternative instruction,
    #  source column, target column); the first row whose bound covers the
    #  draw wins.
    routes = (
        (0.25, english_to_hinglish_prompt_hn, english_to_hinglish_prompt_en, "English", "Hinglish"),
        (0.5, hingligh_to_english_prompt_hn, hingligh_to_english_prompt_en, "Hinglish", "English"),
        (0.75, hindi_to_hinglish_prompt_hn, hindi_to_hinglish_prompt_hi, "Hindi", "Hinglish"),
        (1.0, hingligh_to_hindi_prompt_hn, hingligh_to_hindi_prompt_hi, "Hinglish", "Hindi"),
    )
    for upper_bound, hn_instruction, alt_instruction, source_col, target_col in routes:
        if direction_draw <= upper_bound:
            break

    instruction = hn_instruction if romanized_instruction else alt_instruction
    user_turn = {"content": f"{instruction}\n{sample[source_col]}", "role": "user"}
    assistant_turn = {"content": sample[target_col], "role": "assistant"}

    return {
        "category": "Transliteration and Code Mixing",
        "hindi_messages": [user_turn, assistant_turn],
    }

# Convert every Hinge row to the chat layout and drop the original CSV
# columns, leaving only the keys returned by format_to_norobots.
hinge_ds = hinge_ds.map(
    format_to_norobots,
    batched=False,
    remove_columns=hinge_ds["train"].column_names
)
hinge_ds


Map:   0%|          | 0/3161 [00:00<?, ? examples/s]

Map:   0%|          | 0/791 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 3161
    })
    test: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 791
    })
})

In [183]:
# Rename the splits from train_sft/test_sft to train/test, the split names
# used by hinge_ds. Not re-runnable: the *_sft keys no longer exist afterwards.
orig_dataset["train"] = orig_dataset["train_sft"]
orig_dataset["test"] = orig_dataset["test_sft"]
del orig_dataset["train_sft"]
del orig_dataset["test_sft"]
orig_dataset

DatasetDict({
    train: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 9166
    })
    test: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 484
    })
})

In [184]:
# Append the Hinge examples to the matching split. Re-running this cell
# appends them a second time.
for split in ["train", "test"]:
    orig_dataset[split] = concatenate_datasets([orig_dataset[split], hinge_ds[split]])
orig_dataset 

DatasetDict({
    train: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 12327
    })
    test: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 1275
    })
})

In [185]:
# Load the English-Hinglish TOP dataset.
hinglish_top = load_dataset("rvv-karma/English-Hinglish-TOP")

# Keep only the training rows marked as human-generated, then fold the "val"
# split into "train" and remove it. The "val" and "test" splits are not filtered.
hinglish_top["train"] = hinglish_top["train"].filter(lambda example: example["generated_by"]=="human")
hinglish_top["train"] = concatenate_datasets([hinglish_top["train"], hinglish_top["val"]])
del hinglish_top["val"]
hinglish_top


Downloading readme:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/194k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/943k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/176596 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/1390 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6513 [00:00<?, ? examples/s]

Filter:   0%|          | 0/176596 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'hi_en', 'en_parse', 'hi_en_parse', 'domain', 'generated_by'],
        num_rows: 7903
    })
    test: Dataset({
        features: ['en', 'hi_en', 'en_parse', 'hi_en_parse', 'domain', 'generated_by'],
        num_rows: 6513
    })
})

In [186]:
def format_to_norobots(sample):
    """Turn one English-Hinglish TOP row into a two-turn translation chat.

    A first 50/50 draw decides the direction (Hinglish->English when it
    succeeds, English->Hinglish otherwise); a second 50/50 draw chooses
    between the romanized-Hindi and the English wording of the instruction.

    NOTE: this definition reuses the name of the Hinge formatter defined in an
    earlier cell and replaces it in the kernel.

    Args:
        sample: Row with ``en`` and ``hi_en`` columns.

    Returns:
        ``{"category": ..., "hindi_messages": [user_turn, assistant_turn]}``.
    """
    hinglish_is_source = bernoulli_sample(0.5)
    romanized_instruction = bernoulli_sample(0.5)

    if hinglish_is_source:
        source_col, target_col = "hi_en", "en"
        hn_instruction = hingligh_to_english_prompt_hn
        en_instruction = hingligh_to_english_prompt_en
    else:
        source_col, target_col = "en", "hi_en"
        hn_instruction = english_to_hinglish_prompt_hn
        en_instruction = english_to_hinglish_prompt_en

    instruction = hn_instruction if romanized_instruction else en_instruction
    user_turn = {"content": f"{instruction}\n{sample[source_col]}", "role": "user"}
    assistant_turn = {"content": sample[target_col], "role": "assistant"}

    return {
        "category": "Transliteration and Code Mixing",
        "hindi_messages": [user_turn, assistant_turn],
    }

# Convert every TOP row to the chat layout and drop the original columns,
# leaving only the keys returned by format_to_norobots.
hinglish_top = hinglish_top.map(
    format_to_norobots,
    batched=False,
    remove_columns=hinglish_top["train"].column_names
)
hinglish_top


Map:   0%|          | 0/7903 [00:00<?, ? examples/s]

Map:   0%|          | 0/6513 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 7903
    })
    test: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 6513
    })
})

In [190]:
hinglish_top["train"][2]

{'category': 'Transliteration and Code Mixing',
 'hindi_messages': [{'content': 'Hinglish se English mein translate kare.\nMere liye reminder set karo to wake up at 6:30 am tomorrow.',
   'role': 'user'},
  {'content': 'Set a reminder for me to wake up at 630 am tomorrow.',
   'role': 'assistant'}]}

In [187]:
# Append the TOP examples to the matching split. Re-running this cell appends
# them a second time.
for split in ["train", "test"]:
    orig_dataset[split] = concatenate_datasets([orig_dataset[split], hinglish_top[split]])
orig_dataset

DatasetDict({
    train: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 20230
    })
    test: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 7788
    })
})

In [191]:
# System prompt that is added to roughly 20% of the conversations.
system_prompt = "You are a native Hindi speaker who can converse at expert level in both Hindi and colloquial Hinglish."
def add_system_prompt(sample):
    """With probability 0.2, give the conversation the shared system prompt.

    When the example is selected and its first turn already has the
    ``system`` role, the prompt is appended to that turn's content after a
    space; otherwise a new system turn is inserted at the front. The list in
    ``sample["hindi_messages"]`` is modified in place.

    NOTE: this definition reuses the name of an earlier function in this
    notebook and replaces it in the kernel.

    Returns:
        ``{"hindi_messages": <the possibly updated list of turns>}``.
    """
    selected = bernoulli_sample(0.2)
    conversation = sample["hindi_messages"]
    # Read before the early return on purpose: the first turn is always accessed.
    opening_turn = conversation[0]

    if not selected:
        return {"hindi_messages": conversation}

    if opening_turn["role"] != "system":
        conversation.insert(0, {'content': system_prompt, 'role': 'system'})
    else:
        opening_turn["content"] = f"{opening_turn['content']} {system_prompt}"
    return {"hindi_messages": conversation}

# Apply the random system-prompt step to every example of both splits.
# Re-running this cell draws again and can add the prompt a second time.
orig_dataset = orig_dataset.map(
    add_system_prompt,
    batched=False
)
orig_dataset


Map:   0%|          | 0/20230 [00:00<?, ? examples/s]

Map:   0%|          | 0/7788 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 20230
    })
    test: Dataset({
        features: ['category', 'hindi_messages'],
        num_rows: 7788
    })
})

In [193]:
orig_dataset["train"].filter(lambda example: example["hindi_messages"][0]["role"]=="system")[0]

Filter:   0%|          | 0/20230 [00:00<?, ? examples/s]

{'category': 'Generation',
 'hindi_messages': [{'content': 'You are a native Hindi speaker who can converse at expert level in both Hindi and colloquial Hinglish.',
   'role': 'system'},
  {'content': 'Help write a letter of 100 -200 words to my future self for Kyra, reflecting on her goals and aspirations. First write in English and then translate to Hindi',
   'role': 'user'},
  {'content': "Dear Future Self,\n\nI hope you're happy and proud of what you've achieved. As I write this, I'm excited to think about our goals and how far you've come. One goal was to be a machine learning engineer. I hope you've worked hard and become skilled in this field. Keep learning and innovating. Traveling was important to us. I hope you've seen different places and enjoyed the beauty of our world. Remember the memories and lessons. Starting a family mattered to us. If you have kids, treasure every moment. Be patient, loving, and grateful for your family.\n\nTake care of yourself. Rest, reflect, and c

In [196]:
# Rename 'hindi_messages' to 'messages' in both splits before publishing.
for split in ["train", "test"]:
    orig_dataset[split] = orig_dataset[split].rename_column('hindi_messages', 'messages')
orig_dataset  

DatasetDict({
    train: Dataset({
        features: ['category', 'messages'],
        num_rows: 20230
    })
    test: Dataset({
        features: ['category', 'messages'],
        num_rows: 7788
    })
})

In [197]:
# Upload the final mixed dataset to the Hub as a public repository (private=False).
orig_dataset.push_to_hub("hindi_instruct_v1", private=False)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/493 [00:00<?, ?B/s]

In [198]:
# Load "smangrul/hindi_instruct_v1" back from the Hub to check its splits and
# columns (presumably the repository pushed in the previous cell; confirm the
# namespace).
from datasets import load_dataset
load_dataset("smangrul/hindi_instruct_v1")

Downloading readme:   0%|          | 0.00/487 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/20230 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7788 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['category', 'messages'],
        num_rows: 20230
    })
    test: Dataset({
        features: ['category', 'messages'],
        num_rows: 7788
    })
})