In [None]:
# Log in to the Hugging Face Hub from the shell and also register the token
# as a git credential (--add-to-git-credential).
# NOTE(review): `$TOKEN$` looks like a redacted placeholder -- supply the real
# token from a variable or secret store, never paste it into the notebook.
!huggingface-cli login --token $TOKEN$ --add-to-git-credential 

In [None]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the merged dataset from the Hub.
# NOTE(review): this is the same repo id that the final cell of this notebook
# pushes to, so on a fresh setup the data-combination cells presumably have to
# have been run (and pushed) once before this load can succeed -- confirm.
dataset = load_dataset("mlsquare/samantar_merged_with_train_val")
# Tokenizer for the "google/byt5-small" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

In [None]:
# DATA COMBINATION

In [None]:
from collections import defaultdict  # For creating a dictionary with default values

# Import necessary libraries
from datasets import (  # For loading datasets and concatenating them
    DatasetDict,
    concatenate_datasets,
    load_dataset,
)
from tqdm import tqdm  # For progress bar visualization

# Configuration names of "ai4bharat/samanantar" to merge, one per language
language_codes = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]

# Train split of every language configuration, in `language_codes` order
datasets_list = []

for code in tqdm(language_codes):
    # Use a dedicated name here: rebinding `dataset` in this loop would
    # silently overwrite a `dataset` variable defined earlier in the notebook
    # and leave it pointing at the last language loaded.
    language_dataset = load_dataset("ai4bharat/samanantar", code)
    # Only the training portion of each configuration is kept
    datasets_list.append(language_dataset["train"])

# Concatenate everything into a single dataset; rows stay grouped by language
# in the order the per-language datasets were appended above.
merged_dataset = concatenate_datasets(datasets_list)

# Function that keeps the first `limit_per_language * len(language_codes)` rows
# of the concatenated dataset (a single overall cap, not a cap per language)


def filter_concatenated_dataset(merged_dataset, limit_per_language=5000):
    """Keep the leading rows of ``merged_dataset``, reduced to ``tgt``/``src``.

    At most ``limit_per_language * len(language_codes)`` rows are kept, taken
    in iteration order. ``language_codes`` is read from module scope.

    NOTE(review): the cap is applied to the dataset as a whole. This function
    only reads the "src" and "tgt" fields and never looks at a row's language,
    so a small limit keeps whatever comes first in ``merged_dataset`` rather
    than ``limit_per_language`` rows of each language.

    Args:
        merged_dataset: Iterable of mappings that have "src" and "tgt" keys.
        limit_per_language: Budget per language code; multiplied by the number
            of language codes to obtain the overall row cap.

    Returns:
        A ``Dataset`` with the columns "tgt" and "src".
    """
    max_samples = limit_per_language * len(language_codes)
    kept_tgt, kept_src = [], []

    for sample in tqdm(merged_dataset):
        if len(kept_tgt) >= max_samples:
            break  # Overall cap reached
        kept_tgt.append(sample["tgt"])
        kept_src.append(sample["src"])

    # Report the retained share relative to the size of the input. Dividing
    # the kept count by itself always printed 100% and raised
    # ZeroDivisionError when nothing was kept.
    try:
        available = len(merged_dataset)
    except TypeError:
        # Unsized iterable: the input size is unknown, fall back to kept count
        available = len(kept_tgt)
    retained_share = len(kept_tgt) / available if available else 0.0
    print(f"{retained_share:.2%} of data after filtering.")

    return Dataset.from_dict({"tgt": kept_tgt, "src": kept_src})

In [None]:
# Budget of 10_000_000 rows per language code; the function multiplies this by
# the number of language codes to get the overall row cap.
filtered_merged_dataset = filter_concatenated_dataset(
    merged_dataset,
    limit_per_language=10_000_000,
)

In [None]:
# Determine the number of samples for train and validation sets
from datasets import Dataset, DatasetDict

# Shuffle before splitting: the rows arrive in the order the per-language
# datasets were concatenated, so a purely positional 80/20 cut would build the
# validation set from the tail of that order only.
SPLIT_SEED = 42  # Fixed seed keeps the split reproducible across runs
shuffled_dataset = filtered_merged_dataset.shuffle(seed=SPLIT_SEED)

train_size = int(len(shuffled_dataset) * 0.8)  # 80% for training
valid_size = len(shuffled_dataset) - train_size  # Remaining for validation

# Split the dataset into train and validation sets; `select` accepts a range
# directly, which avoids materialising very large index lists in memory.
ds_train = shuffled_dataset.select(range(train_size))
ds_valid = shuffled_dataset.select(range(train_size, train_size + valid_size))

# Create DatasetDict with train and validation sets
raw_datasets = DatasetDict({"train": ds_train, "valid": ds_valid})

In [None]:
# Upload the train/valid DatasetDict to the Hub under the same repo id that
# the notebook's first code cell loads from.
# NOTE(review): presumably relies on the Hub login performed at the top of the
# notebook -- confirm write access to this repo before running.
raw_datasets.push_to_hub("mlsquare/samantar_merged_with_train_val")