# Task1: Dataset Generation

**Install Libraries and Packages**

In [1]:
!pip install persian-tools

Collecting persian-tools
  Downloading persian_tools-0.0.11-py3-none-any.whl.metadata (11 kB)
Downloading persian_tools-0.0.11-py3-none-any.whl (34 kB)
Installing collected packages: persian-tools
Successfully installed persian-tools-0.0.11


In [2]:
!pip install jdatetime

Collecting jdatetime
  Downloading jdatetime-5.0.0-py3-none-any.whl.metadata (5.4 kB)
Collecting jalali-core>=1.0 (from jdatetime)
  Downloading jalali_core-1.0.0-py3-none-any.whl.metadata (738 bytes)
Downloading jdatetime-5.0.0-py3-none-any.whl (12 kB)
Downloading jalali_core-1.0.0-py3-none-any.whl (3.6 kB)
Installing collected packages: jalali-core, jdatetime
Successfully installed jalali-core-1.0.0 jdatetime-5.0.0


In [3]:
import random
from persian_tools import digits
import jdatetime
import pandas as pd
import itertools

**Helper Functions**

In [4]:
def convert_month_number_to_name(number):
  persian_months = {
      1: "فروردین",
      2: "اردیبهشت",
      3: "خرداد",
      4: "تیر",
      5: "مرداد",
      6: "شهریور",
      7: "مهر",
      8: "آبان",
      9: "آذر",
      10: "دی",
      11: "بهمن",
      12: "اسفند"
  }

  return persian_months.get(number, "")

In [5]:
persian_digits = {
    '0': '۰',
    '1': '۱',
    '2': '۲',
    '3': '۳',
    '4': '۴',
    '5': '۵',
    '6': '۶',
    '7': '۷',
    '8': '۸',
    '9': '۹'
}

def convert_to_persian_digits(number_str):
    return ''.join(persian_digits.get(digit, digit) for digit in number_str)


In [6]:
def add_m_suffix(word):
    if word.endswith('ه'):
        return word[:-1] + 'هم'
    else:
        return word + 'م'

In [7]:
def get_day_representations(day):
    day_standard = str(day)
    day_persian = convert_to_persian_digits(day_standard)
    day_word = digits.convert_to_word(day)
    day_word_m = add_m_suffix(day_word)
    day_with_ruz = [f"روز {day_standard}", f"روز {day_persian}", f"روز {day_word}"]
    return [day_standard, day_persian, day_word, day_word_m] + day_with_ruz

In [8]:
def get_month_representations(month):
    month_standard = str(month)
    month_persian = convert_to_persian_digits(month_standard)
    month_word = digits.convert_to_word(month)
    month_word_m = add_m_suffix(month_word)
    month_name = convert_month_number_to_name(month)
    month_with_mah = [f"ماه {month_standard}", f"ماه {month_persian}", f"ماه {month_word}", f"ماه {month_name}"]
    return [month_standard, month_persian, month_word, month_word_m, month_name] + month_with_mah


In [9]:
def get_year_representations(year):
    year_standard = str(year)
    year_persian = convert_to_persian_digits(year_standard)
    year_word = digits.convert_to_word(year)
    year_with_sal = [f"سال {year_standard}", f"سال {year_persian}", f"سال {year_word}"]
    return [year_standard, year_persian, year_word] + year_with_sal

**Create Dataset**

In [10]:
def generate_dataset(date):
    day = date['day']
    month = date['month']
    year = date['year']
    formal_date = f"{year}/{month:02d}/{day:02d}"

    day_reps = get_day_representations(day)
    month_reps = get_month_representations(month)
    year_reps = get_year_representations(year)

    separators = ['/', '.', '-', ' ']


    patterns = [
        "{day} {month} {year}",
        "{day} {month} {year_with_sal}",
        "در {day} {month} {year}",
        "به تاریخ {day} {month} {year}",
        "{day} {month} سال {year}",
        "{day_with_ruz} {month} {year}",
        "{day} {month_with_mah} {year}",
        "{day} {month} {year_persian}",
        "{day_persian} {month_persian} {year_persian}",
        # New patterns with order changes
        "{year} {month} {day}",
        "{year} {day} {month}",
        "{month} {day} {year}",
        "{day} {year} {month}",
        "{month} {year} {day}",
        # Numeric formats with separators
        "{year_num}{sep}{month_num}{sep}{day_num}",
        "{day_num}{sep}{month_num}{sep}{year_num}",
        "{month_num}{sep}{day_num}{sep}{year_num}",
        "{year_num}{sep}{day_num}{sep}{month_num}",
        "{day_num}{sep}{year_num}{sep}{month_num}",
        "{month_num}{sep}{year_num}{sep}{day_num}",
    ]




    data = []
    for pattern in patterns:
        for day_rep in day_reps:
            for month_rep in month_reps:
                for year_rep in year_reps:
                    if 'num' in pattern:
                        for sep in separators:
                            informal_text = pattern.format(
                                day_num=str(day),
                                month_num=str(month),
                                year_num=str(year),
                                sep=sep
                            ).strip()
                            data.append({'informal_text': informal_text, 'formal_date': formal_date})
                            informal_text_persian = pattern.format(
                                day_num=convert_to_persian_digits(str(day)),
                                month_num=convert_to_persian_digits(str(month)),
                                year_num=convert_to_persian_digits(str(year)),
                                sep=sep
                            ).strip()
                            data.append({'informal_text': informal_text_persian, 'formal_date': formal_date})
                    else:
                        informal_text = pattern.format(
                            day=day_rep,
                            day_with_ruz=day_rep,
                            day_persian=convert_to_persian_digits(day_rep),
                            month=month_rep,
                            month_with_mah=month_rep,
                            month_persian=convert_to_persian_digits(month_rep),
                            year=year_rep,
                            year_with_sal=year_rep,
                            year_persian=convert_to_persian_digits(year_rep),
                            year_num=str(year),
                            month_num=str(month),
                            day_num=str(day),
                        ).strip()
                        data.append({'informal_text': informal_text, 'formal_date': formal_date})

    return data

In [11]:
dates_list = [
    {'day': 1 , 'month': 1, 'year': 1370},
    # {'day': 15, 'month': 7, 'year': 1380},
    # {'day': 29, 'month': 12, 'year': 1380},
    # {'day': 30, 'month': 12, 'year': 1390},
    # {'day': 22, 'month': 11, 'year': 1390},
    # {'day': 13, 'month': 1, 'year': 1396},
    # {'day': 10, 'month': 2, 'year': 1400},
    # {'day': 30, 'month': 12, 'year': 1403},
    # {'day': 22 ,'month': 3, 'year': 1410},
    # {'day': 17, 'month': 5, 'year': 1415},
]

dataset = []

for date_info in dates_list:
  dataset.extend(generate_dataset(date_info))


=df = pd.DataFrame(dataset)
df.drop_duplicates(inplace=True)
df.to_csv('dataset.csv', index=False, encoding='utf-8-sig')

print(f"Generated {len(df)} entries in the dataset.")

Generated 2943 entries in the dataset.


# Task 2: Model Development

**Install Libraries and Packages**

In [12]:
!pip install hazm

Collecting hazm
  Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)
Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm)
  Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Downloading hazm-0.10.0-py3-none-any.whl (892 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m892.6/892.6 kB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hDo

**Preprocessing**

In [13]:
from hazm import Normalizer

In [14]:
normalizer = Normalizer(persian_numbers=False)

def normalize_text(text):
    normalized_text = normalizer.normalize(text)
    return normalized_text

df['informal_text_normalized'] = df['informal_text'].apply(normalize_text)
df['formal_date_normalized'] = df['formal_date'].apply(normalize_text)


In [15]:
df.to_csv('dataset_normalized.csv', index=False, encoding='utf-8-sig')

In [17]:
from transformers import T5Tokenizer
from datasets import Dataset

In [18]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [19]:
def preprocess_data(examples, tokenizer, max_length=128):
    inputs = examples['informal_text_normalized']
    targets = examples['formal_date_normalized']

    # Tokenize input and target sequences
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, truncation=True, padding='max_length')

    # Add the labels to the input dictionary
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [20]:
dataset = Dataset.from_pandas(df)

In [21]:
preprocessed_dataset = dataset.map(lambda x: preprocess_data(x, tokenizer), batched=True)

Map:   0%|          | 0/9441 [00:00<?, ? examples/s]



In [22]:
train_test_split = preprocessed_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

**Function to compute metrics**

In [23]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    accuracy = accuracy_score(decoded_labels, decoded_preds)
    return {"accuracy": accuracy}


In [24]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True
)

In [25]:
from transformers import T5ForConditionalGeneration, Trainer

model = T5ForConditionalGeneration.from_pretrained("t5-small")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Add the compute_metrics function
)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


In [26]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 6.37 GiB. GPU 0 has a total capacity of 14.75 GiB of which 6.32 GiB is free. Process 5312 has 8.43 GiB memory in use. Of the allocated memory 7.17 GiB is allocated by PyTorch, and 1.13 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")