In [25]:
!pip install -q transformers datasets scikit-learn torch shekar cleantext

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
import os
# Opt out of Weights & Biases logging through an environment variable, set before the
# Trainer is built. transformers reports this variable as deprecated (see the warning
# printed when the Trainer is created further down).
os.environ["WANDB_DISABLED"] = "true"

In [6]:
import pandas as pd
# header=None makes pandas read the file's first line as a data row; that row
# (presumably the CSV's own header) is removed right after and the index is renumbered.
df = pd.read_csv(
    '/kaggle/input/dataset/final_generated_pairs.csv',
    header=None,
    names=['name1', 'name2', 'label'],
)
df = df.drop(index=df.index[0]).reset_index(drop=True)
df.head()

Unnamed: 0,name1,name2,label
0,کالا پخش عصر ایلام,کالا پخش عصر ایلا,1
1,کالا پخش عصر ایلام,کالا عصر ایلام پخش,1
2,کالا پخش عصر ایلام,کالا پخش عصر ایلام,1
3,کالا پخش عصر ایلام,کالا ایلام پخش عصر,1
4,کالا پخش عصر ایلام,کالا عصر پخش ایلام,1


In [7]:
# Tokens that preprocess_and_stem discards (exact-match membership test on each token,
# applied before lemmatization).
# NOTE(review): the entry containing a zero-width non-joiner probably can never match,
# because preprocess_and_stem replaces non-word characters with spaces before
# tokenizing — confirm whether that entry is still needed.
PERSIAN_STOPWORDS = {
    "شرکت","موسسه","گروه","صنعت","صنایع","توسعه","مهندسی","فناوری","نوین",
    "تک","ارتباط","مبین","پیشرفته","گسترش","مرکز","هولدینگ",
    "مدرن","نو","جدید","پژوهش","کاربردی","راهکار","راه","راه‌حل",
    "اندیشه","سامانه","خدمات","تجارت","تجاری","بازرگانی","کو","ایران",
   "و", "در", "با", "از",
}

In [26]:
from shekar import Normalizer, Stemmer, WordTokenizer, Lemmatizer
from cleantext import clean
import re

normalizer = Normalizer()
lemmatizer = Lemmatizer()
# Deliberately NOT bound to the name `tokenizer`: that name is assigned to the
# Hugging Face tokenizer later in this notebook, which would silently change what
# preprocess_and_stem iterates over (encoding keys instead of words).
word_tokenizer = WordTokenizer()


def preprocess_and_stem(text):
    """Normalize a Persian name, drop stopwords and return the lemmatized tokens.

    Steps, in order: shekar normalization, replacement of every character that is
    neither a word character nor whitespace with a space, cleantext cleanup
    (extra spaces, numbers, punctuation), word tokenization, stopword filtering
    against PERSIAN_STOPWORDS, and lemmatization of the remaining tokens.
    Despite the function's name, a Lemmatizer (not a stemmer) is applied.

    Args:
        text: The raw string to preprocess.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    text = normalizer.normalize(text)
    text = re.sub(r"[^\w\s]", " ", text)
    # NOTE(review): U+200C is neither \w nor \s, so the substitution above has already
    # turned it into a space and this replace looks like a no-op — confirm whether the
    # zero-width non-joiner was meant to be removed *before* the substitution instead.
    text = text.replace("\u200c", "")
    text = str(clean(text,
                     clean_all=False,
                     extra_spaces=True,
                     numbers=True,
                     punct=True))

    tokens = list(word_tokenizer(text))

    # Stopwords are matched on the raw token, i.e. before lemmatization.
    stems = [lemmatizer(t) for t in tokens if t not in PERSIAN_STOPWORDS]
    return " ".join(stems)

In [9]:
# Convert the label column to integers. The string keys are what the freshly loaded
# frame contains; the integer keys make this cell safe to re-run, because a second
# run would otherwise map every (already converted) label to NaN and fail.
LABEL_MAP = {"1": 1, "0": 0, 1: 1, 0: 0}

mapped_labels = df["label"].map(LABEL_MAP)
if mapped_labels.isna().any():
    # Fail with the offending values instead of a generic NaN-to-int cast error.
    unexpected = df.loc[mapped_labels.isna(), "label"].unique()[:10].tolist()
    raise ValueError(f"Unexpected label values (showing up to 10): {unexpected}")

df["label"] = mapped_labels.astype(int)

In [10]:
from sklearn.model_selection import train_test_split
 
# 70 % train, then the remaining 30 % is halved into validation and test (15 % each).
# Both splits are stratified on the label and use a fixed seed.
# NOTE(review): rows are split individually; if the same name1 appears in many rows,
# variants of one name can land in both train and test — consider a grouped split.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'],
)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

Train size: 114437, Validation size: 24522, Test size: 24523


In [11]:
from datasets import Dataset

# Wrap each pandas split in a Hugging Face Dataset.
# preserve_index=False: the shuffled pandas index left over from train_test_split
# carries no information and would otherwise be stored as an extra
# `__index_level_0__` column in every dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, val_df, test_df)
)

In [12]:
from transformers import AutoTokenizer

# Checkpoint shared by the tokenizer and the classification model.
model_name = "PartAI/TookaBERT-Base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Encode the ``name1``/``name2`` columns as a text pair.

    Sequences longer than 128 tokens are truncated. No padding is applied here.

    Args:
        example: A mapping with ``name1`` and ``name2`` entries (a batch of rows
            when used with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the pair(s).
    """
    first, second = example['name1'], example['name2']
    return tokenizer(first, second, truncation=True, max_length=128)

# Run tokenize_function over every split in batches; the returned datasets replace
# the untokenized ones under the same names.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

tokenizer_config.json:   0%|          | 0.00/463 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/145 [00:00<?, ?B/s]

Map:   0%|          | 0/114437 [00:00<?, ? examples/s]

Map:   0%|          | 0/24522 [00:00<?, ? examples/s]

Map:   0%|          | 0/24523 [00:00<?, ? examples/s]

In [13]:
import torch
from transformers import DataCollatorWithPadding

# Expose only these three columns, as torch tensors, when the datasets are indexed.
# NOTE(review): token_type_ids is not in the list, so if the tokenizer emits segment
# ids they are not forwarded to the model during training/evaluation — confirm intent.
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Collator that pads the examples of a batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
from transformers import AutoModelForSequenceClassification

# Two output classes, matching the 0/1 values of the `label` column.
num_labels = 2  
# Load the checkpoint with a sequence-classification head sized for num_labels; the
# head weights are newly initialized (see the warning printed below) and get trained.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at PartAI/TookaBERT-Base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Hyperparameters for fine-tuning. Checkpoints are written to output_dir every
# 3000 optimizer steps.
training_args = TrainingArguments(
    output_dir="./tookabert-finetuned",
    save_steps=3000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir=None,
    # NOTE(review): as far as I can tell this only takes effect together with periodic
    # evaluation and load_best_model_at_end, neither of which is enabled here.
    metric_for_best_model="accuracy",
    # Mixed precision requires a CUDA device; fall back to full precision on CPU
    # instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
    # Explicitly disable external experiment trackers; this is the replacement that
    # transformers recommends for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

def compute_metrics(eval_pred):
    """Compute accuracy and binary precision/recall/F1 from logits and gold labels.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the argmax
            over the last axis of ``logits``.

    Returns:
        A dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    # Tuple layout: (precision, recall, f-score, support).
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }

# Wire the model, hyperparameters, train/validation splits, collator and metric
# function into a Trainer. The validation split is what a bare trainer.evaluate() uses.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [16]:
# Report the PyTorch build and which accelerator(s), if any, are visible.
print("PyTorch version:", torch.__version__)
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if not has_cuda:
    print("Running on CPU")
else:
    print("Device name:", torch.cuda.get_device_name(0))
    print("Device count:", torch.cuda.device_count())

PyTorch version: 2.6.0+cu124
CUDA available: True
Device name: Tesla T4
Device count: 2


In [17]:
# Fine-tune the model; the returned TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss
500,0.0397
1000,0.0044
1500,0.0042
2000,0.0102
2500,0.0
3000,0.0039
3500,0.0031
4000,0.0019
4500,0.0
5000,0.0025




TrainOutput(global_step=7154, training_loss=0.005350069574532327, metrics={'train_runtime': 1411.3626, 'train_samples_per_second': 162.165, 'train_steps_per_second': 5.069, 'total_flos': 2651845451984220.0, 'train_loss': 0.005350069574532327, 'epoch': 2.0})

In [18]:
# Evaluate on the held-out test split (passed explicitly, overriding the validation
# split given to the Trainer). Metric keys come back with an `eval_` prefix.
results = trainer.evaluate(test_dataset)
print(results)



{'eval_loss': 0.00036790024023503065, 'eval_accuracy': 0.9999592219548995, 'eval_f1': 0.9999474927802573, 'eval_precision': 1.0, 'eval_recall': 0.9998949910742413, 'eval_runtime': 49.2366, 'eval_samples_per_second': 498.064, 'eval_steps_per_second': 15.578, 'epoch': 2.0}


In [19]:
# Write the fine-tuned model and the tokenizer files into the same directory that
# TrainingArguments uses as output_dir.
model.save_pretrained("./tookabert-finetuned")
tokenizer.save_pretrained("./tookabert-finetuned")

('./tookabert-finetuned/tokenizer_config.json',
 './tookabert-finetuned/special_tokens_map.json',
 './tookabert-finetuned/tokenizer.json')

In [21]:
def check_name_validity(text1: str, text2: str):
    """Score a pair of names with the fine-tuned classifier.

    The pair is encoded the same way as the training data: truncated to 128
    tokens and without token_type_ids, because the training datasets only expose
    input_ids and attention_mask to the model.

    Args:
        text1: First name of the pair.
        text2: Second name of the pair.

    Returns:
        The softmax probability of class index 1 as a Python float.
    """
    inputs = tokenizer(
        text1,
        text2,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        return_token_type_ids=False,
    )
    # The Trainer moves the model to the GPU when one is available; the inputs must
    # live on the same device or the forward pass raises a device-mismatch error.
    inputs = inputs.to(model.device)

    # Make sure dropout is disabled regardless of what ran before this call.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    prob = outputs.logits.softmax(dim=1)[0][1].item()
    return prob