In [26]:
!pip install -q sentence-transformers shekar cleantext datasets

In [27]:
from sentence_transformers import SentenceTransformer, InputExample, losses, SentenceTransformer
from torch.utils.data import DataLoader
from shekar import Normalizer, Stemmer, WordTokenizer, Lemmatizer
from cleantext import clean
import torch

In [28]:
import re

normalizer = Normalizer()
lemmatizer = Lemmatizer()
# The shekar word tokenizer gets its own name: a later cell rebinds the generic
# global `tokenizer` to the Hugging Face tokenizer, which would silently break
# `preprocess_and_stem` if it kept reading `tokenizer`.
word_tokenizer = WordTokenizer()
tokenizer = word_tokenizer  # backward-compatible alias; rebound later in the notebook


def preprocess_and_stem(text):
    """Normalize a Persian string, drop stop words and lemmatize what is left.

    Steps, in order:
      1. ``normalizer.normalize`` (shekar ``Normalizer``).
      2. Every character that is neither a word character nor whitespace is
         replaced by a single space.
      3. ``cleantext.clean`` with ``extra_spaces``, ``numbers`` and ``punct``
         enabled (presumably collapses whitespace and strips digits and
         punctuation -- confirm against the cleantext docs).
      4. Tokenize with the shekar ``WordTokenizer``.
      5. Discard tokens found in ``PERSIAN_STOPWORDS`` and lemmatize the rest.

    Despite the name, the function applies a lemmatizer, not a stemmer.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces.

    Note:
        ``PERSIAN_STOPWORDS`` is defined in a later cell; that cell must have
        been executed before this function is called.
    """
    text = normalizer.normalize(text)
    text = re.sub(r"[^\w\s]", " ", text)
    # NOTE(review): the zero-width non-joiner (U+200C) is neither \w nor \s, so
    # the substitution above has already turned it into a space and this
    # replace never finds anything. If the intent was to glue ZWNJ-separated
    # halves together, this line has to run *before* the substitution.
    text = text.replace("\u200c", "")
    text = str(
        clean(
            text,
            clean_all=False,
            extra_spaces=True,
            numbers=True,
            punct=True,
        )
    )

    # Use the dedicated tokenizer object, never the re-bindable global `tokenizer`.
    tokens = list(word_tokenizer(text))

    # Stop words are matched on the raw token, i.e. before lemmatization.
    lemmas = [lemmatizer(tok) for tok in tokens if tok not in PERSIAN_STOPWORDS]
    return " ".join(lemmas)

In [29]:
# Tokens that `preprocess_and_stem` removes before lemmatization: generic
# company-name vocabulary (company, institute, group, industry, ...) plus a
# few function words (and, in, with, from).
# NOTE(review): the entry containing a zero-width non-joiner can never match,
# because preprocessing replaces every non-word character with a space before
# tokenizing -- confirm whether it should be split into its two halves.
PERSIAN_STOPWORDS = {
    # generic business / organisation vocabulary
    "شرکت", "موسسه", "گروه",
    "صنعت", "صنایع", "توسعه",
    "مهندسی", "فناوری", "نوین",
    "تک", "ارتباط", "مبین",
    "پیشرفته", "گسترش", "مرکز",
    "هولدینگ",
    "مدرن", "نو", "جدید",
    "پژوهش", "کاربردی", "راهکار",
    "راه", "راه‌حل",
    "اندیشه", "سامانه", "خدمات",
    "تجارت", "تجاری", "بازرگانی",
    "کو", "ایران",
    # function words
    "و",
    "در",
    "با",
    "از",
}

In [30]:
import pandas as pd
# Load the labelled name pairs. No row is treated as a header (`header=None`);
# the three column names are supplied explicitly instead.
df = pd.read_csv(
    "final_generated_pairs.csv",
    header=None,
    names=["name1", "name2", "label"],
)
# Discard the first row that was read (presumably the file's own header line,
# which `header=None` turned into data) and renumber the rows from zero.
df = df.drop(index=df.index[0]).reset_index(drop=True)
df.head()

Unnamed: 0,name1,name2,label
0,کالا پخش عصر ایلام,کالا پخش عصر ایلا,1
1,کالا پخش عصر ایلام,کالا عصر ایلام پخش,1
2,کالا پخش عصر ایلام,کالا پخش عصر ایلام,1
3,کالا پخش عصر ایلام,کالا ایلام پخش عصر,1
4,کالا پخش عصر ایلام,کالا عصر پخش ایلام,1


In [31]:
# Convert the label column to integer 0/1.
# The column presumably holds the strings "1"/"0" at this point, because the
# load cell read the file with `header=None`. `pd.to_numeric` accepts both the
# string and the already-numeric form, so this cell can be re-run safely; the
# previous dict-based `.map` turned integer labels into NaN on a second run.
label_values = pd.to_numeric(df["label"], errors="coerce")
invalid_labels = ~label_values.isin([0, 1])
if invalid_labels.any():
    # Fail loudly, showing a few offending raw values, instead of a NaN cast error.
    raise ValueError(
        "label column must contain only 0/1; unexpected values: "
        f"{df.loc[invalid_labels, 'label'].unique()[:5].tolist()}"
    )
df["label"] = label_values.astype(int)

In [36]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn

# Checkpoint identifier handed to both `from_pretrained` calls below, so the
# tokenizer and the encoder always come from the same pretrained model.
MODEL_NAME = "HooshvareLab/bert-base-parsbert-uncased"

# NOTE: this rebinds the notebook-global name `tokenizer`, which an earlier cell
# bound to a shekar `WordTokenizer()`. From here on `tokenizer` is the
# Hugging Face tokenizer; code expecting the earlier object must not use this name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Pretrained encoder; the training cell wraps it in `SiameseBERT`.
bert_model = AutoModel.from_pretrained(MODEL_NAME)


In [37]:
import torch
from torch.utils.data import Dataset, DataLoader

class NamePairDataset(Dataset):
    """Map-style dataset over a frame of labelled name pairs.

    Each item is a ``(encoding_1, encoding_2, label)`` triple built from the
    ``name1``, ``name2`` and ``label`` columns of one row.

    Args:
        df: Frame with ``name1``, ``name2`` and ``label`` columns.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")``.
        max_len: Length every encoding is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len=32):
        self.df, self.tokenizer, self.max_len = df, tokenizer, max_len

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize one name with the fixed padding / truncation settings."""
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        # Positional lookup, so the frame's index labels do not matter.
        record = self.df.iloc[idx]

        first = self._encode(record["name1"])
        second = self._encode(record["name2"])
        # Float target, as required by the BCE loss used in training.
        target = torch.tensor(record["label"], dtype=torch.float)
        return first, second, target

# Wrap the labelled pairs; uses whatever the global `tokenizer` is bound to at
# this point and the dataset's default `max_len` of 32.
dataset = NamePairDataset(df, tokenizer)
# Mini-batches of 16 pairs, shuffled. No random seed is set in the visible
# cells, so the batch order differs from run to run.
loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [38]:
import torch.nn as nn
import torch.nn.functional as F

class SiameseBERT(nn.Module):
    """Siamese similarity scorer on top of a shared BERT-style encoder.

    Both inputs go through the same encoder and projection; the element-wise
    absolute difference of the two 128-d projections is mapped to one
    probability with a linear layer followed by a sigmoid.

    Args:
        bert: Encoder module exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state`` when called with ``input_ids``
            and ``attention_mask``.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(bert.config.hidden_size, 128)
        self.out = nn.Linear(128, 1)

    @staticmethod
    def _as_batch(tensor):
        """Collapse every leading dimension into one batch dimension.

        Accepts ``(seq,)``, ``(batch, seq)`` and the ``(batch, 1, seq)`` layout
        produced when per-item ``(1, seq)`` encodings are stacked by a
        DataLoader, and always returns ``(batch, seq)``. A bare ``.squeeze()``
        would also remove the batch dimension whenever the batch holds a
        single item, which the encoder cannot handle.
        """
        return tensor.reshape(-1, tensor.size(-1))

    def encode(self, t):
        """Encode one side of the pair.

        Args:
            t: Mapping with ``input_ids`` and ``attention_mask`` tensors.

        Returns:
            Tensor of shape ``(batch, 128)``.
        """
        x = self.bert(input_ids=self._as_batch(t["input_ids"]),
                      attention_mask=self._as_batch(t["attention_mask"]))
        # Hidden state at the first sequence position ([CLS] for BERT-style
        # tokenizers) is used as the sequence representation.
        cls = x.last_hidden_state[:, 0]
        x = self.dropout(cls)
        x = F.relu(self.fc(x))
        return x

    def forward(self, t1, t2):
        """Return the similarity probability for each pair in the batch.

        Returns:
            Tensor of shape ``(batch,)`` with values in [0, 1]; a batch of one
            yields shape ``(1,)``.
        """
        e1 = self.encode(t1)
        e2 = self.encode(t2)
        diff = torch.abs(e1 - e2)
        sim = torch.sigmoid(self.out(diff))
        # Drop only the trailing feature dimension so the batch dimension
        # survives for single-item batches (keeps the shape equal to the labels').
        return sim.squeeze(-1)



In [None]:
# Fine-tune the Siamese model on the labelled pairs.
# Every name bound at the top level of this cell (device, model, optimizer,
# criterion, epochs) stays a notebook global; later cells read `device` and `model`.

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SiameseBERT(bert_model).to(device)

# `model.parameters()` includes the wrapped encoder, so the encoder is
# fine-tuned together with the projection and output layers.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# BCELoss takes probabilities; the model's forward already applies a sigmoid.
criterion = nn.BCELoss()

epochs = 3

for epoch in range(epochs):
    # Sum of per-batch losses for this epoch; averaged over batches when printed.
    total_loss = 0
    model.train()
    for t1, t2, label in loader:
        # Move both encodings and the targets to the model's device.
        t1 = {k:v.to(device) for k,v in t1.items()}
        t2 = {k:v.to(device) for k,v in t2.items()}
        label = label.to(device)

        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        output = model(t1, t2)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean batch loss for the epoch (epochs are reported 1-based).
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")



In [None]:
import os

# NOTE(review): absolute path that looks Colab-specific; adjust when running elsewhere.
model_path = "/content/siamese_bert_fa"
# `torch.save` does not create missing parent directories, and
# `save_pretrained` (which would create the folder) only runs afterwards,
# so make sure the target directory exists first.
os.makedirs(model_path, exist_ok=True)
# Only the weights (state dict) are stored; loading them back requires
# re-creating `SiameseBERT` around the same encoder architecture.
torch.save(model.state_dict(), os.path.join(model_path, "model.pt"))
tokenizer.save_pretrained(model_path)
print("Saved to:", model_path)


In [None]:
def similarity(name1, name2, max_len=32):
    """Score how similar two names are with the trained Siamese model.

    The inputs are tokenized with the notebook-global ``tokenizer`` and scored
    by the notebook-global ``model`` on ``device``. The model is switched to
    eval mode (and left there) and gradients are disabled for the call.

    Args:
        name1: First name (string).
        name2: Second name (string).
        max_len: Maximum number of tokens kept per name. Defaults to 32, the
            ``max_len`` default of ``NamePairDataset`` used for training, so
            inference sees inputs truncated the same way as training did.

    Returns:
        The model's sigmoid output as a Python float, i.e. a value from 0 to 1.
    """
    encode_kwargs = dict(
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
    )
    t1 = tokenizer(name1, **encode_kwargs).to(device)
    t2 = tokenizer(name2, **encode_kwargs).to(device)
    model.eval()
    with torch.no_grad():
        sim = model(t1, t2).item()
    return sim  # 0 to 1


In [None]:
# Smoke-test the scorer on two hand-picked pairs; one score is printed per pair.
for first_name, second_name in (
    ("صنایع غذایی میهن", "صنعت غذای میهن"),
    ("تجارت آرین", "تاجر آرین"),
):
    print(similarity(first_name, second_name))