In [1]:
!pip install -q sentence-transformers rapidfuzz shekar



In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer
from rapidfuzz import fuzz
from itertools import permutations




In [3]:
PERSIAN_STOPWORDS = {
    "شرکت","موسسه","گروه","صنعت","صنایع","توسعه","مهندسی","فناوری","نوین",
    "تک","ارتباط","مبین","پیشرفته","گسترش","مرکز","هولدینگ",
    "مدرن","نو","جدید","پژوهش","کاربردی","راهکار","راه","راه‌حل",
    "اندیشه","سامانه","خدمات","تجارت","تجاری","بازرگانی","کو","ایران",
   "و", "در", "با", "از",
} 

In [4]:
from shekar import Normalizer, Lemmatizer, WordTokenizer
import re
from cleantext import clean

normalizer = Normalizer()
lemmatizer = Lemmatizer()
tokenizer = WordTokenizer()

def preprocess_and_stem(text): 
    text = normalizer.normalize(text) 
    text = re.sub(r"[^\w\s]", " ", text)
    text = text.replace("\u200c", "")
    text = str(clean(text,
                           clean_all= False  ,
                           extra_spaces=True ,     
                           numbers=True ,  
                           punct=True      
                           ))
     
    tokens = list(tokenizer(text))
     
    stems =  [lemmatizer(t) for t in tokens if t not in PERSIAN_STOPWORDS]
    return " ".join(stems)

In [5]:
import pandas as pd
data = pd.read_csv('data_sample.csv', header=None, names=['id', 'name'])
data = data.drop(data.index[0])
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,id,name
0,6C08073088A84609A36859D777FDA74C,كالا پخش عصر ايلام
1,D14E57BFCC2A46438127DB0E9D44E53D,تك ناهيد آدان
2,1F602F123D314279816445A72D249AE8,آراد پارس پويا شمال
3,1DF530F7C9404386B1048F885A9F3CEF,شفق الكتريك هزاره سوم
4,D9A3EB25A9804B679DF5DD3BE7E212E0,امامزاده بي بي بانو روستاي بيان(قير)


In [6]:
registered_names = data['name'].apply(preprocess_and_stem)

In [7]:
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: eb770f3a-2460-4988-a3d7-0e1f1e1fec5d)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/./README.md
Retrying in 1s [Retry 1/5].


README.md: 0.00B [00:00, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 35180696-b0c7-4a21-8c60-bb2efffd007a)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 6022a771-cba1-4c25-ad1b-7aeb03f41b3e)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/config.json
Retrying in 1s [Retry 1/5].


config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: eb46b773-8470-435f-8a44-e3ab6ce1b9bf)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
def exact_match(name, registered_names):
    return name in registered_names

In [9]:
def is_subsequence(name, registered_names):
    name_tokens = set(name.split())
    for reg in registered_names:
        reg_tokens = set(reg.split())
        if name_tokens.issubset(reg_tokens) or reg_tokens.issubset(name_tokens):
            return True
    return False

In [10]:
def is_permutation(name, registered_names):
    name_tokens = sorted(name.split())
    for reg in registered_names:
        if name_tokens == sorted(reg.split()):
            return True
    return False

In [11]:
def is_fuzzy_match(name, registered_names, threshold=85):
    for reg in registered_names:
        score = fuzz.token_sort_ratio(name, reg)
        if score >= threshold:
            return True
    return False

In [12]:
def is_semantic_similar(name, registered_names, threshold=0.85):
    name_vec = model.encode([name])[0]
    reg_vecs = model.encode(registered_names)
    
    for vec in reg_vecs:
        sim = np.dot(name_vec, vec) / (np.linalg.norm(name_vec) * np.linalg.norm(vec))
        if sim >= threshold:
            return True
    return False

In [13]:
def check_name_validity(name, registered_names):
    if exact_match(name, registered_names):
        return False, "اسم دقیقا ثبت شده است"
    if is_subsequence(name, registered_names):
        return False, "اسم زیرمجموعه اسم ثبت شده است"
    if is_permutation(name, registered_names):
        return False, "اسم جایگشت اسم ثبت شده است"
    if is_fuzzy_match(name, registered_names):
        return False, "اسم مشابه اسم ثبت شده است (تغییر جزئی یا جمع/مفرد)"
    if is_semantic_similar(name, registered_names):
        return False, "اسم از نظر معنایی مشابه است"
    return True, "اسم قابل ثبت است"

In [14]:
test_names = [
    "صنایع میهن",           
    "مهندسی هوشمند روزآمد تهران",  
    "صنایع غذای میهن",      
    "تاجر آرمان",           
    "فن‌آوری نوین پارس"     
]

for name in test_names:
    valid, reason = check_name_validity(name, registered_names)
    print(name, "->", valid, "-", reason)


صنایع میهن -> False - اسم از نظر معنایی مشابه است
مهندسی هوشمند روزآمد تهران -> True - اسم قابل ثبت است
صنایع غذای میهن -> False - اسم از نظر معنایی مشابه است
تاجر آرمان -> False - اسم از نظر معنایی مشابه است
فن‌آوری نوین پارس -> False - اسم از نظر معنایی مشابه است
