In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import json
import Levenshtein
from scipy.stats import entropy
import re
from sklearn.metrics.pairwise import euclidean_distances
import pickle
import joblib

In [2]:
attributes_path = './data/test/attributes_test.parquet'
resnet_path = './data/test/resnet_test.parquet'
text_and_bert_path = './data/test/text_and_bert_test.parquet'
val_path = './data/test/test.parquet'

attributes = pd.read_parquet(attributes_path, engine='pyarrow')
attributes.set_index('variantid', inplace=True)
resnet = pd.read_parquet(resnet_path, engine='pyarrow')
resnet.set_index('variantid', inplace=True)
text = pd.read_parquet(text_and_bert_path, engine='pyarrow')
text.set_index('variantid', inplace=True)
test = pd.read_parquet(val_path, engine='pyarrow')

In [4]:
with open('my_data/unique_cats2.pkl', 'rb') as f:
    unique_cats2 = pickle.load(f)

with open('my_data/unique_cats3.pkl', 'rb') as f:
    unique_cats3 = pickle.load(f)

with open('my_data/minor_keys.pkl', 'rb') as f:
    minor_keys = pickle.load(f)

with open('my_data/vital_keys.pkl', 'rb') as f:
    vital_keys = pickle.load(f)

In [5]:
CONFIG = {
    # attributes
    'cat_fit': True, # Количество одинаковых категорий / 4 
    'cat2': True, # Категориальный признак 2 категории
    'cat3': True, # Категориальный признак 3 категории
    'jac_attrs': True, # Сходство Жаккара для аттрибутов (только ключей) 
    'jacm_attrs': True,
    'com_attrs': True, # Количество общих атрибутов
    'jac_vals': True, # Среднее сходств Жаккара для значений (для общих ключей)
    'com_vals': True, # Количество одинаковых значений общих атрибутов
    'jac_num_vals': True, # Сходство Жаккара для числовых общих значений
    'rat_num_vals': True, # Среднее процентное отношение значений общих атрибутов
    'jac_sev_vals': True, # Сходство Жаккара для общих значений в ключах (значений в ключе > 1)
    'jac_vital_vals':True, # Сходство значений значимых ключей (по статистике выше)
    'jac_minor_vals':True, # Сходство незначений значимых ключей (по статистике выше)
    'diff_attrs': True, # Разность количеств аттрибутов (ключей) / max(len(attrs1), len(attrs2))
    
    # text
    'n_len_diff': True, # Разность количеств символов имен / max(len(name1), len(name2))
    'd_len_diff': True, # Разность количеств символов описаний / max(len(desc1), len(desc2))
    'n_lev': True, # Расстояние Левенштейна между именами / max(len(name1), len(name2))
    'd_lev': True, # Расстояние Левенштейна между описаниями / max(len(desc1), len(desc2))
    'n_jac_symbs': True, # Жаккарово сравнение символов имен
    'd_jac_symbs': True, # Жаккарово сравнение символов описаний
    
    'n_jac': True, # Жаккарово сравнение имен по 6 вариантам токенизации
    'd_jac': True, # Жаккарово сравнение описаний по 6 вариантам токенизации
    'n_lev_opers': True, # Расстояние Левенштейна между именами / max(len(name1), len(name2))
    'd_lev_opers': True, # Расстояние Левенштейна между описаниями / max(len(desc1), len(desc2))
    
    # resnet
    'm_cos': True, # Косинусное сходство между основными эмбеддингами
    'm_evklid': True, # Евклидово расстояние между основными эмбеддингами
    'e_jac': True, # Жаккарово сравнение дополнительных эмбеддингов
    'e_diff': True, # Разница в количестве дополнительных эмбеддингов
    'm_ent_diff': True, # Энтропия (не знаю что за функция) основных эмбеддингов
    'e_avg_cos': True, # Косинусное сходство между средним эмбеддингом дополнительных эмбеддингов
}

In [6]:
def preprocess_texts(sentence):
    if sentence is None:
        return None
    sentence = sentence.lower()
    sentence = re.sub(r'\s+', ' ', sentence) 
    return sentence
text['name'] = text['name'].apply(preprocess_texts)
text['description'] = text['description'].apply(preprocess_texts)

In [7]:
if CONFIG.get('cat_fit', False):
    def cat_fit(test_df, attributes_df):
        
        def compute_cat_fit(variantid1, variantid2):
            
            cat1 = json.loads(attributes_df.at[variantid1, 'categories'])
            cat2 = json.loads(attributes_df.at[variantid2, 'categories'])
                
            common_keys = set(cat1.keys()) & set(cat2.keys())
            
            return (sum(1 for key in common_keys if cat1[key] == cat2[key])) / len(common_keys)
            
        tqdm.pandas(desc="Processing test_df")
        test_df['cat_fit'] = test_df.progress_apply(lambda row: compute_cat_fit(row['variantid1'], row['variantid2']), axis=1)

        return test_df
    
    test = cat_fit(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 42350.72it/s]


In [8]:
if CONFIG.get('cat2', False):
    def cat2(test_df, attributes_df):
        
        def compute_cat2(variantid1, variantid2, u_cats=unique_cats2):
            
            cat1 = json.loads(attributes_df.at[variantid1, 'categories'])
            cat2 = json.loads(attributes_df.at[variantid2, 'categories'])
            
            if cat1['2'] == cat2['2']:
                if cat1['2'] in u_cats:
                    return u_cats[cat1['2']]
                else:
                    return -2
            else:
                return -1
            
        tqdm.pandas(desc="Processing test_df")
        test_df['cat2'] = test_df.progress_apply(lambda row: compute_cat2(row['variantid1'], row['variantid2']), axis=1)

        return test_df

    test = cat2(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:00<00:00, 67994.45it/s]


In [9]:
if CONFIG.get('cat3', False):
    def cat3(test_df, attributes_df):
        
        def compute_cat3(variantid1, variantid2, u_cats=unique_cats3):
            
            cat1 = json.loads(attributes_df.at[variantid1, 'categories'])
            cat2 = json.loads(attributes_df.at[variantid2, 'categories'])
            
            if cat1['3'] == cat2['3']:
                if cat1['3'] in u_cats:
                    return u_cats[cat1['3']]
                else:
                    return -2
            else:
                return -1
            
        tqdm.pandas(desc="Processing test_df")
        test_df['cat3'] = test_df.progress_apply(lambda row: compute_cat3(row['variantid1'], row['variantid2']), axis=1)

        return test_df

    test = cat3(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:00<00:00, 68574.37it/s]


In [10]:
if CONFIG.get('jac_attrs', False):
    def jac_attrs(test_df, attributes_df):

        def compute_jac_attrs(variantid1, variantid2):
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])

            keys1 = set(attrs1.keys())
            keys2 = set(attrs2.keys())

            intersection = len(keys1.intersection(keys2))
            union = len(keys1.union(keys2))

            if intersection == 0:
                return -2

            return intersection / union if union != 0 else 0

        tqdm.pandas(desc="Processing test_df")
        test_df['jac_attrs'] = test_df.progress_apply(
            lambda row: compute_jac_attrs(row['variantid1'], row['variantid2']), axis=1)

        return test_df


    test = jac_attrs(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:09<00:00, 5225.91it/s]


In [11]:
if CONFIG.get('jacm_attrs', False):
    def jac_attrs(test_df, attributes_df):
        
        def compute_jac_attrs(variantid1, variantid2):
            
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            keys1 = set(attrs1.keys())
            keys2 = set(attrs2.keys())

            # Количество общих элементов (пересечение множеств)
            intersection = len(keys1.intersection(keys2))
            
            # Доля общих элементов относительно множества 1
            ratio1 = intersection / len(keys1)
            
            # Доля общих элементов относительно множества 2
            ratio2 = intersection / len(keys2)
            
            # Возвращаем максимальное значение из двух долей
            return max(ratio1, ratio2)

        tqdm.pandas(desc="Processing test_df")
        test_df['jacm_attrs'] = test_df.progress_apply(lambda row: compute_jac_attrs(row['variantid1'], row['variantid2']), axis=1)

        return test_df

    test = jac_attrs(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 40108.59it/s]


In [13]:
if CONFIG.get('com_attrs', False):
    def com_attrs(test_df, attributes_df):
        def compute_com_attrs(variantid1, variantid2):
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])

            keys1 = set(attrs1.keys())
            keys2 = set(attrs2.keys())

            return len(keys1 & keys2)

        tqdm.pandas(desc="Processing test_df")
        test_df['com_attrs'] = test_df.progress_apply(
            lambda row: compute_com_attrs(row['variantid1'], row['variantid2']), axis=1)

        return test_df


    test = com_attrs(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 41794.92it/s]


In [14]:
if CONFIG.get('jac_vals', False):
    def jac_vals(test_df, attributes_df):
        
        def compute_jac_vals(variantid1, variantid2):
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))

            if not common_keys:
                return -2  

            jaccard_scores = []

            for key in common_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                intersection = len(set_values1.intersection(set_values2))
                union = len(set_values1.union(set_values2))
                
                jaccard_score = intersection / union if union != 0 else 0
                jaccard_scores.append(jaccard_score)
            
            return sum(jaccard_scores) / len(jaccard_scores)

        tqdm.pandas(desc="Processing test_df")
        test_df['jac_vals'] = test_df.progress_apply(lambda row: compute_jac_vals(row['variantid1'], row['variantid2']), axis=1)

        return test_df

    test = jac_vals(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 31132.70it/s]


In [15]:
if CONFIG.get('com_vals', False):    
    def com_vals(test_df, attributes_df):
        
        def compute_com_vals(variantid1, variantid2):
            
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))
    
            if not common_keys:
                return -2  
    
            scores = 0
    
            for key in common_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                if set_values1 == set_values2:
                    scores += 1
            
            return scores
        
        tqdm.pandas(desc="Processing test_df")
        test_df['com_vals'] = test_df.progress_apply(lambda row: compute_com_vals(row['variantid1'], row['variantid2']), axis=1)
    
        return test_df
    
    test = com_vals(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 35865.90it/s]


In [16]:
if CONFIG.get('jac_num_vals', False):
    def jac_num_vals(test_df, attributes_df):
        
        def compute_jac_num_vals(variantid1, variantid2):
            
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))
            
            if not common_keys:
                return -2  
            
            num_common_keys = set()
            
            for key in common_keys:
                if len(attrs1[key]) == len(attrs2[key]):
                    digit = True
                    for val1, val2 in zip(attrs1[key], attrs2[key]):
                        if bool(re.match(r'^[-+]?\d*\.?\d+([eE][-+]?\d+)?$', val1)) and bool(re.match(r'^[-+]?\d*\.?\d+([eE][-+]?\d+)?$', val1)):
                            continue
                        else:
                            digit = False
                    if digit:
                        num_common_keys.add(key)
            
            if len(num_common_keys) == 0:
                return -3
            
            jaccard_scores = []

            for key in num_common_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                intersection = len(set_values1.intersection(set_values2))
                union = len(set_values1.union(set_values2))
                
                jaccard_score = intersection / union if union != 0 else 0
                
                jaccard_scores.append(jaccard_score)
            
            return sum(jaccard_scores) / len(jaccard_scores)

        tqdm.pandas(desc="Processing test_df")
        test_df['jac_num_vals'] = test_df.progress_apply(
            lambda row: compute_jac_num_vals(row['variantid1'], row['variantid2']), axis=1)

        return test_df

    test = jac_num_vals(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 26453.30it/s]


In [17]:
if CONFIG.get('rat_num_vals', False):    
    def rat_num_vals(test_df, attributes_df):
        
        def compute_rat_num_vals(variantid1, variantid2):
            
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))
            
            if not common_keys:
                return -2  
            
            num_common_keys = set()
            
            for key in common_keys:
                if len(attrs1[key]) == len(attrs2[key]):
                    digit = True
                    for val1, val2 in zip(attrs1[key], attrs2[key]):
                        if bool(re.match(r'^[-+]?\d*\.?\d+([eE][-+]?\d+)?$', val1)) and bool(re.match(r'^[-+]?\d*\.?\d+([eE][-+]?\d+)?$', val2)):
                            continue
                        else:
                            digit = False
                    if digit:
                        num_common_keys.add(key)
            
            if len(num_common_keys) == 0:
                return -3
            
            rat_scores = []
    
            for key in num_common_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
                
                if len(set_values1) == 1 and len(set_values2) == 1:
                    val1 = float(list(set_values1)[0])
                    val2 = float(list(set_values2)[0])
                    
                    if val1 == val2:
                        rat_scores.append(1.0)  # Значения одинаковы
                    else:
                        # Используем абсолютное значение разности
                        abs_diff = abs(val1 - val2)
                        # Нормируем разницу, чтобы она находилась в диапазоне от 0 до 1
                        # В этом случае устанавливаем максимальное значение для нормализации.
                        max_diff = max(abs(val1), abs(val2))
                        normalized_diff = 1 - (abs_diff / (max_diff if max_diff != 0 else 1))
                        rat_scores.append(normalized_diff)
                
                else:
                    intersection = len(set_values1.intersection(set_values2))
                    union = len(set_values1.union(set_values2))
                    
                    jaccard_score = intersection / union if union != 0 else 0
                    
                    rat_scores.append(jaccard_score)
            
            return sum(rat_scores) / len(rat_scores)
    
        tqdm.pandas(desc="Processing test_df")
        test_df['rat_num_vals'] = test_df.progress_apply(
            lambda row: compute_rat_num_vals(row['variantid1'], row['variantid2']), axis=1)
    
        return test_df
    
    test = rat_num_vals(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 25525.73it/s]


In [18]:
if CONFIG.get('jac_sev_vals', False):
    def jac_sev_vals(test_df, attributes_df):
        
        def compute_jac_sev_vals(variantid1, variantid2):
            
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))
            
            if not common_keys:
                return -2  
            
            sev_common_keys = set()
            
            for key in common_keys:
                if len(attrs1[key]) > 1 or len(attrs2[key]) > 1:
                        sev_common_keys.add(key)
            
            if len(sev_common_keys) == 0:
                return -3
            
            jaccard_scores = []

            for key in sev_common_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                intersection = len(set_values1.intersection(set_values2))
                union = len(set_values1.union(set_values2))
                
                jaccard_score = intersection / union if union != 0 else 0
                
                jaccard_scores.append(jaccard_score)
            
            return sum(jaccard_scores) / len(jaccard_scores)

        tqdm.pandas(desc="Processing test_df")
        test_df['jac_sev_vals'] = test_df.progress_apply(
            lambda row: compute_jac_sev_vals(row['variantid1'], row['variantid2']), axis=1)

        return test_df

    test = jac_sev_vals(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 37502.33it/s]


In [19]:
if CONFIG.get('jac_vital_vals', False):
    def jac_vital_vals(test_df, attributes_df):
        
        def compute_jac_vital_vals(variantid1, variantid2, v_keys=vital_keys):
            
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))

            if not common_keys:
                return -2  
            
            common_vital_keys = set()
            
            for key in common_keys:
                if key in v_keys:
                    common_vital_keys.add(key)
            
            if len(common_vital_keys) == 0:
                return -3
                    
            jaccard_scores = []

            for key in common_vital_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                intersection = len(set_values1.intersection(set_values2))
                union = len(set_values1.union(set_values2))
                
                jaccard_score = intersection / union if union != 0 else 0
                jaccard_scores.append(jaccard_score)
            
            return sum(jaccard_scores) / len(jaccard_scores)

        tqdm.pandas(desc="Processing test_df")
        test_df['jac_vital_vals'] = test_df.progress_apply(
            lambda row: compute_jac_vital_vals(row['variantid1'], row['variantid2']), axis=1)

        return test_df

    test = jac_vital_vals(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 30649.64it/s]


In [20]:
if CONFIG.get('jac_minor_vals', False):
    def jac_minor_vals(test_df, attributes_df):
        
        def compute_jac_minor_vals(variantid1, variantid2, m_keys=minor_keys):
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))

            if not common_keys:
                return -2  
            
            common_minor_keys = set()
            
            for key in common_keys:
                if key in m_keys:
                    common_minor_keys.add(key)
            
            if len(common_minor_keys) == 0:
                return -3
                    
            jaccard_scores = []

            for key in common_minor_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                intersection = len(set_values1.intersection(set_values2))
                union = len(set_values1.union(set_values2))
                
                jaccard_score = intersection / union if union != 0 else 0
                jaccard_scores.append(jaccard_score)
            
            return sum(jaccard_scores) / len(jaccard_scores)

        tqdm.pandas(desc="Processing test_df")
        test_df['jac_minor_vals'] = test_df.progress_apply(lambda row: compute_jac_minor_vals(row['variantid1'], row['variantid2']), axis=1)

        return test_df

    test = jac_minor_vals(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 30278.51it/s]


In [21]:
if CONFIG.get('diff_attrs', False):
    def diff_attrs(test_df, attributes_df):
        
        def compute_diff_attrs(variantid1, variantid2):
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            return (abs(len(attrs1) - len(attrs2))) / max(len(attrs1), len(attrs2))

        tqdm.pandas(desc="Processing test_df")
        test_df['diff_attrs'] = test_df.progress_apply(lambda row: compute_diff_attrs(row['variantid1'], row['variantid2']), axis=1)

        return test_df
    
    test = diff_attrs(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:02<00:00, 22540.36it/s]


In [22]:
if CONFIG.get('n_len_diff', False):
    def n_len_diff(test_df, text_df):
        
        def compute_n_len_diff(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'name']
            name2 = text_df.at[variantid2, 'name']
                
            if len(name1) < 3  or len(name2) < 3 :
                if len(name1) < 3  and len(name2) < 3 :
                    return -2
                else:
                    return -1
            
            return abs(len(name1) - len(name2)) / max(len(name1), len(name2))

        tqdm.pandas(desc="Processing test_df")
        test_df['n_len_diff'] = test_df.progress_apply(lambda row: compute_n_len_diff(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df

    test = n_len_diff(test, text)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 32396.58it/s]


In [23]:
if CONFIG.get('d_len_diff', False):
    def d_len_diff(test_df, text_df):
        
        def compute_d_len_diff(variantid1, variantid2):
            
            desc1 = text_df.at[variantid1, 'description']
            desc2 = text_df.at[variantid2, 'description']
            
            if desc1 is None or desc2 is None:
                if desc1 is None and desc2 is None:
                    return -2
                else:
                    return -1
                
            if len(desc1.replace(' ', '')) == 0  or len(desc2.replace(' ', '')) == 0:
                if len(desc1.replace(' ', '')) == 0  and len(desc2.replace(' ', '')) == 0:
                    return -2
                else:
                    return -1
                
            return abs(len(desc1) - len(desc2)) / max(len(desc1), len(desc2))

        tqdm.pandas(desc="Processing test_df")
        test_df['d_len_diff'] = test_df.progress_apply(lambda row: compute_d_len_diff(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df

    test = d_len_diff(test, text)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 29691.89it/s]


In [24]:
if CONFIG.get('n_lev', False):
    def n_lev(test_df, text_df):
        
        def compute_n_lev(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'name']
            name2 = text_df.at[variantid2, 'name']

            if len(name1) < 3  or len(name2) < 3 :
                if len(name1) < 3  and len(name2) < 3 :
                    return -2
                else:
                    return -1
                
            return Levenshtein.distance(name1, name2) / max(len(name1), len(name2))

        tqdm.pandas(desc="Processing test_df")
        test_df['n_lev'] = test_df.progress_apply(lambda row: compute_n_lev(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df
    
    test = n_lev(test, text)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 35445.17it/s]


In [25]:
if CONFIG.get('d_lev', False):
    def d_lev(test_df, text_df):
        
        def compute_d_lev(variantid1, variantid2):
            
            desc1 = text_df.at[variantid1, 'description']
            desc2 = text_df.at[variantid2, 'description']
            
            if desc1 is None or desc2 is None:
                if desc1 is None and desc2 is None:
                    return -2
                else:
                    return -1
            
            if len(desc1.replace(' ', '')) == 0  or len(desc2.replace(' ', '')) == 0:
                if len(desc1.replace(' ', '')) == 0  and len(desc2.replace(' ', '')) == 0:
                    return -2
                else:
                    return -1
                
            return Levenshtein.distance(desc1, desc2) / max(len(desc1), len(desc2))
    
        tqdm.pandas(desc="Processing test_df")
        test_df['d_lev'] = test_df.progress_apply(lambda row: compute_d_lev(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df
    
    test = d_lev(test, text)

Processing test_df: 100%|██████████| 49620/49620 [00:03<00:00, 12633.10it/s]


In [26]:
if CONFIG.get('n_jac_symbs', False):
    def n_jac_symbs(test_df, text_df):
        
        def compute_n_jac_symbs(variantid1, variantid2):
            
            symbols1 = set(text_df.at[variantid1, 'name'])
            symbols2 = set(text_df.at[variantid2, 'name'])
            
            if len(symbols1) == 0 or len(symbols2) == 0:
                if len(symbols1) == 0 and len(symbols2) == 0:
                    return -2
                else:
                    return -1
            
            intersection = len(symbols1.intersection(symbols2))
            union = len(symbols1.union(symbols2))
            
            return intersection / union if union != 0 else 0

        tqdm.pandas(desc="Processing test_df")
        test_df['n_jac_symbs'] = test_df.progress_apply(lambda row: compute_n_jac_symbs(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df
    
    test = n_jac_symbs(test, text)

Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 26548.22it/s]


In [27]:
if CONFIG.get('d_jac_symbs', False):
    def d_jac_symbs(test_df, text_df):
        
        def compute_d_jac_symbs(variantid1, variantid2):
            
            desc1 = text_df.at[variantid1, 'description']
            desc2 = text_df.at[variantid2, 'description']
            
            if desc1 is None or desc2 is None:
                if desc1 is None and desc2 is None:
                    return -2
                else:
                    return -1
            
            symbols1 = set(desc1)
            symbols2 = set(desc2)
            
            if len(symbols1) == 0 or len(symbols2) == 0:
                if len(symbols1) == 0 and len(symbols2) == 0: 
                    return -2
                else:
                    return -1
            
            intersection = len(symbols1.intersection(symbols2))
            union = len(symbols1.union(symbols2))
            
            return intersection / union if union != 0 else 0

        tqdm.pandas(desc="Processing test_df")
        test_df['d_jac_symbs'] = test_df.progress_apply(lambda row: compute_d_jac_symbs(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df
    
    test = d_jac_symbs(test, text)

Processing test_df: 100%|██████████| 49620/49620 [00:04<00:00, 10770.15it/s]


In [28]:
if CONFIG.get('n_jac', False):
    def n_jac(test_df, text_df):
        
        def create_bigrams(sentence):
            bigrams = []
            for i in range(len(sentence) - 1):
                bigrams.append((sentence[i], sentence[i + 1]))
            return bigrams
        
        def filter_bigrams(bigrams):
            return [(w1, w2) for (w1, w2) in bigrams if re.match(r'\d+', w1)]
        
        def compute_jac_value(tokens1, tokens2):
            
            if len(tokens1) == 0  or len(tokens2) == 0 :
                if len(tokens1) == 0  and len(tokens2) == 0 :
                    return -4
                else:
                    return -3
            
            intersection = len(tokens1.intersection(tokens2))
            union = len(tokens1.union(tokens2))
            
            return intersection / union if union > 0 else 0
        
        def compute_jacm_value(tokens1, tokens2):
            
            if len(tokens1) == 0  or len(tokens2) == 0 :
                if len(tokens1) == 0  and len(tokens2) == 0 :
                    return -4
                else:
                    return -3
            
            intersection = len(tokens1.intersection(tokens2))
            # Доля общих элементов относительно множества 1
            ratio1 = intersection / len(tokens1)
            
            # Доля общих элементов относительно множества 2
            ratio2 = intersection / len(tokens2)
            
            # Возвращаем максимальное значение из двух долей
            return max(ratio1, ratio2)
        
        def compute_n_jac(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'name']
            name2 = text_df.at[variantid2, 'name']
            
            if name1 is None:
                name1 = ''
            if name2 is None:
                name2 = ''
            
            if len(name1) < 3  or len(name2) < 3 :
                if len(name1) < 3  and len(name2) < 3 :
                    return -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
                else:
                    return -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
                
            # Токены без пунктуации
            words1 = re.sub(r'[^\w\s]', ' ', name1) # Unicode токены
            words1 = words1.replace('_', ' ') # Удаление _ символов
            words1 = re.sub(r'\s+', ' ', words1) # Удаление лишних пробелов
            words1 = words1.strip().split() # Удаление пробелов в начале и конце    
            #
            words2 = re.sub(r'[^\w\s]', ' ', name2) # Unicode токены
            words2 = words2.replace('_', ' ') # Удаление _ символов
            words2 = re.sub(r'\s+', ' ', words2) # Удаление лишних пробелов
            words2 = words2.strip().split() # Удаление пробелов в начале и конце    
            #
            n_jac_words = compute_jac_value(set(words1), set(words2))
            n_jacm_words = compute_jacm_value(set(words1), set(words2))
            
            
            ru_words1 = re.sub(r'[^а-яА-Я]', ' ', name1) # Получение токенов с ру символами
            ru_words1 = re.sub(r'\s+', ' ', ru_words1) # Удаление лишних пробелов
            ru_words1 = ru_words1.strip().split() # Удаление пробелов в начале и конце
            #
            ru_words2 = re.sub(r'[^а-яА-Я]', ' ', name2) # Получение токенов с ру символами
            ru_words2 = re.sub(r'\s+', ' ', ru_words2) # Удаление лишних пробелов
            ru_words2 = ru_words2.strip().split() # Удаление пробелов в начале и конце
            #
            n_jac_ru_words = compute_jac_value(set(ru_words1), set(ru_words2))
            n_jacm_ru_words = compute_jacm_value(set(ru_words1), set(ru_words2))
            
            
            en_words1 = re.sub(r'[^a-zA-Z]', ' ', name1) # Получение токенов с ру символами
            en_words1 = re.sub(r'\s+', ' ', en_words1) # Удаление лишних пробелов
            en_words1 = en_words1.strip().split() # Удаление пробелов в начале и конце
            #
            en_words2 = re.sub(r'[^a-zA-Z]', ' ', name2) # Получение токенов с ру символами
            en_words2 = re.sub(r'\s+', ' ', en_words2) # Удаление лишних пробелов
            en_words2 = en_words2.strip().split() # Удаление пробелов в начале и конце
            #
            n_jac_en_words = compute_jac_value(set(en_words1), set(en_words2))
            n_jacm_en_words = compute_jacm_value(set(en_words1), set(en_words2))
            
            
            ru_comb_words1 = re.sub(r'[^\w\s]', ' ', name1) # Unicode токены
            ru_comb_words1 = ru_comb_words1.replace('_', ' ') # Удаление _ символов
            ru_comb_words1 = ' '.join(re.findall(r'\b(?=\w*[а-яА-Я])(?=\w*[^\Wа-яА-Я])\w+\b', ru_comb_words1))
            ru_comb_words1 = re.sub(r'\s+', ' ', ru_comb_words1) # Удаление лишних пробелов
            ru_comb_words1 = ru_comb_words1.strip().split() # Удаление пробелов в начале и конце
            #
            ru_comb_words2 = re.sub(r'[^\w\s]', ' ', name2) # Unicode токены
            ru_comb_words2 = ru_comb_words2.replace('_', ' ') # Удаление _ символов
            ru_comb_words2 = ' '.join(re.findall(r'\b(?=\w*[а-яА-Я])(?=\w*[^\Wа-яА-Я])\w+\b', ru_comb_words2))
            ru_comb_words2 = re.sub(r'\s+', ' ', ru_comb_words2) # Удаление лишних пробелов
            ru_comb_words2 = ru_comb_words2.strip().split() # Удаление пробелов в начале и конце
            #
            n_jac_ru_comb_words = compute_jac_value(set(ru_comb_words1), set(ru_comb_words2))
            n_jacm_ru_comb_words = compute_jacm_value(set(ru_comb_words1), set(ru_comb_words2))
            
            
            en_comb_words1 = re.sub(r'[^\w\s]', ' ', name1) # Unicode токены
            en_comb_words1 = en_comb_words1.replace('_', ' ') # Удаление _ символов
            en_comb_words1 = ' '.join(re.findall(r'\b(?=\w*[a-zA-Z])(?=\w*[^\Wa-zA-Z])\w+\b', en_comb_words1))
            en_comb_words1 = re.sub(r'\s+', ' ', en_comb_words1) # Удаление лишних пробелов
            en_comb_words1 = en_comb_words1.strip().split() # Удаление пробелов в начале и конце
            #
            en_comb_words2 = re.sub(r'[^\w\s]', ' ', name2) # Unicode токены
            en_comb_words2 = en_comb_words2.replace('_', ' ') # Удаление _ символов
            en_comb_words2 = ' '.join(re.findall(r'\b(?=\w*[a-zA-Z])(?=\w*[^\Wa-zA-Z])\w+\b', en_comb_words2))
            en_comb_words2 = re.sub(r'\s+', ' ', en_comb_words2) # Удаление лишних пробелов
            en_comb_words2 = en_comb_words2.strip().split() # Удаление пробелов в начале и конце
            #
            n_jac_en_comb_words = compute_jac_value(set(en_comb_words1), set(en_comb_words2))
            n_jacm_en_comb_words = compute_jacm_value(set(en_comb_words1), set(en_comb_words2))
            
            
            numbers1 = re.sub(r'[^\d]', ' ', name1) # Получение токенов с цифрами
            numbers1 = re.sub(r'\s+', ' ', numbers1) # Удаление лишних пробелов
            numbers1 = numbers1.strip().split() # Удаление пробелов в начале и конце
            #
            numbers2 = re.sub(r'[^\d]', ' ', name2) # Получение токенов с цифрами
            numbers2 = re.sub(r'\s+', ' ', numbers2) # Удаление лишних пробелов
            numbers2 = numbers2.strip().split() # Удаление пробелов в начале и конце
            #
            n_jac_numbers = compute_jac_value(set(numbers1), set(numbers2))
            n_jacm_numbers = compute_jacm_value(set(numbers1), set(numbers2))
            
        
            bigrams1 = create_bigrams(words1)
            bigrams2 = create_bigrams(words2)
            #
            filtered_bigrams1 = filter_bigrams(bigrams1)
            filtered_bigrams2 = filter_bigrams(bigrams2)
            #
            length = 0
            counter = 0
            #
            for w1, w2 in filtered_bigrams1:
                for x1, x2 in filtered_bigrams2:
                    if w2 == x2:
                        length += 1
                        if w1 == x1:
                            counter += 1
            #
            if length != 0:
                num_words = counter / length
            else:
                num_words = -3
            
            words1 = re.sub(r'[^\w\s]', ' ', name1) # Unicode токены
            words1 = words1.replace('_', ' ') # Удаление _ символов
            words1 = ' '.join(re.findall(r'\b\d{4}\b', words1))
            words1 = re.sub(r'\s+', ' ', words1) # Удаление лишних пробелов
            words1 = set(words1.strip().split()) # Удаление пробелов в начале и конце
            #
            words2 = re.sub(r'[^\w\s]', ' ', name2) # Unicode токены
            words2 = words2.replace('_', ' ') # Удаление _ символов
            words2 = ' '.join(re.findall(r'\b\d{4}\b', words2))
            words2 = re.sub(r'\s+', ' ', words2) # Удаление лишних пробелов
            words2 = set(words2.strip().split()) # Удаление пробелов в начале и конце
            #
            four_digits = compute_jac_value(words1, words2)
            mfour_digits = compute_jacm_value(words1, words2)
            
            matches1 = re.findall(r"[+-]?\d+(?:[.,]\d+)?", name1)
            matches2 = re.findall(r"[+-]?\d+(?:[.,]\d+)?", name2)
            #
            floats = compute_jac_value(set(matches1), set(matches2))
            mfloats = compute_jac_value(set(matches1), set(matches2))
            
            return n_jac_words, n_jac_ru_words, n_jac_en_words, n_jac_ru_comb_words, n_jac_en_comb_words, n_jac_numbers, num_words, four_digits, floats, n_jacm_words, n_jacm_ru_words, n_jacm_en_words, n_jacm_ru_comb_words, n_jacm_en_comb_words, n_jacm_numbers, mfour_digits, mfloats
    

        tqdm.pandas(desc='Processing test')
        test_df[['n_jac_words', 'n_jac_ru_words', 'n_jac_en_words', 'n_jac_ru_comb_words', 'n_jac_en_comb_words', 'n_jac_numbers', 'n_num_words', 'n_four_digits', 'n_floats', 'n_jacm_words', 'n_jacm_ru_words', 'n_jacm_en_words', 'n_jacm_ru_comb_words', 'n_jacm_en_comb_words', 'n_jacm_numbers', 'n_mfour_digits', 'n_mfloats']] = test_df.progress_apply(lambda row: pd.Series(compute_n_jac(row['variantid1'], row['variantid2'])), axis=1)
        
        return test_df
    
    test = n_jac(test, text)

Processing test: 100%|██████████| 49620/49620 [00:14<00:00, 3410.28it/s]


In [29]:
if CONFIG.get('d_jac', False):
    def d_jac(test_df, text_df):
        
        def create_bigrams(sentence):
            bigrams = []
            for i in range(len(sentence) - 1):
                bigrams.append((sentence[i], sentence[i + 1]))
            return bigrams
        
        def filter_bigrams(bigrams):
            return [(w1, w2) for (w1, w2) in bigrams if re.match(r'\d+', w1)]
        
        def compute_jac_value(tokens1, tokens2):
            
            if len(tokens1) == 0  or len(tokens2) == 0 :
                if len(tokens1) == 0  and len(tokens2) == 0 :
                    return -4
                else:
                    return -3
            
            intersection = len(tokens1.intersection(tokens2))
            union = len(tokens1.union(tokens2))
            
            return intersection / union if union > 0 else 0
        
        def compute_jacm_value(tokens1, tokens2):
            
            if len(tokens1) == 0  or len(tokens2) == 0 :
                if len(tokens1) == 0  and len(tokens2) == 0 :
                    return -4
                else:
                    return -3
            
            intersection = len(tokens1.intersection(tokens2))
            # Доля общих элементов относительно множества 1
            ratio1 = intersection / len(tokens1)
            
            # Доля общих элементов относительно множества 2
            ratio2 = intersection / len(tokens2)
            
            # Возвращаем максимальное значение из двух долей
            return max(ratio1, ratio2)
        
        def compute_d_jac(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'description']
            name2 = text_df.at[variantid2, 'description']
            
            if name1 is None:
                name1 = ''
            if name2 is None:
                name2 = ''
            
            if len(name1) < 3  or len(name2) < 3 :
                if len(name1) < 3  and len(name2) < 3 :
                    return -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
                else:
                    return -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
                
            # Токены без пунктуации
            words1 = re.sub(r'[^\w\s]', ' ', name1) # Unicode токены
            words1 = words1.replace('_', ' ') # Удаление _ символов
            words1 = re.sub(r'\s+', ' ', words1) # Удаление лишних пробелов
            words1 = words1.strip().split() # Удаление пробелов в начале и конце    
            #
            words2 = re.sub(r'[^\w\s]', ' ', name2) # Unicode токены
            words2 = words2.replace('_', ' ') # Удаление _ символов
            words2 = re.sub(r'\s+', ' ', words2) # Удаление лишних пробелов
            words2 = words2.strip().split() # Удаление пробелов в начале и конце    
            #
            n_jac_words = compute_jac_value(set(words1), set(words2))
            n_jacm_words = compute_jacm_value(set(words1), set(words2))
            
            
            ru_words1 = re.sub(r'[^а-яА-Я]', ' ', name1) # Получение токенов с ру символами
            ru_words1 = re.sub(r'\s+', ' ', ru_words1) # Удаление лишних пробелов
            ru_words1 = ru_words1.strip().split() # Удаление пробелов в начале и конце
            #
            ru_words2 = re.sub(r'[^а-яА-Я]', ' ', name2) # Получение токенов с ру символами
            ru_words2 = re.sub(r'\s+', ' ', ru_words2) # Удаление лишних пробелов
            ru_words2 = ru_words2.strip().split() # Удаление пробелов в начале и конце
            #
            n_jac_ru_words = compute_jac_value(set(ru_words1), set(ru_words2))
            n_jacm_ru_words = compute_jacm_value(set(ru_words1), set(ru_words2))
            
            
            en_words1 = re.sub(r'[^a-zA-Z]', ' ', name1) # Получение токенов с ру символами
            en_words1 = re.sub(r'\s+', ' ', en_words1) # Удаление лишних пробелов
            en_words1 = en_words1.strip().split() # Удаление пробелов в начале и конце
            #
            en_words2 = re.sub(r'[^a-zA-Z]', ' ', name2) # Получение токенов с ру символами
            en_words2 = re.sub(r'\s+', ' ', en_words2) # Удаление лишних пробелов
            en_words2 = en_words2.strip().split() # Удаление пробелов в начале и конце
            #
            n_jac_en_words = compute_jac_value(set(en_words1), set(en_words2))
            n_jacm_en_words = compute_jacm_value(set(en_words1), set(en_words2))
            
            
            ru_comb_words1 = re.sub(r'[^\w\s]', ' ', name1) # Unicode токены
            ru_comb_words1 = ru_comb_words1.replace('_', ' ') # Удаление _ символов
            ru_comb_words1 = ' '.join(re.findall(r'\b(?=\w*[а-яА-Я])(?=\w*[^\Wа-яА-Я])\w+\b', ru_comb_words1))
            ru_comb_words1 = re.sub(r'\s+', ' ', ru_comb_words1) # Удаление лишних пробелов
            ru_comb_words1 = ru_comb_words1.strip().split() # Удаление пробелов в начале и конце
            #
            ru_comb_words2 = re.sub(r'[^\w\s]', ' ', name2) # Unicode токены
            ru_comb_words2 = ru_comb_words2.replace('_', ' ') # Удаление _ символов
            ru_comb_words2 = ' '.join(re.findall(r'\b(?=\w*[а-яА-Я])(?=\w*[^\Wа-яА-Я])\w+\b', ru_comb_words2))
            ru_comb_words2 = re.sub(r'\s+', ' ', ru_comb_words2) # Удаление лишних пробелов
            ru_comb_words2 = ru_comb_words2.strip().split() # Удаление пробелов в начале и конце
            #
            n_jac_ru_comb_words = compute_jac_value(set(ru_comb_words1), set(ru_comb_words2))
            n_jacm_ru_comb_words = compute_jacm_value(set(ru_comb_words1), set(ru_comb_words2))
            
            
            en_comb_words1 = re.sub(r'[^\w\s]', ' ', name1) # Unicode токены
            en_comb_words1 = en_comb_words1.replace('_', ' ') # Удаление _ символов
            en_comb_words1 = ' '.join(re.findall(r'\b(?=\w*[a-zA-Z])(?=\w*[^\Wa-zA-Z])\w+\b', en_comb_words1))
            en_comb_words1 = re.sub(r'\s+', ' ', en_comb_words1) # Удаление лишних пробелов
            en_comb_words1 = en_comb_words1.strip().split() # Удаление пробелов в начале и конце
            #
            en_comb_words2 = re.sub(r'[^\w\s]', ' ', name2) # Unicode токены
            en_comb_words2 = en_comb_words2.replace('_', ' ') # Удаление _ символов
            en_comb_words2 = ' '.join(re.findall(r'\b(?=\w*[a-zA-Z])(?=\w*[^\Wa-zA-Z])\w+\b', en_comb_words2))
            en_comb_words2 = re.sub(r'\s+', ' ', en_comb_words2) # Удаление лишних пробелов
            en_comb_words2 = en_comb_words2.strip().split() # Удаление пробелов в начале и конце
            #
            n_jac_en_comb_words = compute_jac_value(set(en_comb_words1), set(en_comb_words2))
            n_jacm_en_comb_words = compute_jacm_value(set(en_comb_words1), set(en_comb_words2))
            
            
            numbers1 = re.sub(r'[^\d]', ' ', name1) # Получение токенов с цифрами
            numbers1 = re.sub(r'\s+', ' ', numbers1) # Удаление лишних пробелов
            numbers1 = numbers1.strip().split() # Удаление пробелов в начале и конце
            #
            numbers2 = re.sub(r'[^\d]', ' ', name2) # Получение токенов с цифрами
            numbers2 = re.sub(r'\s+', ' ', numbers2) # Удаление лишних пробелов
            numbers2 = numbers2.strip().split() # Удаление пробелов в начале и конце
            #
            n_jac_numbers = compute_jac_value(set(numbers1), set(numbers2))
            n_jacm_numbers = compute_jacm_value(set(numbers1), set(numbers2))
            
        
            bigrams1 = create_bigrams(words1)
            bigrams2 = create_bigrams(words2)
            #
            filtered_bigrams1 = filter_bigrams(bigrams1)
            filtered_bigrams2 = filter_bigrams(bigrams2)
            #
            length = 0
            counter = 0
            #
            for w1, w2 in filtered_bigrams1:
                for x1, x2 in filtered_bigrams2:
                    if w2 == x2:
                        length += 1
                        if w1 == x1:
                            counter += 1
            #
            if length != 0:
                num_words = counter / length
            else:
                num_words = -3
            
            words1 = re.sub(r'[^\w\s]', ' ', name1) # Unicode токены
            words1 = words1.replace('_', ' ') # Удаление _ символов
            words1 = ' '.join(re.findall(r'\b\d{4}\b', words1))
            words1 = re.sub(r'\s+', ' ', words1) # Удаление лишних пробелов
            words1 = set(words1.strip().split()) # Удаление пробелов в начале и конце
            #
            words2 = re.sub(r'[^\w\s]', ' ', name2) # Unicode токены
            words2 = words2.replace('_', ' ') # Удаление _ символов
            words2 = ' '.join(re.findall(r'\b\d{4}\b', words2))
            words2 = re.sub(r'\s+', ' ', words2) # Удаление лишних пробелов
            words2 = set(words2.strip().split()) # Удаление пробелов в начале и конце
            #
            four_digits = compute_jac_value(words1, words2)
            mfour_digits = compute_jacm_value(words1, words2)
            
            matches1 = re.findall(r"[+-]?\d+(?:[.,]\d+)?", name1)
            matches2 = re.findall(r"[+-]?\d+(?:[.,]\d+)?", name2)
            #
            floats = compute_jac_value(set(matches1), set(matches2))
            mfloats = compute_jac_value(set(matches1), set(matches2))
            
            return n_jac_words, n_jac_ru_words, n_jac_en_words, n_jac_ru_comb_words, n_jac_en_comb_words, n_jac_numbers, num_words, four_digits, floats, n_jacm_words, n_jacm_ru_words, n_jacm_en_words, n_jacm_ru_comb_words, n_jacm_en_comb_words, n_jacm_numbers, mfour_digits, mfloats

        tqdm.pandas(desc='Processing test')
        test_df[['d_jac_words', 'd_jac_ru_words', 'd_jac_en_words', 'd_jac_ru_comb_words', 'd_jac_en_comb_words', 'd_jac_numbers', 'd_num_words', 'd_four_digits', 'd_floats', 'd_jacm_words', 'd_jacm_ru_words', 'd_jacm_en_words', 'd_jacm_ru_comb_words', 'd_jacm_en_comb_words', 'd_jacm_numbers', 'd_mfour_digits', 'd_mfloats']] = test_df.progress_apply(lambda row: pd.Series(compute_d_jac(row['variantid1'], row['variantid2'])), axis=1)
        
        return test_df
    
    test = d_jac(test, text)

Processing test: 100%|██████████| 49620/49620 [00:40<00:00, 1221.46it/s]


In [30]:
if CONFIG.get('n_lev_opers', False):
    def n_lev_opers(test_df, text_df):
        
        def compute_n_lev_opers(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'name']
            name2 = text_df.at[variantid2, 'name']
            
            if len(name1) < 3  or len(name2) < 3 :
                if len(name1) < 3  and len(name2) < 3 :
                    return -2, -2
                else:
                    return -1, -1
                
            operations = Levenshtein.editops(name1, name2)
            
            insertions = sum(1 for op in operations if op[0] == 'insert')
            deletions = sum(1 for op in operations if op[0] == 'delete')
            replaces = sum(1 for op in operations if op[0] == 'replace')
            
            n_lev_var = insertions + deletions
            
            max_len = max(len(name1), len(name2))
            
            return n_lev_var / max_len, replaces / max_len

        tqdm.pandas(desc="Processing test_df")
        test_df[['n_lev_var', 'n_lev_rep']] = test_df.progress_apply(
            lambda row: pd.Series(compute_n_lev_opers(row['variantid1'], row['variantid2'])), axis=1)
        
        return test_df
    
    test = n_lev_opers(test, text)

Processing test_df: 100%|██████████| 49620/49620 [00:03<00:00, 13297.22it/s]


In [31]:
if CONFIG.get('d_lev_opers', False):
    def d_lev_opers(test_df, text_df):
        
        def compute_d_lev_opers(variantid1, variantid2):
            
            desc1 = text_df.at[variantid1, 'description']
            desc2 = text_df.at[variantid2, 'description']
            
            if desc1 is None or desc2 is None:
                if desc1 is None and desc2 is None:
                    return -2, -2
                else:
                    return -1, -1
            
            if len(desc1.replace(' ', '')) == 0  or len(desc2.replace(' ', ''))  == 0:
                if len(desc1.replace(' ', '')) == 0  and len(desc2.replace(' ', ''))  == 0:
                    return -2, -2
                else:
                    return -1, -1
                
            operations = Levenshtein.editops(desc1, desc2)
            
            insertions = sum(1 for op in operations if op[0] == 'insert')
            deletions = sum(1 for op in operations if op[0] == 'delete')
            replaces = sum(1 for op in operations if op[0] == 'replace')
            
            d_lev_var = insertions + deletions
            
            max_len = max(len(desc1), len(desc2))
            
            return d_lev_var / max_len, replaces / max_len
    
        tqdm.pandas(desc="Processing test_df")
        test_df[['d_lev_var', 'd_lev_rep']] = test_df.progress_apply(
            lambda row: pd.Series(compute_d_lev_opers(row['variantid1'], row['variantid2'])), axis=1)
        
        return test_df
    
    test = d_lev_opers(test, text)

Processing test_df: 100%|██████████| 49620/49620 [00:08<00:00, 5699.66it/s]


In [32]:
if CONFIG.get('m_cos', False):
    def m_cos(test_df, resnet_df):
        
        def compute_m_cos(variantid1, variantid2):
            
            embedding_1 = resnet_df.at[variantid1, 'main_pic_embeddings_resnet_v1'][0]
            embedding_2 = resnet_df.at[variantid2, 'main_pic_embeddings_resnet_v1'][0]
            
            return cosine_similarity([embedding_1], [embedding_2])[0,0]

        tqdm.pandas(desc="Processing test_df")
        test_df['m_cos'] = test_df.progress_apply(lambda row: compute_m_cos(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df
        
    test = m_cos(test, resnet)

Processing test_df: 100%|██████████| 49620/49620 [00:09<00:00, 5362.33it/s]


In [33]:
if CONFIG.get('m_evklid', False):
    def m_evklid(test_df, resnet_df):
        
        def compute_m_evklid(variantid1, variantid2):
            
            embedding_1 = resnet_df.at[variantid1, 'main_pic_embeddings_resnet_v1'][0]
            embedding_2 = resnet_df.at[variantid2, 'main_pic_embeddings_resnet_v1'][0]
            
            return euclidean_distances([embedding_1], [embedding_2])[0, 0]

        tqdm.pandas(desc="Processing test_df")
        test_df['m_evklid'] = test_df.progress_apply(lambda row: compute_m_evklid(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df
        
    test = m_evklid(test, resnet)

Processing test_df: 100%|██████████| 49620/49620 [00:06<00:00, 7328.69it/s]


In [34]:
if CONFIG.get('e_jac', False):
    def e_jac(test_df, resnet_df):
        
        def calculate_e_jac(v1, v2):
            embs1 = resnet_df.at[v1, 'pic_embeddings_resnet_v1']
            embs2 = resnet_df.at[v2, 'pic_embeddings_resnet_v1'] 
            
            if embs1 is None or embs2 is None:
                if embs1 is None and embs2 is None:
                    return -2
                else:
                    return -1
            
            set1 = set(tuple(emb) for emb in embs1)
            set2 = set(tuple(emb) for emb in embs2)
            
            intersection = set1.intersection(set2)
            union = set1.union(set2)
            
            if not union:
                return 0

            return len(intersection) / len(union)

        tqdm.pandas(desc="Processing test_df")
        test_df['e_jac'] = test_df.progress_apply(lambda row: calculate_e_jac(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df
        
    test = e_jac(test, resnet)

Processing test_df: 100%|██████████| 49620/49620 [00:07<00:00, 6823.65it/s]


In [35]:
if CONFIG.get('e_diff', False):
    def e_diff(test_df, resnet_df):
        
        def count_e_diff(variantid1, variantid2):
            emb1 = resnet_df.at[variantid1, 'pic_embeddings_resnet_v1']
            emb2 = resnet_df.at[variantid2, 'pic_embeddings_resnet_v1']
            
            if emb1 is None or emb2 is None:
                if emb1 is None and emb2 is None:
                    return -2
                else:
                    return -1
            
            return abs(len(emb1) - len(emb2)) / max(len(emb1), len(emb2))

        tqdm.pandas(desc="Processing test_df")
        test_df['e_diff'] = test_df.progress_apply(lambda row: count_e_diff(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df
        
    test = e_diff(test, resnet)

Processing test_df: 100%|██████████| 49620/49620 [00:00<00:00, 88332.13it/s]


In [36]:
if CONFIG.get('m_ent_diff', False):
    def m_ent_diff(test_df, resnet_df):
        
        def compute_entropy_diff(embedding):
            
            norm_embedding = np.abs(embedding) / np.sum(np.abs(embedding))
            
            return entropy(norm_embedding)
    
        def compute_m_ent_diff(variantid1, variantid2):
            embedding1 = resnet_df.at[variantid1, 'main_pic_embeddings_resnet_v1'][0]
            embedding2 = resnet_df.at[variantid2, 'main_pic_embeddings_resnet_v1'][0]
    
            entropy1 = compute_entropy_diff(embedding1)
            entropy2 = compute_entropy_diff(embedding2)
            
            return abs(entropy1 - entropy2)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['m_ent_diff'] = test_df.progress_apply(lambda row: compute_m_ent_diff(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df
    
    test = m_ent_diff(test, resnet)

Processing test_df: 100%|██████████| 49620/49620 [00:14<00:00, 3394.99it/s]


In [37]:
if CONFIG.get('e_avg_cos', False):
    def e_avg_cos(test_df, resnet_df):
        
        def average_embedding(embeddings):
            return np.mean(np.array(embeddings, copy=False).copy(), axis=0)


        def calculate_e_avg_cos(v1, v2):
            emb1 = resnet_df.at[v1, 'pic_embeddings_resnet_v1']
            emb2 = resnet_df.at[v2, 'pic_embeddings_resnet_v1'] 
            
            if emb1 is None or emb2 is None:
                if emb1 is None and emb2 is None:
                    return -2
                else:
                    return -1
            
            return cosine_similarity([average_embedding(emb1)], [average_embedding(emb2)])[0, 0]

        tqdm.pandas(desc="Processing test_df")
        test_df['e_avg_cos'] = test_df.progress_apply(lambda row: calculate_e_avg_cos(row['variantid1'], row['variantid2']), axis=1)
        
        return test_df
        
    test = e_avg_cos(test, resnet)

Processing test_df: 100%|██████████| 49620/49620 [00:07<00:00, 7077.00it/s]


In [39]:
with open('my_data/top_attributes_per_category.pkl', 'rb') as f:
    top_attributes_per_category = pickle.load(f)

In [40]:
n_samples = 300
def attrs300(test_df, attributes_df):
        
    # Функция для вычисления значений атрибутов
    def compute_attrs300(variantid1, variantid2, n_samples=n_samples):
        attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
        attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
        
        cat1 = json.loads(attributes_df.at[variantid1, 'categories'])['2']
        
        # Получение 10 наиболее частых атрибутов для данной категории
        top_attributes = top_attributes_per_category.get(cat1, [])
        
        # Инициализация результатов
        result = [-2] * n_samples # Значение по умолчанию для отсутствующих атрибутов
        
        # Сравнение значений атрибутов
        for i, attr in enumerate(top_attributes):
            if attr in attrs1 or attr in attrs2:
                if attr in attrs1 and attr in attrs2:
                    vals1 = attrs1[attr]
                    vals2 = attrs2[attr]
                    if len(vals1) == 1 and len(vals2) == 1:
                        if bool(re.match(r'^[-+]?\d*\.?\d+([eE][-+]?\d+)?$', attrs1[attr][0])) and bool(re.match(r'^[-+]?\d*\.?\d+([eE][-+]?\d+)?$', attrs2[attr][0])):
                            if vals1 == vals2:
                                result[i] = 1 
                            else:
                                val1 = float(vals1[0])
                                val2 = float(vals2[0])
                                 # Используем абсолютное значение разности
                                abs_diff = abs(val1 - val2)
                                # Нормируем разницу, чтобы она находилась в диапазоне от 0 до 1
                                # В этом случае устанавливаем максимальное значение для нормализации.
                                max_diff = max(abs(val1), abs(val2))
                                result[i] = 1 - (abs_diff / (max_diff if max_diff != 0 else 1))
                                
                    else:
                        val1 = ' '.join(vals1)
                        val2 = ' '.join(vals2)
                        
                        # Токены без пунктуации
                        words1 = re.sub(r'[^\w\s]', ' ', val1) # Unicode токены
                        words1 = words1.replace('_', ' ') # Удаление _ символов
                        words1 = re.sub(r'\s+', ' ', words1) # Удаление лишних пробелов
                        tokens1 = set(words1.strip().split()) # Удаление пробелов в начале и конце    
                        #
                        words2 = re.sub(r'[^\w\s]', ' ', val2) # Unicode токены
                        words2 = words2.replace('_', ' ') # Удаление _ символов
                        words2 = re.sub(r'\s+', ' ', words2) # Удаление лишних пробелов
                        tokens2 = set(words2.strip().split()) # Удаление пробелов в начале и конце    
                        
                        if len(tokens1) == 0  or len(tokens2) == 0 :
                            if len(tokens1) == 0 and len(tokens2) == 0:
                                result[i] = -4
                            else:
                                result[i] = -3
                        
                        intersection = len(tokens1.intersection(tokens2))
                        union = len(tokens1.union(tokens2))
                        
                        result[i] = intersection / union if union > 0 else 0
                else:
                    result[i] = -1
        
        return result
    
    tqdm.pandas(desc="Processing test_df")
    test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
        test_df.progress_apply(lambda row: compute_attrs300(row['variantid1'], row['variantid2']), axis=1).tolist(),
        index=test_df.index
    )

    return test_df

# Применение функции к данным
test = attrs300(test, attributes)

Processing test_df: 100%|██████████| 49620/49620 [00:03<00:00, 16346.04it/s]
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_samples + 1)]] = pd.DataFrame(
  test_df[[f'attr{i}' for i in range(1, n_sampl

In [41]:
final_hgb = joblib.load('final_hgb_model.pkl')
hgb_preds = final_hgb.predict_proba(test.drop(columns=['variantid1', 'variantid2']))[:, 1]
submission = pd.DataFrame({
    'variantid1': test['variantid1'],
    'variantid2': test['variantid2'],
    'target': hgb_preds
})
submission.to_csv('submission.csv', index=False)