In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import json
import Levenshtein
from scipy.stats import entropy
import re
from sklearn.metrics.pairwise import euclidean_distances
from collections import defaultdict

In [2]:
dir = 'C:/Users/Николай/PycharmProjects/OZON E-CUP/competition/'

train = pd.read_parquet(f'{dir}train.parquet', engine='pyarrow')

test = pd.read_parquet(f'{dir}test.parquet', engine='pyarrow')

attributes = pd.read_parquet(f'{dir}attributes.parquet', engine='pyarrow')
attributes.set_index('variantid', inplace=True)

text = pd.read_parquet(f'{dir}text_and_bert.parquet', engine='pyarrow')
text.set_index('variantid', inplace=True)

resnet = pd.read_parquet(f'{dir}resnet.parquet', engine='pyarrow')
resnet.set_index('variantid', inplace=True)

In [3]:
# Фильтруем пары, где target == 1 (только матчи)
matched_pairs = train[train['target'] == 1]

# Создаем словарь, где ключ — это товар, а значение — это список "соседей" (пар с target == 1)
neighbor_dict = defaultdict(set)

# Заполняем словарь соседями для каждого товара
for _, row in tqdm(matched_pairs.iterrows(), total=len(matched_pairs)):
    v1, v2 = row['variantid1'], row['variantid2']
    neighbor_dict[v1].add(v2)
    neighbor_dict[v2].add(v1)

# Создаем множество существующих пар для быстрой проверки
existing_pairs = set(tuple(sorted([row['variantid1'], row['variantid2']])) for _, row in train.iterrows())

# Собираем новые пары (v1, v3), которых нет в train
new_pairs = set()
for v1 in tqdm(neighbor_dict, total=len(neighbor_dict)):
    # Для каждого соседа v2 товара v1
    for v2 in neighbor_dict[v1]:
        # Проверяем соседей v3 для v2 (они будут потенциальными v3 для v1)
        for v3 in neighbor_dict[v2]:
            # Исключаем повторение и пары вида (v1, v1)
            if v3 != v1:
                # Сортируем для уникальности (чтобы (v1, v3) и (v3, v1) считались одной и той же парой)
                new_pair = tuple(sorted([v1, v3]))
                # Добавляем пару, если ее еще нет в существующих
                if new_pair not in existing_pairs:
                    new_pairs.add(new_pair)
print(f"Количество новых пар: {len(new_pairs)}")
# Создаем DataFrame из новых пар
new_pairs = pd.DataFrame(new_pairs, columns=['variantid1', 'variantid2'])

# Добавляем колонку target и присваиваем ей значение 1 (так как это новые матчи)
new_pairs['target'] = 1

# Выводим первые несколько строк DataFrame
print(new_pairs.head())
# Приводим колонки variantid1 и variantid2 к типу int
new_pairs['variantid1'] = new_pairs['variantid1'].astype(int)
new_pairs['variantid2'] = new_pairs['variantid2'].astype(int)
train = pd.concat([train, new_pairs], ignore_index=True)
train.reset_index(inplace=True, drop=True)

100%|██████████| 561848/561848 [00:10<00:00, 55430.94it/s]
100%|██████████| 1041973/1041973 [00:00<00:00, 1275364.14it/s]


Количество новых пар: 87276
   variantid1  variantid2  target
0  1042371867  1299689027       1
1   105867545   376138947       1
2   653782378   690190682       1
3  1417344650  1477053699       1
4   191644435   489264789       1


In [4]:
print('len train:', len(train))
merged_df = train.merge(train, left_on=['variantid1', 'variantid2'], right_on=['variantid2', 'variantid1'], suffixes=('_left', '_right'))

wrongs = merged_df[merged_df['target_left'] != merged_df['target_right']]
duplicates = merged_df[merged_df['target_left'] == merged_df['target_right']]
duplicates = duplicates[duplicates['variantid1_left'] > duplicates['variantid2_left']].index

print('len wrongs:', len(wrongs))
train.drop(wrongs.index, inplace=True)
train.reset_index(drop=True, inplace=True)

print('len duplicates:', len(duplicates))
train.drop(duplicates, inplace=True)
train.reset_index(drop=True, inplace=True)

print('len train:', len(train))

del wrongs, duplicates, merged_df

len train: 1255792
len wrongs: 786
len duplicates: 4334
len train: 1250672


In [5]:
cats = attributes['categories'].tolist()

unique_strings = set()
for cat in cats:
    unique_strings.add(json.loads(cat)['2'])
unique_cats2 = {string: idx for idx, string in enumerate(set(unique_strings))}

unique_strings = set()
for cat in cats:
    unique_strings.add(json.loads(cat)['3'])
unique_cats3 = {string: idx for idx, string in enumerate(set(unique_strings))}

In [6]:
common_set = set()
t0 = {}
t1 = {}
for index, row in tqdm(train.iterrows(), total=len(train)):
        
    v1 = row['variantid1']
    v2 = row['variantid2']

    attr1 = json.loads(attributes.loc[v1, 'characteristic_attributes_mapping'])
    attr2 = json.loads(attributes.loc[v2, 'characteristic_attributes_mapping'])
    
    common_attrs = set(attr1.keys()).intersection(set(attr2.keys()))
    
    target = row['target']
    
    if target == 1:
        for key in common_attrs:
            if set(attr1[key]) == set(attr2[key]):
                if key in t1:
                    t1[key][0] += 1
                else:
                    t1[key] = [1, 0]
            else:
                if key in t1:
                    t1[key][1] += 1
                else:
                    t1[key] = [0, 1]

    if target == 0:
        for key in common_attrs:
            if set(attr1[key]) == set(attr2[key]):
                if key in t0:
                    t0[key][0] += 1
                else:
                    t0[key] = [1, 0]
            else:
                if key in t0:
                    t0[key][1] += 1
                else:
                    t0[key] = [0, 1]
                    
vital_keys = []
minor_keys = []
for key in t1:
    if key in t0:
        if t1[key][0] > t1[key][1] and t0[key][0] < t0[key][1] or t1[key][0] < t1[key][1] and t0[key][0] > t0[key][1]:
            vital_keys.append(key)
        else:
            minor_keys.append(key)

100%|██████████| 1250672/1250672 [01:25<00:00, 14642.64it/s]


In [7]:
# categories - attributes / cat - attrs
# name - description / n - d
# main_pics - extra - pics / m - e
CONFIG = {
    'text_process': True, # Составление name, description только для - слов (удаление пунктуации) / русских слов / английских слов / слов содержащих русские символы / слов содержащих английские символы / кобминации цифр 
    
    # attributes
    'cat_fit': True, # Количество одинаковых категорий / 4 
    'cat2': True, # Категориальный признак 2 категории
    'cat3': True, # Категориальный признак 3 категории
    'jac_attrs': True, # Сходство Жаккара для аттрибутов (только ключей) 
    'jac_vals': True, # Среднее сходств Жаккара для значений (для общих ключей)
    'jac_num_vals': True, # Сходство Жаккара для числовых общих значений
    'jac_sev_vals': True, # Сходство Жаккара для общих значений в ключах (значений в ключе > 1)
    'jac_vital_vals':True, # Сходство значений значимых ключей (по статистике выше)
    'jac_minor_vals':True, # Сходство незначений значимых ключей (по статистике выше)
    'diff_attrs': True, # Разность количеств аттрибутов (ключей) / max(len(attrs1), len(attrs2))
    
    # text
    'n_len_diff': True, # Разность количеств символов имен / max(len(name1), len(name2))
    'd_len_diff': True, # Разность количеств символов описаний / max(len(desc1), len(desc2))
    'n_lev': True, # Расстояние Левенштейна между именами / max(len(name1), len(name2))
    'd_lev': True, # Расстояние Левенштейна между описаниями / max(len(desc1), len(desc2))
    'n_jac_symbs': True, # Жаккарово сравнение символов имен
    'd_jac_symbs': True, # Жаккарово сравнение символов описаний
    
    'n_jac': True, # Жаккарово сравнение имен по 6 вариантам токенизации
    'd_jac': True, # Жаккарово сравнение описаний по 6 вариантам токенизации
    'n_lev_opers': True, # Расстояние Левенштейна между именами / max(len(name1), len(name2))
    'd_lev_opers': True, # Расстояние Левенштейна между описаниями / max(len(desc1), len(desc2))
    
    # resnet
    'm_cos': True, # Косинусное сходство между основными эмбеддингами
    'm_evklid': True, # Евклидово расстояние между основными эмбеддингами
    'e_jac': True, # Жаккарово сравнение дополнительных эмбеддингов
    'e_diff': True, # Разница в количестве дополнительных эмбеддингов
    'm_ent_diff': True, # Энтропия (не знаю что за функция) основных эмбеддингов
    'e_avg_cos': True, # Косинусное сходство между средним эмбеддингом дополнительных эмбеддингов

    'n_years': True,
    'd_years': True,
}

In [8]:
if CONFIG.get('text_process', False):
    def text_process(sentence):
        if sentence is None:
            return None, None, None, None
        # Общая обработка
        sentence = sentence.replace('\n', ' ').lower()
        
        # Токены без пунктуации
        words = re.sub(r'[^\w\s]', ' ', sentence) # Unicode токены
        words = words.replace('_', ' ') # Удаление _ символов
        words = re.sub(r'\s+', ' ', words) # Удаление лишних пробелов
        words = words.strip() # Удаление пробелов в начале и конце
        
        # Токенов, содержащих только русские буквы
        ru_words = re.sub(r'[^а-яА-Я]', ' ', sentence) # Получение токенов с ру символами
        ru_words = re.sub(r'\s+', ' ', ru_words) # Удаление лишних пробелов
        ru_words = ru_words.strip() # Удаление пробелов в начале и конце
        
        # Токенов, содержащих только английские буквы
        en_words = re.sub(r'[^a-zA-Z]', ' ', sentence) # Получение токенов с ру символами
        en_words = re.sub(r'\s+', ' ', en_words) # Удаление лишних пробелов
        en_words = en_words.strip() # Удаление пробелов в начале и конце
        
        # Токены, где содержатся ру символы (токены не состоят целиком из ру символов - testовое)
        ru_comb_words = re.sub(r'[^\w\s]', ' ', sentence) # Unicode токены
        ru_comb_words = ru_comb_words.replace('_', ' ') # Удаление _ символов
        ru_comb_words = ' '.join(re.findall(r'\b(?=\w*[а-яА-Я])(?=\w*[^\Wа-яА-Я])\w+\b', ru_comb_words))
        ru_comb_words = re.sub(r'\s+', ' ', ru_comb_words) # Удаление лишних пробелов
        ru_comb_words = ru_comb_words.strip() # Удаление пробелов в начале и конце
        
        # Токены, где содержатся en символы (токены не состоят целиком из en символов - testовое)
        en_comb_words = re.sub(r'[^\w\s]', ' ', sentence) # Unicode токены
        en_comb_words = en_comb_words.replace('_', ' ') # Удаление _ символов
        en_comb_words = ' '.join(re.findall(r'\b(?=\w*[a-zA-Z])(?=\w*[^\Wa-zA-Z])\w+\b', en_comb_words))
        en_comb_words = re.sub(r'\s+', ' ', en_comb_words) # Удаление лишних пробелов
        en_comb_words = en_comb_words.strip() # Удаление пробелов в начале и конце
         
        # Токены в виде цифр
        numbers = re.sub(r'[^\d]', ' ', sentence) # Получение токенов с цифрами
        numbers = re.sub(r'\s+', ' ', numbers) # Удаление лишних пробелов
        numbers = numbers.strip() # Удаление пробелов в начале и конце

        return words, ru_words, en_words, ru_comb_words, en_comb_words, numbers
    
    # Применение функции и распаковка значений в 4 отдельные колонки для name
    tqdm.pandas(desc='Processing name')
    text[['n_words', 'n_ru_words', 'n_en_words', 'n_ru_comb_words', 'n_en_comb_words', 'n_numbers']] = text['name'].progress_apply(
        lambda x: pd.Series(text_process(x)))
    
    # Применение функции и распаковка значений в 4 отдельные колонки для description
    tqdm.pandas(desc='Processing description')
    text[['d_words', 'd_ru_words', 'd_en_words', 'd_ru_comb_words', 'd_en_comb_words', 'd_numbers']] = text['description'].progress_apply(
        lambda x: pd.Series(text_process(x)))

Processing name: 100%|██████████| 2252569/2252569 [03:01<00:00, 12392.84it/s]
Processing description: 100%|██████████| 2252569/2252569 [14:03<00:00, 2669.55it/s] 


In [9]:
if CONFIG.get('cat_fit', False):
    def cat_fit(train_df, test_df, attributes_df):
        
        def compute_cat_fit(variantid1, variantid2):
            
            cat1 = json.loads(attributes_df.at[variantid1, 'categories'])
            cat2 = json.loads(attributes_df.at[variantid2, 'categories'])
                
            common_keys = set(cat1.keys()) & set(cat2.keys())
            
            return (sum(1 for key in common_keys if cat1[key] == cat2[key])) / len(common_keys)
            
        tqdm.pandas(desc="Processing train_df")
        train_df['cat_fit'] = train_df.progress_apply(lambda row: compute_cat_fit(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['cat_fit'] = test_df.progress_apply(lambda row: compute_cat_fit(row['variantid1'], row['variantid2']), axis=1)

        return train_df, test_df
    
    train, test = cat_fit(train, test, attributes)

Processing train_df: 100%|██████████| 1250672/1250672 [00:20<00:00, 62067.78it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:00<00:00, 61795.99it/s]


In [10]:
if CONFIG.get('cat2', False):
    def cat2(train_df, test_df, attributes_df):
        
        def compute_cat2(variantid1, variantid2, u_cats=unique_cats2):
            
            cat1 = json.loads(attributes_df.at[variantid1, 'categories'])
            cat2 = json.loads(attributes_df.at[variantid2, 'categories'])
            
            if cat1['2'] == cat2['2']:
                if cat1['2'] in u_cats:
                    return u_cats[cat1['2']]
                else:
                    return -2
            else:
                return -1
            
        tqdm.pandas(desc="Processing train_df")
        train_df['cat2'] = train_df.progress_apply(lambda row: compute_cat2(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['cat2'] = test_df.progress_apply(lambda row: compute_cat2(row['variantid1'], row['variantid2']), axis=1)

        return train_df, test_df

    train, test = cat2(train, test, attributes)

Processing train_df: 100%|██████████| 1250672/1250672 [00:18<00:00, 68873.15it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:00<00:00, 65723.84it/s]


In [11]:
if CONFIG.get('cat3', False):
    def cat3(train_df, test_df, attributes_df):
        
        def compute_cat3(variantid1, variantid2, u_cats=unique_cats3):
            
            cat1 = json.loads(attributes_df.at[variantid1, 'categories'])
            cat2 = json.loads(attributes_df.at[variantid2, 'categories'])
            
            if cat1['3'] == cat2['3']:
                if cat1['3'] in u_cats:
                    return u_cats[cat1['3']]
                else:
                    return -2
            else:
                return -1
            
        tqdm.pandas(desc="Processing train_df")
        train_df['cat3'] = train_df.progress_apply(lambda row: compute_cat3(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['cat3'] = test_df.progress_apply(lambda row: compute_cat3(row['variantid1'], row['variantid2']), axis=1)

        return train_df, test_df

    train, test = cat3(train, test, attributes)

Processing train_df: 100%|██████████| 1250672/1250672 [00:18<00:00, 68402.19it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:00<00:00, 65289.37it/s]


In [12]:
if CONFIG.get('jac_attrs', False):
    def jac_attrs(train_df, test_df, attributes_df):
        
        def compute_jac_attrs(variantid1, variantid2):
            
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            keys1 = set(attrs1.keys())
            keys2 = set(attrs2.keys())
            
            if len(keys1) == 0 or len(keys2) == 0:
                if len(keys1) == 0 and len(keys2) == 0:
                    return -2
                else:
                    return -1
                
            intersection = len(keys1.intersection(keys2))
            union = len(keys1.union(keys2))
            
            return intersection / union if union != 0 else 0

        tqdm.pandas(desc="Processing train_df")
        train_df['jac_attrs'] = train_df.progress_apply(lambda row: compute_jac_attrs(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['jac_attrs'] = test_df.progress_apply(lambda row: compute_jac_attrs(row['variantid1'], row['variantid2']), axis=1)

        return train_df, test_df

    train, test = jac_attrs(train, test, attributes)

Processing train_df: 100%|██████████| 1250672/1250672 [01:18<00:00, 15898.27it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 34220.67it/s]


In [13]:
if CONFIG.get('jac_vals', False):
    def jac_vals(train_df, test_df, attributes_df):
        
        def compute_jac_vals(variantid1, variantid2):
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))

            if not common_keys:
                return -2  

            jaccard_scores = []

            for key in common_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                intersection = len(set_values1.intersection(set_values2))
                union = len(set_values1.union(set_values2))
                
                jaccard_score = intersection / union if union != 0 else 0
                jaccard_scores.append(jaccard_score)
            
            return sum(jaccard_scores) / len(jaccard_scores)

        tqdm.pandas(desc="Processing train_df")
        train_df['jac_vals'] = train_df.progress_apply(lambda row: compute_jac_vals(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['jac_vals'] = test_df.progress_apply(lambda row: compute_jac_vals(row['variantid1'], row['variantid2']), axis=1)

        return train_df, test_df

    train, test = jac_vals(train, test, attributes)

Processing train_df: 100%|██████████| 1250672/1250672 [00:41<00:00, 30025.25it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 30993.15it/s]


In [14]:
if CONFIG.get('jac_num_vals', False):
    def jac_num_vals(train_df, test_df, attributes_df):
        
        def compute_jac_num_vals(variantid1, variantid2):
            
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))
            
            if not common_keys:
                return -2  
            
            num_common_keys = set()
            
            for key in common_keys:
                if len(attrs1[key]) == len(attrs2[key]):
                    digit = True
                    for val1, val2 in zip(attrs1[key], attrs2[key]):
                        if val1.isdigit() and val2.isdigit():
                            continue
                        else:
                            digit = False
                    if digit:
                        num_common_keys.add(key)
            
            if len(num_common_keys) == 0:
                return -3
            
            jaccard_scores = []

            for key in num_common_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                intersection = len(set_values1.intersection(set_values2))
                union = len(set_values1.union(set_values2))
                
                jaccard_score = intersection / union if union != 0 else 0
                
                jaccard_scores.append(jaccard_score)
            
            return sum(jaccard_scores) / len(jaccard_scores)

        tqdm.pandas(desc="Processing train_df")
        train_df['jac_num_vals'] = train_df.progress_apply(
            lambda row: compute_jac_num_vals(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['jac_num_vals'] = test_df.progress_apply(
            lambda row: compute_jac_num_vals(row['variantid1'], row['variantid2']), axis=1)

        return train_df, test_df

    train, test = jac_num_vals(train, test, attributes)

Processing train_df: 100%|██████████| 1250672/1250672 [00:39<00:00, 31940.62it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 30404.40it/s]


In [15]:
if CONFIG.get('jac_sev_vals', False):
    def jac_sev_vals(train_df, test_df, attributes_df):
        
        def compute_jac_sev_vals(variantid1, variantid2):
            
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))
            
            if not common_keys:
                return -2  
            
            sev_common_keys = set()
            
            for key in common_keys:
                if len(attrs1[key]) > 1 or len(attrs2[key]) > 1:
                        sev_common_keys.add(key)
            
            if len(sev_common_keys) == 0:
                return -3
            
            jaccard_scores = []

            for key in sev_common_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                intersection = len(set_values1.intersection(set_values2))
                union = len(set_values1.union(set_values2))
                
                jaccard_score = intersection / union if union != 0 else 0
                
                jaccard_scores.append(jaccard_score)
            
            return sum(jaccard_scores) / len(jaccard_scores)

        tqdm.pandas(desc="Processing train_df")
        train_df['jac_sev_vals'] = train_df.progress_apply(
            lambda row: compute_jac_sev_vals(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['jac_sev_vals'] = test_df.progress_apply(
            lambda row: compute_jac_sev_vals(row['variantid1'], row['variantid2']), axis=1)

        return train_df, test_df

    train, test = jac_sev_vals(train, test, attributes)

Processing train_df: 100%|██████████| 1250672/1250672 [00:34<00:00, 36377.87it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 37196.39it/s]


In [16]:
if CONFIG.get('jac_vital_vals', False):
    def jac_vital_vals(train_df, test_df, attributes_df):
        
        def compute_jac_vital_vals(variantid1, variantid2, v_keys=vital_keys):
            
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))

            if not common_keys:
                return -2  
            
            common_vital_keys = set()
            
            for key in common_keys:
                if key in v_keys:
                    common_vital_keys.add(key)
            
            if len(common_vital_keys) == 0:
                return -3
                    
            jaccard_scores = []

            for key in common_vital_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                intersection = len(set_values1.intersection(set_values2))
                union = len(set_values1.union(set_values2))
                
                jaccard_score = intersection / union if union != 0 else 0
                jaccard_scores.append(jaccard_score)
            
            return sum(jaccard_scores) / len(jaccard_scores)

        tqdm.pandas(desc="Processing train_df")
        train_df['jac_vital_vals'] = train_df.progress_apply(
            lambda row: compute_jac_vital_vals(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['jac_vital_vals'] = test_df.progress_apply(
            lambda row: compute_jac_vital_vals(row['variantid1'], row['variantid2']), axis=1)

        return train_df, test_df

    train, test = jac_vital_vals(train, test, attributes)

Processing train_df: 100%|██████████| 1250672/1250672 [01:33<00:00, 13321.86it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:03<00:00, 14750.54it/s]


In [17]:
if CONFIG.get('jac_minor_vals', False):
    def jac_minor_vals(train_df, test_df, attributes_df):
        
        def compute_jac_minor_vals(variantid1, variantid2, m_keys=minor_keys):
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            
            common_keys = set(attrs1.keys()).intersection(set(attrs2.keys()))

            if not common_keys:
                return -2  
            
            common_minor_keys = set()
            
            for key in common_keys:
                if key in m_keys:
                    common_minor_keys.add(key)
            
            if len(common_minor_keys) == 0:
                return -3
                    
            jaccard_scores = []

            for key in common_minor_keys:
                set_values1 = set(attrs1[key])
                set_values2 = set(attrs2[key])
    
                intersection = len(set_values1.intersection(set_values2))
                union = len(set_values1.union(set_values2))
                
                jaccard_score = intersection / union if union != 0 else 0
                jaccard_scores.append(jaccard_score)
            
            return sum(jaccard_scores) / len(jaccard_scores)

        tqdm.pandas(desc="Processing train_df")
        train_df['jac_minor_vals'] = train_df.progress_apply(lambda row: compute_jac_minor_vals(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['jac_minor_vals'] = test_df.progress_apply(lambda row: compute_jac_minor_vals(row['variantid1'], row['variantid2']), axis=1)

        return train_df, test_df

    train, test = jac_minor_vals(train, test, attributes)

Processing train_df: 100%|██████████| 1250672/1250672 [01:57<00:00, 10679.90it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:04<00:00, 11876.24it/s]


In [18]:
if CONFIG.get('diff_attrs', False):
    def diff_attrs(train_df, test_df, attributes_df):
        
        def compute_diff_attrs(variantid1, variantid2):
            attrs1 = json.loads(attributes_df.at[variantid1, 'characteristic_attributes_mapping'])
            attrs2 = json.loads(attributes_df.at[variantid2, 'characteristic_attributes_mapping'])
            
            return (abs(len(attrs1) - len(attrs2))) / max(len(attrs1), len(attrs2))
        
        tqdm.pandas(desc="Processing train_df")
        train_df['diff_attrs'] = train_df.progress_apply(lambda row: compute_diff_attrs(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['diff_attrs'] = test_df.progress_apply(lambda row: compute_diff_attrs(row['variantid1'], row['variantid2']), axis=1)

        return train_df, test_df
    
    train, test = diff_attrs(train, test, attributes)

Processing train_df: 100%|██████████| 1250672/1250672 [00:28<00:00, 44495.14it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 45611.05it/s]


In [19]:
if CONFIG.get('n_len_diff', False):
    def n_len_diff(train_df, test_df, text_df):
        
        def compute_n_len_diff(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'name']
            name2 = text_df.at[variantid2, 'name']
                
            if len(name1.replace(' ', '')) == 0  or len(name2.replace(' ', '')) == 0:
                if len(name1.replace(' ', '')) == 0  and len(name2.replace(' ', '')) == 0:
                    return -2
                else:
                    return -1
            
            return abs(len(name1) - len(name2)) / max(len(name1), len(name2))

        tqdm.pandas(desc="Processing train_df")
        train_df['n_len_diff'] = train_df.progress_apply(lambda row: compute_n_len_diff(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['n_len_diff'] = test_df.progress_apply(lambda row: compute_n_len_diff(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df

    train, test = n_len_diff(train, test, text)

Processing train_df: 100%|██████████| 1250672/1250672 [00:17<00:00, 73447.58it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:00<00:00, 80814.32it/s]


In [20]:
if CONFIG.get('d_len_diff', False):
    def d_len_diff(train_df, test_df, text_df):
        
        def compute_d_len_diff(variantid1, variantid2):
            
            desc1 = text_df.at[variantid1, 'description']
            desc2 = text_df.at[variantid2, 'description']
            
            if desc1 is None or desc2 is None:
                if desc1 is None and desc2 is None:
                    return -2
                else:
                    return -1
                
            if len(desc1.replace(' ', '')) == 0  or len(desc2.replace(' ', '')) == 0:
                if len(desc1.replace(' ', '')) == 0  and len(desc2.replace(' ', '')) == 0:
                    return -2
                else:
                    return -1
                
            return abs(len(desc1) - len(desc2)) / max(len(desc1), len(desc2))

        tqdm.pandas(desc="Processing train_df")
        train_df['d_len_diff'] = train_df.progress_apply(lambda row: compute_d_len_diff(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['d_len_diff'] = test_df.progress_apply(lambda row: compute_d_len_diff(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df

    train, test = d_len_diff(train, test, text)

Processing train_df: 100%|██████████| 1250672/1250672 [00:51<00:00, 24401.03it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:00<00:00, 55885.16it/s]


In [21]:
if CONFIG.get('n_lev', False):
    def n_lev(train_df, test_df, text_df):
        
        def compute_n_lev(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'name']
            name2 = text_df.at[variantid2, 'name']

            if len(name1.replace(' ', '')) == 0  or len(name2.replace(' ', '')) == 0:
                if len(name1.replace(' ', '')) == 0  and len(name2.replace(' ', '')) == 0:
                    return -2
                else:
                    return -1
                
            return Levenshtein.distance(name1, name2) / max(len(name1), len(name2))
    
        tqdm.pandas(desc="Processing train_df")
        train_df['n_lev'] = train_df.progress_apply(lambda row: compute_n_lev(row['variantid1'], row['variantid2']), axis=1)
    
        tqdm.pandas(desc="Processing test_df")
        test_df['n_lev'] = test_df.progress_apply(lambda row: compute_n_lev(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df
    
    train, test = n_lev(train, test, text)

Processing train_df: 100%|██████████| 1250672/1250672 [00:16<00:00, 76907.16it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:00<00:00, 74853.62it/s]


In [22]:
if CONFIG.get('d_lev', False):
    def d_lev(train_df, test_df, text_df):
        
        def compute_d_lev(variantid1, variantid2):
            
            desc1 = text_df.at[variantid1, 'description']
            desc2 = text_df.at[variantid2, 'description']
            
            if desc1 is None or desc2 is None:
                if desc1 is None and desc2 is None:
                    return -2
                else:
                    return -1
            
            if len(desc1.replace(' ', '')) == 0  or len(desc2.replace(' ', '')) == 0:
                if len(desc1.replace(' ', '')) == 0  and len(desc2.replace(' ', '')) == 0:
                    return -2
                else:
                    return -1
                
            return Levenshtein.distance(desc1, desc2) / max(len(desc1), len(desc2))
    
        tqdm.pandas(desc="Processing train_df")
        train_df['d_lev'] = train_df.progress_apply(lambda row: compute_d_lev(row['variantid1'], row['variantid2']), axis=1)
    
        tqdm.pandas(desc="Processing test_df")
        test_df['d_lev'] = test_df.progress_apply(lambda row: compute_d_lev(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df
    
    train, test = d_lev(train, test, text)

Processing train_df: 100%|██████████| 1250672/1250672 [01:06<00:00, 18757.73it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:04<00:00, 10095.42it/s]


In [23]:
if CONFIG.get('n_jac_symbs', False):
    def n_jac_symbs(train_df, test_df, text_df):
        
        def compute_n_jac_symbs(variantid1, variantid2):
            
            symbols1 = set(text_df.at[variantid1, 'name'])
            symbols2 = set(text_df.at[variantid2, 'name'])
            
            if len(symbols1) == 0 or len(symbols2) == 0:
                if len(symbols1) == 0 and len(symbols2) == 0:
                    return -2
                else:
                    return -1
            
            intersection = len(symbols1.intersection(symbols2))
            union = len(symbols1.union(symbols2))
            
            return intersection / union if union != 0 else 0
        
        tqdm.pandas(desc="Processing train_df")
        train_df['n_jac_symbs'] = train_df.progress_apply(lambda row: compute_n_jac_symbs(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['n_jac_symbs'] = test_df.progress_apply(lambda row: compute_n_jac_symbs(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df
    
    train, test = n_jac_symbs(train, test, text)

Processing train_df: 100%|██████████| 1250672/1250672 [00:38<00:00, 32578.91it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 26252.62it/s]


In [24]:
if CONFIG.get('d_jac_symbs', False):
    def d_jac_symbs(train_df, test_df, text_df):
        
        def compute_d_jac_symbs(variantid1, variantid2):
            
            desc1 = text_df.at[variantid1, 'description']
            desc2 = text_df.at[variantid2, 'description']
            
            if desc1 is None or desc2 is None:
                if desc1 is None and desc2 is None:
                    return -2
                else:
                    return -1
            
            if len(desc1) == 0 or len(desc2) == 0:
                if len(desc1) == 0 and len(desc2) == 0: 
                    return -2
                else:
                    return -1
            
            
            symbols1 = set(desc1)
            symbols2 = set(desc2)
            
            intersection = len(symbols1.intersection(symbols2))
            union = len(symbols1.union(symbols2))
            
            return intersection / union if union != 0 else 0
        
        tqdm.pandas(desc="Processing train_df")
        train_df['d_jac_symbs'] = train_df.progress_apply(lambda row: compute_d_jac_symbs(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['d_jac_symbs'] = test_df.progress_apply(lambda row: compute_d_jac_symbs(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df
    
    train, test = d_jac_symbs(train, test, text)

Processing train_df: 100%|██████████| 1250672/1250672 [02:04<00:00, 10053.55it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:04<00:00, 10307.43it/s]


In [25]:
if CONFIG.get('n_jac', False):
    def n_jac(train_df, test_df, text_df):
        
        def compute_jac_value(tokens1, tokens2):
            
            if len(tokens1) == 0  or len(tokens2) == 0 :
                if len(tokens1) == 0  and len(tokens2) == 0 :
                    return -2
                else:
                    return -1
            
            intersection = len(tokens1.intersection(tokens2))
            union = len(tokens1.union(tokens2))
            
            return intersection / union if union > 0 else 0
        
        def compute_n_jac(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'name']
            name2 = text_df.at[variantid2, 'name']
            
            if len(name1.replace(' ', '')) == 0  or len(name2.replace(' ', '')) == 0 :
                if len(name1.replace(' ', '')) == 0  and len(name2.replace(' ', '')) == 0 :
                    return -2, -2, -2, -2, -2, -2
                else:
                    return -1, -1, -1 , -1, -1, -1
                
            words1 = set(text_df.at[variantid1, 'n_words'].split())
            words2 = set(text_df.at[variantid2, 'n_words'].split())
            n_jac_words = compute_jac_value(words1, words2)
            
            ru_words1 = set(text_df.at[variantid1, 'n_ru_words'].split())
            ru_words2 = set(text_df.at[variantid2, 'n_ru_words'].split())
            n_jac_ru_words = compute_jac_value(ru_words1, ru_words2)
            
            en_words1 = set(text_df.at[variantid1, 'n_en_words'].split())
            en_words2 = set(text_df.at[variantid2, 'n_en_words'].split())
            n_jac_en_words = compute_jac_value(en_words1, en_words2)
            
            ru_comb_words1 = set(text_df.at[variantid1, 'n_ru_comb_words'].split())
            ru_comb_words2 = set(text_df.at[variantid2, 'n_ru_comb_words'].split())
            n_jac_ru_comb_words = compute_jac_value(ru_comb_words1, ru_comb_words2)
            
            en_comb_words1 = set(text_df.at[variantid1, 'n_en_comb_words'].split())
            en_comb_words2 = set(text_df.at[variantid2, 'n_en_comb_words'].split())
            n_jac_en_comb_words = compute_jac_value(en_comb_words1, en_comb_words2)
            
            numbers1 = set(text_df.at[variantid1, 'n_numbers'].split())
            numbers2 = set(text_df.at[variantid2, 'n_numbers'].split())
            n_jac_numbers = compute_jac_value(numbers1, numbers2)
        
            return n_jac_words, n_jac_ru_words, n_jac_en_words, n_jac_ru_comb_words, n_jac_en_comb_words, n_jac_numbers
    
        tqdm.pandas(desc='Processing train')
        train_df[['n_jac_words', 'n_jac_ru_words', 'n_jac_en_words', 'n_jac_ru_comb_words', 'n_jac_en_comb_words', 'n_jac_numbers']] = train_df.progress_apply(lambda row: pd.Series(compute_n_jac(row['variantid1'], row['variantid2'])), axis=1)
        
        tqdm.pandas(desc='Processing test')
        test_df[['n_jac_words', 'n_jac_ru_words', 'n_jac_en_words', 'n_jac_ru_comb_words', 'n_jac_en_comb_words', 'n_jac_numbers']] = test_df.progress_apply(lambda row: pd.Series(compute_n_jac(row['variantid1'], row['variantid2'])), axis=1)
        
        return train_df, test_df
    
    train, test = n_jac(train, test, text)

Processing train: 100%|██████████| 1250672/1250672 [04:55<00:00, 4238.61it/s] 
Processing test: 100%|██████████| 49620/49620 [00:05<00:00, 9394.32it/s] 


In [26]:
if CONFIG.get('d_jac', False):
    def d_jac(train_df, test_df, text_df):
        
        def compute_jac_value(tokens1, tokens2):
            
            if len(tokens1) == 0  or len(tokens2) == 0 :
                if len(tokens1) == 0  and len(tokens2) == 0 :
                    return -2
                else:
                    return -1
                
            intersection = len(tokens1.intersection(tokens2))
            union = len(tokens1.union(tokens2))
            
            return intersection / union if union > 0 else 0
        
        def compute_d_jac(variantid1, variantid2):
            
            desc1 = text_df.at[variantid1, 'description']
            desc2 = text_df.at[variantid2, 'description']
            
            if desc1 is None or desc2 is None:
                if desc1 is None and desc2 is None:
                    return -2, -2, -2, -2, -2, -2
                else:
                    return -1, -1, -1 , -1, -1, -1
            
            if len(desc1) == 0  or len(desc2) == 0 :
                if len(desc1) == 0  and len(desc2) == 0:
                    return -2, -2, -2, -2, -2, -2
                else:
                    return -1, -1, -1 , -1, -1, -1
                
            words1 = set(text_df.at[variantid1, 'd_words'].split())
            words2 = set(text_df.at[variantid2, 'd_words'].split())
            d_jac_words = compute_jac_value(words1, words2)
            
            ru_words1 = set(text_df.at[variantid1, 'd_ru_words'].split())
            ru_words2 = set(text_df.at[variantid2, 'd_ru_words'].split())
            d_jac_ru_words = compute_jac_value(ru_words1, ru_words2)
            
            en_words1 = set(text_df.at[variantid1, 'd_en_words'].split())
            en_words2 = set(text_df.at[variantid2, 'd_en_words'].split())
            d_jac_en_words = compute_jac_value(en_words1, en_words2)
            
            ru_comb_words1 = set(text_df.at[variantid1, 'd_ru_comb_words'].split())
            ru_comb_words2 = set(text_df.at[variantid2, 'd_ru_comb_words'].split())
            d_jac_ru_comb_words = compute_jac_value(ru_comb_words1, ru_comb_words2)
            
            en_comb_words1 = set(text_df.at[variantid1, 'd_en_comb_words'].split())
            en_comb_words2 = set(text_df.at[variantid2, 'd_en_comb_words'].split())
            d_jac_en_comb_words = compute_jac_value(en_comb_words1, en_comb_words2)
            
            numbers1 = set(text_df.at[variantid1, 'd_numbers'].split())
            numbers2 = set(text_df.at[variantid2, 'd_numbers'].split())
            d_jac_numbers = compute_jac_value(numbers1, numbers2)
        
            return d_jac_words, d_jac_ru_words, d_jac_en_words, d_jac_ru_comb_words, d_jac_en_comb_words, d_jac_numbers
    
        tqdm.pandas(desc='Processing train')
        train_df[['d_jac_words', 'd_jac_ru_words', 'd_jac_en_words', 'd_jac_ru_comb_words', 'd_jac_en_comb_words', 'd_jac_numbers']] = train_df.progress_apply(lambda row: pd.Series(compute_d_jac(row['variantid1'], row['variantid2'])), axis=1)
        
        tqdm.pandas(desc='Processing test')
        test_df[['d_jac_words', 'd_jac_ru_words', 'd_jac_en_words', 'd_jac_ru_comb_words', 'd_jac_en_comb_words', 'd_jac_numbers']] = test_df.progress_apply(lambda row: pd.Series(compute_d_jac(row['variantid1'], row['variantid2'])), axis=1)
        
        return train_df, test_df
    
    train, test = d_jac(train, test, text)

Processing train: 100%|██████████| 1250672/1250672 [06:19<00:00, 3298.43it/s]
Processing test: 100%|██████████| 49620/49620 [00:09<00:00, 5273.12it/s]


In [27]:
if CONFIG.get('n_lev_opers', False):
    def n_lev_opers(train_df, test_df, text_df):
        
        def compute_n_lev_opers(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'name']
            name2 = text_df.at[variantid2, 'name']
            
            if len(name1.replace(' ', '')) == 0  or len(name2.replace(' ', '')) == 0:
                if len(name1.replace(' ', '')) == 0  and len(name2.replace(' ', '')) == 0:
                    return -2, -2
                else:
                    return -1, -1
                
            operations = Levenshtein.editops(name1, name2)
            
            insertions = sum(1 for op in operations if op[0] == 'insert')
            deletions = sum(1 for op in operations if op[0] == 'delete')
            replaces = sum(1 for op in operations if op[0] == 'replace')
            
            n_lev_var = insertions + deletions
            
            max_len = max(len(name1), len(name2))
            
            return n_lev_var / max_len, replaces / max_len
    
        tqdm.pandas(desc="Processing train_df")
        train_df[['n_lev_var', 'n_lev_rep']] = train_df.progress_apply(
            lambda row: pd.Series(compute_n_lev_opers(row['variantid1'], row['variantid2'])), axis=1)
    
        tqdm.pandas(desc="Processing test_df")
        test_df[['n_lev_var', 'n_lev_rep']] = test_df.progress_apply(
            lambda row: pd.Series(compute_n_lev_opers(row['variantid1'], row['variantid2'])), axis=1)
        
        return train_df, test_df
    
    train, test = n_lev_opers(train, test, text)

Processing train_df: 100%|██████████| 1250672/1250672 [01:39<00:00, 12557.76it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:03<00:00, 13977.48it/s]


In [28]:
if CONFIG.get('d_lev_opers', False):
    def d_lev_opers(train_df, test_df, text_df):
        
        def compute_d_lev_opers(variantid1, variantid2):
            
            desc1 = text_df.at[variantid1, 'description']
            desc2 = text_df.at[variantid2, 'description']
            
            if desc1 is None or desc2 is None:
                if desc1 is None and desc2 is None:
                    return -2, -2
                else:
                    return -1, -1
            
            if len(desc1.replace(' ', '')) == 0  or len(desc2.replace(' ', ''))  == 0:
                if len(desc1.replace(' ', '')) == 0  and len(desc2.replace(' ', ''))  == 0:
                    return -2, -2
                else:
                    return -1, -1
                
            operations = Levenshtein.editops(desc1, desc2)
            
            insertions = sum(1 for op in operations if op[0] == 'insert')
            deletions = sum(1 for op in operations if op[0] == 'delete')
            replaces = sum(1 for op in operations if op[0] == 'replace')
            
            d_lev_var = insertions + deletions
            
            max_len = max(len(desc1), len(desc2))
            
            return d_lev_var / max_len, replaces / max_len
    
        tqdm.pandas(desc="Processing train_df")
        train_df[['d_lev_var', 'd_lev_rep']] = train_df.progress_apply(
            lambda row: pd.Series(compute_d_lev_opers(row['variantid1'], row['variantid2'])), axis=1)
    
        tqdm.pandas(desc="Processing test_df")
        test_df[['d_lev_var', 'd_lev_rep']] = test_df.progress_apply(
            lambda row: pd.Series(compute_d_lev_opers(row['variantid1'], row['variantid2'])), axis=1)
        
        return train_df, test_df
    
    train, test = d_lev_opers(train, test, text)

Processing train_df: 100%|██████████| 1250672/1250672 [04:34<00:00, 4550.70it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:21<00:00, 2277.09it/s]


In [29]:
if CONFIG.get('m_cos', False):
    def m_cos(train_df, test_df, resnet_df):
        
        def compute_m_cos(variantid1, variantid2):
            
            embedding_1 = resnet_df.at[variantid1, 'main_pic_embeddings_resnet_v1'][0]
            embedding_2 = resnet_df.at[variantid2, 'main_pic_embeddings_resnet_v1'][0]
            
            return cosine_similarity([embedding_1], [embedding_2])[0,0]

        tqdm.pandas(desc="Processing train_df")
        train_df['m_cos'] = train_df.progress_apply(lambda row: compute_m_cos(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['m_cos'] = test_df.progress_apply(lambda row: compute_m_cos(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df
        
    train, test = m_cos(train, test, resnet)

Processing train_df: 100%|██████████| 1250672/1250672 [08:22<00:00, 2489.12it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:20<00:00, 2384.55it/s]


In [30]:
if CONFIG.get('m_evklid', False):
    def m_evklid(train_df, test_df, resnet_df):
        
        def compute_m_evklid(variantid1, variantid2):
            
            embedding_1 = resnet_df.at[variantid1, 'main_pic_embeddings_resnet_v1'][0]
            embedding_2 = resnet_df.at[variantid2, 'main_pic_embeddings_resnet_v1'][0]
            
            return euclidean_distances([embedding_1], [embedding_2])[0, 0]

        tqdm.pandas(desc="Processing train_df")
        train_df['m_evklid'] = train_df.progress_apply(lambda row: compute_m_evklid(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['m_evklid'] = test_df.progress_apply(lambda row: compute_m_evklid(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df
        
    train, test = m_evklid(train, test, resnet)

Processing train_df: 100%|██████████| 1250672/1250672 [05:39<00:00, 3686.86it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:14<00:00, 3392.58it/s]


In [31]:
if CONFIG.get('e_jac', False):
    def e_jac(train_df, test_df, resnet_df):
        
        def calculate_e_jac(v1, v2):
            embs1 = resnet_df.at[v1, 'pic_embeddings_resnet_v1']
            embs2 = resnet_df.at[v2, 'pic_embeddings_resnet_v1'] 
            
            if embs1 is None or embs2 is None:
                if embs1 is None and embs2 is None:
                    return -2
                else:
                    return -1
            
            set1 = set(tuple(emb) for emb in embs1)
            set2 = set(tuple(emb) for emb in embs2)
            
            intersection = set1.intersection(set2)
            union = set1.union(set2)
            
            if not union:
                return 0

            return len(intersection) / len(union)

        tqdm.pandas(desc="Processing train_df")
        train_df['e_jac'] = train_df.progress_apply(lambda row: calculate_e_jac(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['e_jac'] = test_df.progress_apply(lambda row: calculate_e_jac(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df
        
    train, test = e_jac(train, test, resnet)

Processing train_df: 100%|██████████| 1250672/1250672 [08:13<00:00, 2533.29it/s] 
Processing test_df: 100%|██████████| 49620/49620 [00:08<00:00, 6023.84it/s]


In [32]:
if CONFIG.get('e_diff', False):
    def e_diff(train_df, test_df, resnet_df):
        
        def count_e_diff(variantid1, variantid2):
            emb1 = resnet_df.at[variantid1, 'pic_embeddings_resnet_v1']
            emb2 = resnet_df.at[variantid2, 'pic_embeddings_resnet_v1']
            
            if emb1 is None or emb2 is None:
                if emb1 is None and emb2 is None:
                    return -2
                else:
                    return -1
            
            return abs(len(emb1) - len(emb2)) / max(len(emb1), len(emb2))

        tqdm.pandas(desc="Processing train_df")
        train_df['e_diff'] = train_df.progress_apply(lambda row: count_e_diff(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['e_diff'] = test_df.progress_apply(lambda row: count_e_diff(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df
        
    train, test = e_diff(train, test, resnet)

Processing train_df: 100%|██████████| 1250672/1250672 [00:23<00:00, 53836.14it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:00<00:00, 79253.81it/s]


In [33]:
if CONFIG.get('m_ent_diff', False):
    def m_ent_diff(train_df, test_df, resnet_df):
        
        def compute_entropy_diff(embedding):
            
            norm_embedding = np.abs(embedding) / np.sum(np.abs(embedding))
            
            return entropy(norm_embedding)
    
        def compute_m_ent_diff(variantid1, variantid2):
            embedding1 = resnet_df.at[variantid1, 'main_pic_embeddings_resnet_v1'][0]
            embedding2 = resnet_df.at[variantid2, 'main_pic_embeddings_resnet_v1'][0]
    
            entropy1 = compute_entropy_diff(embedding1)
            entropy2 = compute_entropy_diff(embedding2)
            
            return abs(entropy1 - entropy2)
        
        tqdm.pandas(desc="Processing train_df")
        train_df['m_ent_diff'] = train_df.progress_apply(lambda row: compute_m_ent_diff(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['m_ent_diff'] = test_df.progress_apply(lambda row: compute_m_ent_diff(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df
    
    train, test = m_ent_diff(train, test, resnet)

Processing train_df: 100%|██████████| 1250672/1250672 [06:42<00:00, 3109.11it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:16<00:00, 3076.62it/s]


In [34]:
if CONFIG.get('e_avg_cos', False):
    def e_avg_cos(train_df, test_df, resnet_df):
        
        def average_embedding(embeddings):
            return np.mean(embeddings, axis=0)

        def calculate_e_avg_cos(v1, v2):
            emb1 = resnet_df.at[v1, 'pic_embeddings_resnet_v1']
            emb2 = resnet_df.at[v2, 'pic_embeddings_resnet_v1'] 
            
            if emb1 is None or emb2 is None:
                if emb1 is None and emb2 is None:
                    return -2
                else:
                    return -1
            
            return cosine_similarity([average_embedding(emb1)], [average_embedding(emb2)])[0, 0]
        
        tqdm.pandas(desc="Processing train_df")
        train_df['e_avg_cos'] = train_df.progress_apply(lambda row: calculate_e_avg_cos(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['e_avg_cos'] = test_df.progress_apply(lambda row: calculate_e_avg_cos(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df
        
    train, test = e_avg_cos(train, test, resnet)

Processing train_df: 100%|██████████| 1250672/1250672 [03:19<00:00, 6269.16it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:07<00:00, 6401.43it/s]


In [35]:
wrong_indexes = []
for index, row in tqdm(train.iterrows(), total=len(train)):
    if row['target'] == 1:
        continue
    
    v1 = row['variantid1']
    v2 = row['variantid2']
    
    name1 = text.loc[v1, 'name']
    name2 = text.loc[v2, 'name']
    
    desc1 = text.loc[v1, 'description']
    desc2 = text.loc[v2, 'description']
    
    if desc1 is None:
        desc1 = ''
    if desc2 is None:
        desc2 = ''
        
    e = row['e_jac']
    m_cos = row['m_cos']
    jac_vals = row['jac_vals']
    
    if name1.replace(' ', '') == name2.replace(' ', '') or name1 == '' or name2 == '':
        if desc1.replace(' ', '') == desc2.replace(' ', '') or desc1 == '' or desc2 == '':
            if jac_vals == 1:
                if m_cos == 1:
                    if e == 1 or e == -1 or e == -2:
                        wrong_indexes.append(index)
                        
train.loc[wrong_indexes, 'target'] = 1

100%|██████████| 1250672/1250672 [01:18<00:00, 15996.65it/s]


In [36]:
if CONFIG.get('n_jac', False):
    def n_jac(train_df, test_df, text_df):
        # Функция для создания биграмм из текста
        def create_bigrams(text):
            bigrams = []
            for i in range(len(text) - 1):
                bigrams.append((text[i], text[i + 1]))
            return bigrams
        
        # Функция для фильтрации биграмм
        def filter_bigrams(bigrams):
            return [(w1, w2) for (w1, w2) in bigrams if re.match(r'\d+', w1)]
        
        def compute_n_jac(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'name']
            name2 = text_df.at[variantid2, 'name']

            
            words1 = re.sub(r'[^\w\s]', ' ', name1)
            words1 = words1.replace('_', ' ')
            words1 = re.sub(r'\s+', ' ', words1)
            words1 = words1.strip().split()
            
            words2 = re.sub(r'[^\w\s]', ' ', name2)
            words2 = words2.replace('_', ' ')
            words2 = re.sub(r'\s+', ' ', words2)
            words2 = words2.strip().split()
                
            # Создаем биграммы из обоих списков слов
            bigrams1 = create_bigrams(words1)
            bigrams2 = create_bigrams(words2)
            
            # Фильтруем биграммы
            filtered_bigrams1 = filter_bigrams(bigrams1)
            filtered_bigrams2 = filter_bigrams(bigrams2)
            
            length = 0
            counter = 0
            
            for w1, w2 in filtered_bigrams1:
                for x1, x2 in filtered_bigrams2:
                    if w2 == x2:
                        length += 1
                        if w1 == x1:
                            counter += 1
            
            if length != 0:
                return counter / length
            else:
                return -3

        tqdm.pandas(desc="Processing train_df")
        train_df['n_trick'] = train_df.progress_apply(lambda row: compute_n_jac(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['n_trick'] = test_df.progress_apply(lambda row: compute_n_jac(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df

    train, test = n_jac(train, test, text)

Processing train_df: 100%|██████████| 1250672/1250672 [00:38<00:00, 32281.24it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 31867.04it/s]


In [37]:
if CONFIG.get('d_jac', False):
    def n_jac(train_df, test_df, text_df):
        # Функция для создания биграмм из текста
        def create_bigrams(text):
            bigrams = []
            for i in range(len(text) - 1):
                bigrams.append((text[i], text[i + 1]))
            return bigrams
        
        # Функция для фильтрации биграмм
        def filter_bigrams(bigrams):
            return [(w1, w2) for (w1, w2) in bigrams if re.match(r'\d+', w1)]
        
        def compute_n_jac(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'description']
            name2 = text_df.at[variantid2, 'description']
                
            if name1 is None or name2 is None:
                if name1 is None and name2 is None:
                    return -2
                else:
                    return -1

            
            
            words1 = re.sub(r'[^\w\s]', ' ', name1)
            words1 = words1.replace('_', ' ')
            words1 = re.sub(r'\s+', ' ', words1)
            words1 = words1.strip().split()
            
            words2 = re.sub(r'[^\w\s]', ' ', name2)
            words2 = words2.replace('_', ' ')
            words2 = re.sub(r'\s+', ' ', words2)
            words2 = words2.strip().split()
                
            # Создаем биграммы из обоих списков слов
            bigrams1 = create_bigrams(words1)
            bigrams2 = create_bigrams(words2)
            
            # Фильтруем биграммы
            filtered_bigrams1 = filter_bigrams(bigrams1)
            filtered_bigrams2 = filter_bigrams(bigrams2)
            
            length = 0
            counter = 0
            
            for w1, w2 in filtered_bigrams1:
                for x1, x2 in filtered_bigrams2:
                    if w2 == x2:
                        length += 1
                        if w1 == x1:
                            counter += 1
            
            if length != 0:
                return counter / length
            else:
                return -3

        tqdm.pandas(desc="Processing train_df")
        train_df['d_trick'] = train_df.progress_apply(lambda row: compute_n_jac(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['d_trick'] = test_df.progress_apply(lambda row: compute_n_jac(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df

    train, test = n_jac(train, test, text)

Processing train_df: 100%|██████████| 1250672/1250672 [07:32<00:00, 2764.13it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:20<00:00, 2462.38it/s]


In [38]:
from tqdm import tqdm

indexes = []
for index, row in tqdm(train.iterrows(), total=len(train)):

    cat1 = json.loads(attributes.at[row['variantid1'], 'categories'])
    cat2 = json.loads(attributes.at[row['variantid2'], 'categories'])

    if cat1['2'] != cat2['2']:
        indexes.append(index)
train.drop(index=indexes, inplace=True)
train.reset_index(drop=True, inplace=True)

100%|██████████| 1250672/1250672 [00:48<00:00, 25581.40it/s]


In [39]:
if CONFIG.get('n_years', False):
    def n_years(train_df, test_df, text_df):
        
        def compute_n_years(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'name']
            name2 = text_df.at[variantid2, 'name']
                
            if len(name1.replace(' ', '')) == 0  or len(name2.replace(' ', '')) == 0:
                if len(name1.replace(' ', '')) == 0  and len(name2.replace(' ', '')) == 0:
                    return -2
                else:
                    return -1
            
                
            words1 = re.sub(r'[^\w\s]', ' ', name1) # Unicode токены
            words1 = words1.replace('_', ' ') # Удаление _ символов
            words1 = ' '.join(re.findall(r'\b\d{4}\b', words1))
            words1 = re.sub(r'\s+', ' ', words1) # Удаление лишних пробелов
            words1 = set(words1.strip().split()) # Удаление пробелов в начале и конце
            
            words2 = re.sub(r'[^\w\s]', ' ', name2) # Unicode токены
            words2 = words2.replace('_', ' ') # Удаление _ символов
            words2 = ' '.join(re.findall(r'\b\d{4}\b', words2))
            words2 = re.sub(r'\s+', ' ', words2) # Удаление лишних пробелов
            words2 = set(words2.strip().split()) # Удаление пробелов в начале и конце
            
            if len(words1) == 0 or len(words2) == 0:
                if len(words1) == 0 and len(words2) == 0:
                    return -4
                else:
                    return -3
            
            intersection = len(words1.intersection(words2))
            union = len(words1.intersection(words2))
            
            
            return intersection / union if union != 0 else 0

        tqdm.pandas(desc="Processing train_df")
        train_df['n_years'] = train_df.progress_apply(lambda row: compute_n_years(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['n_years'] = test_df.progress_apply(lambda row: compute_n_years(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df

    train, test = n_years(train, test, text)

Processing train_df: 100%|██████████| 1250160/1250160 [00:27<00:00, 46042.11it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:01<00:00, 40086.10it/s]


In [40]:
if CONFIG.get('d_years', False):
    def d_years(train_df, test_df, text_df):
        
        def compute_d_years(variantid1, variantid2):
            
            name1 = text_df.at[variantid1, 'description']
            name2 = text_df.at[variantid2, 'description']
                
            if name1 is None or name2 is None:
                if name1 is None and name2 is None:
                    return -2
                else:
                    return -1
                
            if len(name1.replace(' ', '')) == 0  or len(name2.replace(' ', '')) == 0:
                if len(name1.replace(' ', '')) == 0  and len(name2.replace(' ', '')) == 0:
                    return -2
                else:
                    return -1
            
                
            words1 = re.sub(r'[^\w\s]', ' ', name1) # Unicode токены
            words1 = words1.replace('_', ' ') # Удаление _ символов
            words1 = ' '.join(re.findall(r'\b\d{4}\b', words1))
            words1 = re.sub(r'\s+', ' ', words1) # Удаление лишних пробелов
            words1 = set(words1.strip().split()) # Удаление пробелов в начале и конце
            
            words2 = re.sub(r'[^\w\s]', ' ', name2) # Unicode токены
            words2 = words2.replace('_', ' ') # Удаление _ символов
            words2 = ' '.join(re.findall(r'\b\d{4}\b', words2))
            words2 = re.sub(r'\s+', ' ', words2) # Удаление лишних пробелов
            words2 = set(words2.strip().split()) # Удаление пробелов в начале и конце
            
            if len(words1) == 0 or len(words2) == 0:
                if len(words1) == 0 and len(words2) == 0:
                    return -4
                else:
                    return -3
            
            intersection = len(words1.intersection(words2))
            union = len(words1.intersection(words2))
            
            
            return intersection / union if union != 0 else 0

        tqdm.pandas(desc="Processing train_df")
        train_df['d_years'] = train_df.progress_apply(lambda row: compute_d_years(row['variantid1'], row['variantid2']), axis=1)
        
        tqdm.pandas(desc="Processing test_df")
        test_df['d_years'] = test_df.progress_apply(lambda row: compute_d_years(row['variantid1'], row['variantid2']), axis=1)
        
        return train_df, test_df

    train, test = d_years(train, test, text)

Processing train_df: 100%|██████████| 1250160/1250160 [01:39<00:00, 12550.75it/s]
Processing test_df: 100%|██████████| 49620/49620 [00:03<00:00, 16376.42it/s]


In [41]:
train.to_csv('1_my_train.csv', index=False)
test.to_csv('1_my_test.csv', index=False)