In [None]:
!pip install transformers sentencepiece sentence-transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
from transformers import (
    AutoModel,
    AutoTokenizer
)
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
import numpy as np
import random
import sys
import os
import re

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), and configures cuDNN for repeatable kernels.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects child processes: the hash seed of the already running
    # interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can select different algorithms from run to run.
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds once, before any model or data is touched.
seed_everything(seed=42)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from numba import njit


@njit
def cosine_similarity(a, b) -> float:
    """Cosine similarity of two vectors: their dot product over the product of their norms."""
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

In [None]:
# Optional "г." / "гор." / "город" prefix followed by the city name.  The
# trailing \b keeps derived street names ("Санкт-Петербургское шоссе") intact.
_SPB_PATTERN = re.compile(
    r'\b[Гг]?(ор\.*|ород|\.|)\s*[Сс]анкт(-|\s*)[Пп]етербург\b'
)

# Ordered (pattern, replacement) pairs expanding address abbreviations.
# Order matters: every rule runs on the output of the previous ones.
# Patterns are anchored with \b so they only fire at the start of a token and
# never inside a longer word (e.g. "Новгородская" must not yield "город").
_ADDRESS_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'\b[Гг](ор\.|ород\b|\.)', ' город '),
        (r'\b[Уу]л(\.|ица|)\s', ' улица '),
        (r'\b[Пп](роспект|р\.\s|р-кт\s|р\sкт\s|р\-т\s)', ' проспект '),
        (r'\b[Рр](айон|-н\s|-он\s)', ' район '),
        (r'\b[Пп](ос\.\s|оселок\s|осёлок\s)', ' поселок '),
        # NOTE(review): the "ст\s" alternative (i.e. "сст ") is kept from the
        # original table; it looks like a typo but the intent is unclear.
        (r'\b[Сс](танция\s|т\.\s|ст\s)', ' станция '),
        (r'\b[Уу](часток\s|ч\.\s|ч\s)', ' участок '),
        (r'\b[Рр](ека|\.)\s', ' река '),
        (r'\b[Пп](ер\.|ер|ереулок)\s', ' переулок '),
        (r'\s[Дд](ом|\.|)\s', ' дом '),
        (r'\s[Кк](орпус|\.|)\s', ' корпус '),
        (r'\b[Сс]тр(\.|оение|оен|оен\.)\s', ' строение '),
        (r'\b[Дд](ор\.|орога)\s', ' дорога '),
        (r'\b[Лл](итера|\.|ит\.)\s', ' литера '),
        (r'\b[Дд](еревня|ер\.)\s', ' деревня '),
        (r'\b[Бб](ульв\.|уль\.|ульвар)\s', ' бульвар '),
        (r'\b[Нн](аб\.|абереж\.|абережная)\s', ' набережная '),
        (r'\b[Сс](адоводство|адовоство|ад\.|адовод\.)\s', ' садоводство '),
        (r'\bс(\/|\s*|\-|\.|)т(\.|)\s', ' Садоводческое товарищество '),
        (r'\b[Оо](стров|\.)\s', ' остров '),
        (r'\b[Кк](анал|ан\.)\s', ' канал '),
        (r'\b[Жж](\/|\s*|\-|\.|)д(\.|)\s', ' железная дорога '),
        (r' свх\.\s', ' совхоз '),
        (r' потреб\. кооператив садоводства\s',
         ' потребительский кооператив садоводства '),
    )
)

_PUNCTUATION = re.compile(r'[-.?!)(,:\/\\]')
_WHITESPACE = re.compile(r'\s+')


def clean_text(text: str) -> str:
    """Normalize a Russian postal address into lowercase, space-separated words.

    Steps, in order:
      1. drop the city name "Санкт-Петербург" (with an optional "г."/"город"
         prefix);
      2. expand abbreviations ("ул." -> "улица", "пр-кт" -> "проспект", ...)
         using ``_ADDRESS_RULES``;
      3. lowercase, turn punctuation into spaces, collapse runs of whitespace
         and strip both ends.

    Args:
        text: Raw address string.

    Returns:
        The cleaned address; words are separated by exactly one space and
        there is no leading or trailing whitespace.
    """
    cleaned = _SPB_PATTERN.sub(' ', text)
    for pattern, replacement in _ADDRESS_RULES:
        cleaned = pattern.sub(replacement, cleaned)
    cleaned = _PUNCTUATION.sub(' ', cleaned.lower())
    # Collapse only after punctuation removal, otherwise "дом 5." keeps a
    # trailing space and ", " leaves a double space.
    return _WHITESPACE.sub(' ', cleaned).strip()

In [None]:
# Load the pretrained all-MiniLM-L6-v2 sentence encoder and move it to the selected device.
# NOTE(review): the addresses encoded below are Russian — confirm this checkpoint handles them adequately.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').to(device)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
def make_labse(df, col, model=model):
    """Embed every value of ``df[col]`` and return the vectors as a feature table.

    Despite the name, the vectors come from whichever ``model`` is passed; the
    column prefix is fixed to ``MiniLM``.

    Args:
        df: DataFrame holding the text column.
        col: Name of the column whose values are encoded.
        model: Encoder exposing ``encode(sentences, show_progress_bar=...)``;
            defaults to the notebook-level ``model``.

    Returns:
        DataFrame with one row per input row (fresh ``RangeIndex``) and one
        column per embedding dimension, named ``MiniLM_{col}_feature_{i}``.
        An empty DataFrame when ``df[col]`` has no rows.
    """
    df_model = 'MiniLM'
    sentences = df[col].tolist()
    if not sentences:
        # Embedding width is unknown without at least one encoded sentence.
        return pd.DataFrame()
    # One batched call lets the encoder process many sentences per forward
    # pass instead of paying the per-call overhead once per row.
    text_features = np.asarray(model.encode(sentences, show_progress_bar=True))
    feature_names = [
        f'{df_model}_{col}_feature_{i}' for i in range(text_features.shape[1])
    ]
    return pd.DataFrame(text_features, columns=feature_names)

def make_only_one_labse(address, model):
    """Return the embedding of a single address.

    The address is wrapped in a one-element batch for ``model.encode`` and the
    first row of the result is returned.
    """
    rows = [*model.encode([address])]
    return rows[0]


In [None]:
# A reference address and a misspelled variant of it, both run through the cleaner.
address = clean_text('пр-кт Жукова, д. 30А')
same_address = clean_text('жковаааааааааа 30А')

In [None]:
%%time
# Encode both cleaned sample addresses; the timing covers the two encode calls together.
embs = make_only_one_labse(address, model)
same_embs = make_only_one_labse(same_address, model)

CPU times: user 22.2 ms, sys: 0 ns, total: 22.2 ms
Wall time: 21.9 ms


In [None]:
%%time
# Similarity between the reference address and its misspelled variant.
cosine_similarity(embs, same_embs)

CPU times: user 16 µs, sys: 3 µs, total: 19 µs
Wall time: 21.9 µs


0.7678707838058472

In [None]:
import pandas as pd
# Load the buildings table (absolute path — presumably the Colab working directory; adjust when running elsewhere).
df = pd.read_csv('/content/buildings.csv')

  df = pd.read_csv('/content/buildings.csv')


In [None]:
# Peek at the first two rows to check the column layout.
df.head(2)

Unnamed: 0,id,prefix_id,district_id,house,corpus,liter,villa,parcel,full_address,is_updated,is_actual,type,municipality_id,short_address,post_prefix,build_number
0,56343,11132,35,12,,А,,,город пушкин кедринская улица дом 12,True,False,,107.0,"г.Пушкин, Кедринская ул., д. 12",,
1,595,6987,38,4Б,,,,,поселок ушково пляжевая улица дом 4б,True,False,,128.0,"пос. Ушково, Пляжевая ул., д. 4Б",,


In [None]:
# Empty frame that will collect the columns written to disk at the end.
filter_dataset = pd.DataFrame()

In [None]:
# Copy the is_actual flag and the row id from the source table.
filter_dataset['is_actual'] = df['is_actual']
filter_dataset['id'] = df['id']

In [None]:
# Replace one row's full_address with its short_address.  A single .loc
# assignment writes into df itself; chained indexing (df[col][row] = ...)
# may write to a temporary copy and raises SettingWithCopyWarning.
df.loc[149134, 'full_address'] = df.loc[149134, 'short_address']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['full_address'][149134] = df['short_address'][149134]


In [None]:
# Embed every full_address; the result has one column per embedding dimension.
feature_table = make_labse(df, 'full_address')

  0%|          | 0/166645 [00:00<?, ?it/s]

In [None]:
# Register progress_apply on pandas objects.
tqdm.pandas()
# progress_apply passes one address string at a time, so the single-address
# encoder is the right callable; make_labse expects (df, col) and would raise
# TypeError when given a lone string.
filter_dataset['embeddings'] = df['full_address'].progress_apply(
    lambda address: make_only_one_labse(address, model)
)

  0%|          | 0/166645 [00:00<?, ?it/s]

In [None]:
# Persist the flag, id and embeddings columns.
# NOTE(review): 'embeddings' is an object-dtype column, so CSV stores each value as its text repr — confirm the consumer can parse it back, or consider a binary format.
filter_dataset.to_csv('all_mlni_enmbeddings.csv')

In [None]:
# Check row count, dtypes and null counts of the saved table.
filter_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166645 entries, 0 to 166644
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   is_actual   166645 non-null  bool  
 1   id          166645 non-null  int64 
 2   embeddings  166645 non-null  object
dtypes: bool(1), int64(1), object(1)
memory usage: 2.7+ MB
