In [1]:
import elasticsearch
from elasticsearch import Elasticsearch

import os
from dotenv import load_dotenv
from pprint import pprint

# Elasticsearch

In [8]:
# Load settings from a .env file (python-dotenv); the ES_* values are read via os.getenv later on.
load_dotenv()

True

In [11]:
def connect_to_elasticsearch(
        scheme: str,
        host: str,
        port: int,
        username: str,
        password: str,
) -> Elasticsearch:
    """Build an Elasticsearch client for a single node with basic authentication.

    Args:
        scheme: URL scheme of the node (passed through as the ``scheme`` entry).
        host: Host name of the node.
        port: Port of the node.
        username: User name for basic auth.
        password: Password for basic auth.

    Returns:
        A client created with HTTP compression enabled and ``request_timeout=10``.
    """
    node = dict(scheme=scheme, host=host, port=port)
    credentials = (username, password)
    client = Elasticsearch(
        hosts=[node],
        basic_auth=credentials,
        http_compress=True,
        request_timeout=10,
    )
    return client


def index_exists(elastic: Elasticsearch, index_name: str) -> bool:
    """Tell whether ``index_name`` exists on the cluster behind ``elastic``.

    Args:
        elastic: Client used to issue the existence check.
        index_name: Name of the index to look for.

    Returns:
        ``True`` if the client reports the index as existing, ``False`` otherwise.
    """
    # The client may hand back a response object rather than a plain bool;
    # coerce it so the result matches the declared return type.
    return bool(elastic.indices.exists(index=index_name))


def show_mapping(index: str):
    """Pretty-print each mapped field of ``index``: its name, then its definition.

    Uses the module-level ``elastic`` client.

    Args:
        index: Name of the index whose mapping is shown.

    Raises:
        KeyError: If the response has no entry keyed by ``index``.
            NOTE(review): this may happen when ``index`` is an alias or a
            pattern rather than a concrete index name - confirm.
    """
    mapping = elastic.indices.get_mapping(index=index)
    # Read the entry for the index that was asked for, not whatever ES_INDEX
    # happens to be set to in the environment.
    fields = mapping[index]['mappings']['properties']

    for field_name, field_definition in fields.items():
        pprint(field_name)
        pprint(field_definition)

In [12]:
# Name of the index every query below runs against (ES_INDEX environment variable).
index_name = os.getenv('ES_INDEX')

# Shared client, configured entirely from ES_* environment variables.
# NOTE(review): int(os.getenv('ES_PORT')) raises TypeError when ES_PORT is unset.
elastic = connect_to_elasticsearch(
    os.getenv('ES_SCHEME'),
    os.getenv('ES_HOST'),
    int(os.getenv('ES_PORT')),
    os.getenv('ES_USERNAME'),
    os.getenv('ES_PASSWORD')
)

# Stop here if the configured index is missing, before any query is attempted.
assert index_exists(elastic, index_name)

In [177]:
# show_mapping(index_name)

In [124]:
def perform_query(query, limit_names=100) -> dict:
    """Run ``query`` against ``index_name`` and return the top hit.

    Uses the module-level ``elastic`` client and ``index_name``. Besides the
    document source, the hit carries the requested ``fields`` and a
    ``script_names`` script field holding at most ``limit_names`` entries of
    the document's ``data.names`` list.

    Args:
        query: Query body passed to the search call as ``query``.
        limit_names: Maximum number of names the script field returns.

    Returns:
        The first entry of ``response['hits']['hits']``.

    Raises:
        IndexError: If the query matches no document.
    """
    # Script that truncates the stored name list on the server side.
    names_script = (
        "params['_source'].data.names.stream()"
        f".limit({limit_names})"
        ".collect(Collectors.toList())"
    )

    response = elastic.search(
        index=index_name,
        query=query,
        # Only the top hit is used, so do not fetch (and run the script on) more.
        size=1,
        source=True,
        fields=['metadata.id', 'data.collection_name', 'metadata.members_count',
                'template.collection_types'],
        script_fields={
            "script_names": {
                "script": {
                    "source": names_script
                }
            }
        }
    )

    hits = response['hits']['hits']
    if not hits:
        # Same exception type as indexing an empty list, but with a usable message.
        raise IndexError(f'no document in index {index_name!r} matches query {query!r}')

    return hits[0]

In [125]:
def get_collection_hit_by_id(id_: str):
    """Return the top hit of a ``match`` query for ``id_`` on ``metadata.id.keyword``."""
    id_query = {'match': {'metadata.id.keyword': id_}}
    return perform_query(id_query)

# Experiments

In [194]:
import random

# Seed the global RNG used by the random.shuffle calls in the swap experiments below.
# NOTE(review): shuffles share this global state, so their results depend on cell execution order.
random.seed(311) 

In [136]:
def get_normalized_tokenized_tuples(hit: dict) -> list[tuple[str, list[str]]]:
    """Extract ``(normalized_name, tokenized_name)`` pairs from a hit.

    Args:
        hit: Search hit whose ``fields['script_names']`` is a list of records
            with ``normalized_name`` and ``tokenized_name`` keys.

    Returns:
        One pair per record, in the order the records appear.
    """
    pairs = []
    for record in hit['fields']['script_names']:
        pairs.append((record['normalized_name'], record['tokenized_name']))
    return pairs

In [240]:
def show_comparison(before_names: list[str], after_names: list[str]):
    """Print the names pairwise as ``before --->  after`` rows.

    The "before" value is left-aligned in a 20-character column. Pairing
    stops at the end of the shorter list.
    """
    row_template = '{: <20} --->\t{}'
    for original, scrambled in zip(before_names, after_names):
        print(row_template.format(original, scrambled))

In [126]:
def show_collection_names(hit: dict, n : int | None = None):
    hit_f = hit['fields']
    
    print(f"=== collection: {hit_f['data.collection_name'][0]} ({hit_f['metadata.id'][0]}) ===")
    print(f"members count: {hit_f['metadata.members_count'][0]}")
    print(f"collection types: {hit_f['template.collection_types'][1::2]}")
    
    print('\n' + '====='*12 + '\n')
    
    for n_name, t_name in map(lambda r: (r['normalized_name'], r['tokenized_name']), hit_f['script_names']):
        print(f'{n_name: <30} {t_name}\n')

In [129]:
# Fetch the collection with id Q8475074 and list the names returned for it.
c_vegefruits = get_collection_hit_by_id('Q8475074')
show_collection_names(c_vegefruits)

=== collection: Fruit vegetables (Q8475074) ===
members count: 44.0
collection types: ['taxon']


maize                          ['maize']

tomato                         ['tomato']

eggplant                       ['eggplant']

cucumber                       ['cucumber']

pea                            ['pea']

chayote                        ['chayote']

jalapeno                       ['jalapeno']

capsicum                       ['capsicum']

breadfruit                     ['breadfruit']

calabash                       ['calabash']

tomatillo                      ['tomatillo']

luffa                          ['luffa']

waxgourd                       ['wax', 'gourd']

tinda                          ['tinda']

chiqua                         ['chi', 'qua']

cordia                         ['cordia']

fresnochile                    ['fresno', 'chile']

blighiasapida                  ['blighia', 'sapida']

asparagusbean                  ['asparagus', 'bean']

luffaaegyptiaca                [

In [192]:
# Fetch the collection with id Q3244225 and list the names returned for it.
c_pdishes = get_collection_hit_by_id('Q3244225')
show_collection_names(c_pdishes)

=== collection: Potato dishes (Q3244225) ===
members count: 114.0
collection types: ['potato dish']


curlyfries                     ['curly', 'fries']

poutine                        ['poutine']

potatochip                     ['potato', 'chip']

cottagepie                     ['cottage', 'pie']

frenchfries                    ['french', 'fries']

hashbrowns                     ['hash', 'browns']

rosti                          ['rosti']

kapsalon                       ['kapsalon']

aligot                         ['aligot']

knish                          ['knish']

latka                          ['latka']

latke                          ['latke']

tatertots                      ['tater', 'tots']

kugel                          ['kugel']

cepelinai                      ['cepelinai']

lefse                          ['lefse']

colcannon                      ['colcannon']

chipbutty                      ['chip', 'butty']

homefries                      ['home', 'fries']

baeckeoffe      

In [131]:
# Fetch the collection with id Q1204735 and list the names returned for it.
c_lotr = get_collection_hit_by_id('Q1204735')
show_collection_names(c_lotr)

=== collection: Middle-earth characters (Q1204735) ===
members count: 79.0
collection types: ["character from Tolkien's legendarium"]


sauron                         ['sauron']

galadriel                      ['galadriel']

isildur                        ['isildur']

gandalf                        ['gandalf']

morgoth                        ['morgoth']

aragorn                        ['aragorn']

elrond                         ['elrond']

gollum                         ['gollum']

smeagol                        ['smeagol']

elendil                        ['elendil']

arwen                          ['arwen']

gilgalad                       ['gilgalad']

maia                           ['maia']

hobbit                         ['hobbit']

balrog                         ['balrog']

balrogs                        ['balrogs']

vala                           ['vala']

wizard                         ['wizard']

saruman                        ['saruman']

legolas                        ['legola

In [132]:
# Fetch the collection with id Q7485198 and list the names returned for it.
c_snakes = get_collection_hit_by_id('Q7485198')
show_collection_names(c_snakes)

=== collection: Venomous snakes (Q7485198) ===
members count: 35.0
collection types: ['taxon']


boomslang                      ['boomslang']

elapidae                       ['elapidae']

tigersnake                     ['tiger', 'snake']

gaboonviper                    ['gaboon', 'viper']

hydrophiinae                   ['hydrophiinae']

dugite                         ['dugite']

twigsnake                      ['twig', 'snake']

acanthophis                    ['acanthophis']

forestcobra                    ['forest', 'cobra']

lachesismuta                   ['lachesis', 'muta']

azemiops                       ['azemiops']

browntreesnake                 ['brown', 'tree', 'snake']

jamesonsmamba                  ['jamesons', 'mamba']

manybandedkrait                ['manybanded', 'krait']

easterncopperhead              ['eastern', 'copperhead']

bothropsinsularis              ['bothrops', 'insularis']

rhabdophistigrinus             ['rhabdophis', 'tigrinus']

rednapedsnake            

In [187]:
# Fetch the collection with id Q371776 and list the names returned for it.
c_marvel = get_collection_hit_by_id('Q371776')
show_collection_names(c_marvel)

=== collection: Marvel Comics characters (Q371776) ===
members count: 1935.0
collection types: ['fictional character']


moonknight                     ['moon', 'knight']

spiderman                      ['spiderman']

satan                          ['satan']

archangel                      ['archangel']

namor                          ['namor']

submariner                     ['submariner']

ironman                        ['iron', 'man']

tonystark                      ['tony', 'stark']

ironmaniac                     ['iron', 'maniac']

oshtur                         ['oshtur']

adamandeve                     ['adam', 'and', 'eve']

thanos                         ['thanos']

shehulk                        ['shehulk']

hulk                           ['hulk']

blackbolt                      ['black', 'bolt']

utopia                         ['utopia']

deadpool                       ['deadpool']

demon                          ['demon']

kamalakhan                     ['kamala', 'khan']


## Original tokenization scrambling

In [219]:
from copy import copy

In [267]:
def name_tuples_to_bigrams(name_tuples: list[tuple[str, list[str]]]) -> tuple[list[str], list[str]]:
    """Split every multi-token name into a (first token, rest) pair.

    Names with fewer than two tokens are skipped. For names with more than
    two tokens, everything after the first token is concatenated into a
    single right-hand part.

    Args:
        name_tuples: ``(normalized_name, tokenized_name)`` pairs; only the
            token list is used.

    Returns:
        Two parallel lists ``(left_names, right_names)``: the first tokens
        and the matching right-hand parts.
    """
    left_names: list[str] = []
    right_names: list[str] = []
    for _, tokens in name_tuples:
        if len(tokens) < 2:
            continue  # todo: here we could use BigramTokenizer
        head, *tail = tokens
        left_names.append(head)
        # For exactly two tokens the join simply yields the second token.
        right_names.append(''.join(tail))
    return left_names, right_names

In [268]:
def fs_to_zipped(firsts: list[str], seconds: list[str]) -> list[str]:
    """Join the two lists pairwise with a hyphen into a new list.

    Pairing stops at the end of the shorter list.
    """
    return ['-'.join(pair) for pair in zip(firsts, seconds)]

### Top-10 bigram swap

In [269]:
def top10_bigram_swap(c_hit: dict, topn=10):
    """Print the first ``topn`` two-token names before and after shuffling their second tokens.

    Only names tokenized into exactly two tokens are considered. The second
    tokens are shuffled among themselves with the global ``random`` state,
    and the hyphen-joined names are printed side by side.

    Args:
        c_hit: Collection hit as accepted by ``get_normalized_tokenized_tuples``.
        topn: Maximum number of two-token names to use.
    """
    name_tuples = get_normalized_tokenized_tuples(c_hit)
    tokenized_names = [tokens for _, tokens in name_tuples if len(tokens) == 2][:topn]
    if not tokenized_names:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise
        # ValueError; with nothing to compare there is nothing to print.
        return

    firsts, seconds = map(list, zip(*tokenized_names))
    before = fs_to_zipped(firsts, seconds)
    random.shuffle(seconds)
    after = fs_to_zipped(firsts, seconds)
    show_comparison(before, after)

In [271]:
# fruits
top10_bigram_swap(c_vegefruits)

wax-gourd            --->	wax-aegyptiaca
chi-qua              --->	chi-anguria
fresno-chile         --->	fresno-chile
blighia-sapida       --->	blighia-peruviana
asparagus-bean       --->	asparagus-grandis
luffa-aegyptiaca     --->	luffa-sapida
physalis-peruviana   --->	physalis-qua
coccinia-grandis     --->	coccinia-bean
cucumis-anguria      --->	cucumis-gourd
luffa-acutangula     --->	luffa-acutangula


In [272]:
# dishes
top10_bigram_swap(c_pdishes)

curly-fries          --->	curly-potato
potato-chip          --->	potato-fries
cottage-pie          --->	cottage-fries
french-fries         --->	french-tots
hash-browns          --->	hash-pie
tater-tots           --->	tater-browns
chip-butty           --->	chip-butty
home-fries           --->	home-salad
potato-salad         --->	potato-chip
baked-potato         --->	baked-fries


In [273]:
# lotr
top10_bigram_swap(c_lotr)

tom-bombadil         --->	tom-felagund
denethor-ii          --->	denethor-bombadil
frodo-baggins        --->	frodo-baggins
bilbo-baggins        --->	bilbo-turambar
peregrin-took        --->	peregrin-oakenshield
samwise-gamgee       --->	samwise-ii
turin-turambar       --->	turin-took
finrod-felagund      --->	finrod-baggins
grima-wormtongue     --->	grima-wormtongue
thorin-oakenshield   --->	thorin-gamgee


In [274]:
# snakes
top10_bigram_swap(c_snakes)

tiger-snake          --->	tiger-tigrinus
gaboon-viper         --->	gaboon-cobra
twig-snake           --->	twig-krait
forest-cobra         --->	forest-muta
lachesis-muta        --->	lachesis-mamba
jamesons-mamba       --->	jamesons-viper
manybanded-krait     --->	manybanded-snake
eastern-copperhead   --->	eastern-snake
bothrops-insularis   --->	bothrops-insularis
rhabdophis-tigrinus  --->	rhabdophis-copperhead


In [275]:
# marvel
top10_bigram_swap(c_marvel)

moon-knight          --->	moon-bolt
iron-man             --->	iron-khan
tony-stark           --->	tony-stark
iron-maniac          --->	iron-parker
black-bolt           --->	black-man
kamala-khan          --->	kamala-maniac
ms-marvel            --->	ms-knight
peter-parker         --->	peter-rider
doctor-doom          --->	doctor-marvel
ghost-rider          --->	ghost-doom


### Without skipping names that have more than two tokens (n > 2)

In [276]:
def top10_bigram_swap_no_skip(c_hit: dict, topn=10):
    """Like the two-token swap, but also keeps names with more than two tokens.

    Names are split by ``name_tuples_to_bigrams``; the first ``topn``
    right-hand parts are shuffled with the global ``random`` state, and the
    hyphen-joined names are printed before and after.
    """
    left_parts, right_parts = name_tuples_to_bigrams(get_normalized_tokenized_tuples(c_hit))
    left_parts = left_parts[:topn]
    right_parts = right_parts[:topn]

    # Materialise the "before" column first: the shuffle below works in place.
    before = ['-'.join(pair) for pair in zip(left_parts, right_parts)]
    random.shuffle(right_parts)
    after = ['-'.join(pair) for pair in zip(left_parts, right_parts)]

    show_comparison(before, after)

In [277]:
# fruits
top10_bigram_swap_no_skip(c_vegefruits)

wax-gourd            --->	wax-aegyptiaca
chi-qua              --->	chi-grandis
fresno-chile         --->	fresno-anguria
blighia-sapida       --->	blighia-qua
asparagus-bean       --->	asparagus-peruviana
luffa-aegyptiaca     --->	luffa-sapida
new-mexicochile      --->	new-mexicochile
physalis-peruviana   --->	physalis-bean
coccinia-grandis     --->	coccinia-gourd
cucumis-anguria      --->	cucumis-chile


In [278]:
# dishes
top10_bigram_swap_no_skip(c_pdishes)

curly-fries          --->	curly-fries
potato-chip          --->	potato-fries
cottage-pie          --->	cottage-potato
french-fries         --->	french-browns
hash-browns          --->	hash-tots
tater-tots           --->	tater-fries
chip-butty           --->	chip-butty
home-fries           --->	home-pie
potato-salad         --->	potato-salad
baked-potato         --->	baked-chip


In [279]:
# lotr
top10_bigram_swap_no_skip(c_lotr)

tom-bombadil         --->	tom-turambar
denethor-ii          --->	denethor-gamgee
frodo-baggins        --->	frodo-baggins
bilbo-baggins        --->	bilbo-felagund
peregrin-took        --->	peregrin-bombadil
samwise-gamgee       --->	samwise-ii
tuor-andidril        --->	tuor-andidril
turin-turambar       --->	turin-baggins
bard-thebowman       --->	bard-thebowman
finrod-felagund      --->	finrod-took


In [280]:
# snakes
top10_bigram_swap_no_skip(c_snakes)

tiger-snake          --->	tiger-cobra
gaboon-viper         --->	gaboon-insularis
twig-snake           --->	twig-treesnake
forest-cobra         --->	forest-krait
lachesis-muta        --->	lachesis-muta
brown-treesnake      --->	brown-mamba
jamesons-mamba       --->	jamesons-snake
manybanded-krait     --->	manybanded-snake
eastern-copperhead   --->	eastern-viper
bothrops-insularis   --->	bothrops-copperhead


In [281]:
# marvel
top10_bigram_swap_no_skip(c_marvel)

moon-knight          --->	moon-knight
iron-man             --->	iron-doom
tony-stark           --->	tony-stark
iron-maniac          --->	iron-andeve
adam-andeve          --->	adam-khan
black-bolt           --->	black-parker
kamala-khan          --->	kamala-man
ms-marvel            --->	ms-maniac
peter-parker         --->	peter-marvel
doctor-doom          --->	doctor-bolt


# Deeper tokenization scrambling

In [295]:
from nltk.corpus import wordnet as wn

class BigramWordnetTokenizer:
    """Tokenize concatenation of two words from WordNet."""

    def __init__(self):
        wn.synsets('dog')

    def tokenize(self, word: str) -> list[tuple[str, ...]]:
        result = []
        if wn.synsets(word):
            result.append((word,))

        for i in range(1, len(word)):
            prefix_synsets = wn.synsets(word[:i])
            suffix_synsets = wn.synsets(word[i:])

            if prefix_synsets and suffix_synsets:
                result.append((word[:i], word[i:]))

        return result

    def tokenize_best_result(self, word: str) -> tuple[str, ...] | None:
        res = self.tokenize(word)
        half_len = len(word) / 2
        best_tokenization = None

        score = lambda tok: abs(len(tok) - half_len)
        
        for tokenization in res:
            if word in tokenization:
                continue
            if not best_tokenization or score(tokenization) < score(best_tokenization):
                best_tokenization = tokenization
        return best_tokenization

In [296]:
tokenizer = BigramWordnetTokenizer()

In [297]:
tokenizer.tokenize_best_result('pineapple')

('pine', 'apple')

In [300]:
tokenizer.tokenize_best_result('batman')

('bat', 'man')

In [299]:
tokenizer.tokenize_best_result('eggplant')

('egg', 'plant')

In [298]:
tokenizer.tokenize_best_result('submariner')

('sub', 'mariner')

In [301]:
tokenizer.tokenize_best_result('boomslang')

('boom', 'slang')

In [302]:
tokenizer.tokenize_best_result('daredevil')

('dare', 'devil')

In [305]:
tokenizer.tokenize_best_result('juggernaut')

In [None]:
# todo: use our bigrams?