In [11]:
import elasticsearch
from elasticsearch import Elasticsearch

import os
from dotenv import load_dotenv
from pprint import pprint

# Elasticsearch

In [12]:
load_dotenv()

True

In [13]:
def connect_to_elasticsearch(
        scheme: str,
        host: str,
        port: int,
        username: str,
        password: str,
) -> Elasticsearch:
    """Create an Elasticsearch client for a single node using basic authentication.

    The client is configured with HTTP compression enabled and a 10 second
    request timeout.
    """
    node = {'scheme': scheme, 'host': host, 'port': port}
    credentials = (username, password)
    client = Elasticsearch(
        hosts=[node],
        basic_auth=credentials,
        http_compress=True,
        request_timeout=10,
    )
    return client


def index_exists(elastic: Elasticsearch, index_name: str) -> bool:
    """Tell whether `index_name` exists on the cluster reachable through `elastic`.

    The result of the client call is coerced with bool() so the function returns a
    plain bool, as annotated.
    NOTE(review): with the 8.x client `indices.exists` appears to return a response
    wrapper rather than a bool - confirm against the installed client version.
    """
    return bool(elastic.indices.exists(index=index_name))


def show_mapping(index: str):
    """Pretty-print every mapped field of `index`: the field name, then its definition.

    Uses the notebook-level `elastic` client.
    """
    mapping = elastic.indices.get_mapping(index=index)
    # Read the entry of the index that was actually requested. The lookup used to go
    # through the ES_INDEX environment variable, which ignored the `index` argument.
    fields = mapping[index]['mappings']['properties']

    for field_name, field_definition in fields.items():
        pprint(field_name)
        pprint(field_definition)

In [14]:
# Name of the index every query in this notebook targets (ES_INDEX environment variable).
index_name = os.getenv('ES_INDEX')

# Notebook-wide client, configured from the ES_* environment variables.
elastic = connect_to_elasticsearch(
    os.getenv('ES_SCHEME'),
    os.getenv('ES_HOST'),
    int(os.getenv('ES_PORT')),  # os.getenv returns None for an unset variable, so int() raises TypeError then
    os.getenv('ES_USERNAME'),
    os.getenv('ES_PASSWORD')
)

# Stop here if the configured index is not present on the cluster.
assert index_exists(elastic, index_name)

In [15]:
# show_mapping(index_name)

In [16]:
def perform_query(query, limit_names=100) -> dict:
    """Run `query` against the notebook's index and return the first hit.

    Uses the notebook-level `elastic` client and `index_name`.

    Args:
        query: Elasticsearch query body passed through as the `query` argument.
        limit_names: maximum number of entries of `data.names` that the script field
            `script_names` returns for the hit.

    Returns:
        The first hit of the response, including the requested `fields` and the
        `script_names` script field.

    Raises:
        IndexError: when the query matches nothing (callers rely on this to skip
            unknown collections).
    """
    # Script that truncates the stored list of names on the server side.
    names_script = (
        "params['_source'].data.names.stream()"
        f".limit({limit_names})"
        ".collect(Collectors.toList())"
    )

    response = elastic.search(
        index=index_name,
        query=query,
        size=1,  # only the first hit is used, so do not transfer the rest of the page
        source=True,
        fields=['metadata.id', 'data.collection_name', 'metadata.members_count',
                'template.collection_types'],
        script_fields={
            "script_names": {
                "script": {
                    "source": names_script
                }
            }
        }
    )

    hits = response['hits']['hits']
    if not hits:
        # Same exception type as the bare `[0]` lookup raised, with a readable message.
        raise IndexError(f'no collection matched query: {query!r}')

    return hits[0]

In [17]:
def get_collection_hit_by_id(id_: str):
    """Return the first search hit whose `metadata.id.keyword` matches `id_`."""
    id_query = {'match': {'metadata.id.keyword': id_}}
    hit = perform_query(id_query)
    return hit

In [18]:
def get_collection_name_and_names_by_id(id_: str) -> tuple[str, list[str]]:
    """Look a collection up by id and return its name with its normalized member names.

    The member names come from the `script_names` script field of the hit.
    """
    fields = perform_query({'match': {'metadata.id.keyword': id_}})['fields']
    collection_name = fields['data.collection_name'][0]
    normalized_names = [entry['normalized_name'] for entry in fields['script_names']]
    return collection_name, normalized_names

In [25]:
def get_collection_name_and_names_from_hit(hit: dict) -> tuple[str, list[str]]:
    """Extract the collection name and the normalized member names from a search hit."""
    fields = hit['fields']
    collection_name = fields['data.collection_name'][0]
    normalized_names = []
    for entry in fields['script_names']:
        normalized_names.append(entry['normalized_name'])
    return collection_name, normalized_names

In [21]:
def get_collection_types_from_hit(hit: dict) -> list[str]:
    """Return every second entry (positions 1, 3, 5, ...) of `template.collection_types`.

    NOTE(review): presumably the field alternates identifier/label pairs and the
    labels sit at the odd positions - confirm against the index mapping.
    """
    raw_types = hit['fields']['template.collection_types']
    return raw_types[1::2]

# Experiments

In [27]:
import random

random.seed(311) 

In [28]:
def get_normalized_tokenized_tuples(hit: dict) -> list[tuple[str, list[str]]]:
    """Pair each member's normalized name with its tokenized form, in hit order."""
    pairs = []
    for entry in hit['fields']['script_names']:
        pairs.append((entry['normalized_name'], entry['tokenized_name']))
    return pairs

In [29]:
def show_comparison(before_names: list[str], after_names: list[str]):
    """Print each original name next to its scrambled counterpart, one pair per line.

    The original name is left-aligned in a 20 character column. Pairing stops at the
    shorter of the two lists.
    """
    pairs = zip(before_names, after_names)
    for original, scrambled in pairs:
        line = f'{original: <20} --->\t{scrambled}'
        print(line)

In [30]:
def show_collection_names(hit: dict, n : int | None = None):
    hit_f = hit['fields']
    
    print(f"=== collection: {hit_f['data.collection_name'][0]} ({hit_f['metadata.id'][0]}) ===")
    print(f"members count: {hit_f['metadata.members_count'][0]}")
    print(f"collection types: {hit_f['template.collection_types'][1::2]}")
    
    print('\n' + '====='*12 + '\n')
    
    for n_name, t_name in list(map(lambda r: (r['normalized_name'], r['tokenized_name']), hit_f['script_names']))[:n]:
        print(f'{n_name: <30} {t_name}\n')

In [31]:
c_vegefruits = get_collection_hit_by_id('Q8475074')
show_collection_names(c_vegefruits, n=15)

=== collection: Fruit vegetables (Q8475074) ===
members count: 44.0
collection types: ['taxon']


maize                          ['maize']

tomato                         ['tomato']

eggplant                       ['eggplant']

cucumber                       ['cucumber']

pea                            ['pea']

chayote                        ['chayote']

jalapeno                       ['jalapeno']

capsicum                       ['capsicum']

breadfruit                     ['breadfruit']

calabash                       ['calabash']

tomatillo                      ['tomatillo']

luffa                          ['luffa']

waxgourd                       ['wax', 'gourd']

tinda                          ['tinda']

chiqua                         ['chi', 'qua']



In [32]:
c_pdishes = get_collection_hit_by_id('Q3244225')
show_collection_names(c_pdishes, n=15)

=== collection: Potato dishes (Q3244225) ===
members count: 114.0
collection types: ['potato dish']


curlyfries                     ['curly', 'fries']

poutine                        ['poutine']

potatochip                     ['potato', 'chip']

cottagepie                     ['cottage', 'pie']

frenchfries                    ['french', 'fries']

hashbrowns                     ['hash', 'browns']

rosti                          ['rosti']

kapsalon                       ['kapsalon']

aligot                         ['aligot']

knish                          ['knish']

latka                          ['latka']

latke                          ['latke']

tatertots                      ['tater', 'tots']

kugel                          ['kugel']

cepelinai                      ['cepelinai']



In [33]:
c_lotr = get_collection_hit_by_id('Q1204735')
show_collection_names(c_lotr, n=15)

=== collection: Middle-earth characters (Q1204735) ===
members count: 79.0
collection types: ["character from Tolkien's legendarium"]


sauron                         ['sauron']

galadriel                      ['galadriel']

isildur                        ['isildur']

gandalf                        ['gandalf']

morgoth                        ['morgoth']

aragorn                        ['aragorn']

elrond                         ['elrond']

gollum                         ['gollum']

smeagol                        ['smeagol']

elendil                        ['elendil']

arwen                          ['arwen']

gilgalad                       ['gilgalad']

maia                           ['maia']

hobbit                         ['hobbit']

balrog                         ['balrog']



In [34]:
c_snakes = get_collection_hit_by_id('Q7485198')
show_collection_names(c_snakes, n=15)

=== collection: Venomous snakes (Q7485198) ===
members count: 35.0
collection types: ['taxon']


boomslang                      ['boomslang']

elapidae                       ['elapidae']

tigersnake                     ['tiger', 'snake']

gaboonviper                    ['gaboon', 'viper']

hydrophiinae                   ['hydrophiinae']

dugite                         ['dugite']

twigsnake                      ['twig', 'snake']

acanthophis                    ['acanthophis']

forestcobra                    ['forest', 'cobra']

lachesismuta                   ['lachesis', 'muta']

azemiops                       ['azemiops']

browntreesnake                 ['brown', 'tree', 'snake']

jamesonsmamba                  ['jamesons', 'mamba']

manybandedkrait                ['manybanded', 'krait']

easterncopperhead              ['eastern', 'copperhead']



In [35]:
c_marvel = get_collection_hit_by_id('Q371776')
show_collection_names(c_marvel, n=25)

=== collection: Marvel Comics characters (Q371776) ===
members count: 1935.0
collection types: ['fictional character']


moonknight                     ['moon', 'knight']

spiderman                      ['spiderman']

satan                          ['satan']

archangel                      ['archangel']

namor                          ['namor']

submariner                     ['submariner']

ironman                        ['iron', 'man']

tonystark                      ['tony', 'stark']

ironmaniac                     ['iron', 'maniac']

oshtur                         ['oshtur']

adamandeve                     ['adam', 'and', 'eve']

thanos                         ['thanos']

shehulk                        ['shehulk']

hulk                           ['hulk']

blackbolt                      ['black', 'bolt']

utopia                         ['utopia']

deadpool                       ['deadpool']

demon                          ['demon']

kamalakhan                     ['kamala', 'khan']


## Original tokenization scrambling

In [36]:
from copy import copy

In [37]:
def name_tuples_to_bigrams(name_tuples: list[tuple[str, tuple]]) -> tuple[list[str], list[str]]:
    """Split tokenized names into two parallel lists: first tokens and remainders.

    Names with fewer than two tokens are skipped. For names with more than two tokens
    everything after the first token is concatenated into a single right-hand part.

    Args:
        name_tuples: (normalized_name, tokenized_name) pairs; only the tokens are used.

    Returns:
        (left_names, right_names), two lists of equal length.
    """
    left_names = []
    right_names = []
    for _, tokenized_name in name_tuples:
        if len(tokenized_name) < 2:
            continue  # todo: here we could use BigramTokenizer
        left_names.append(tokenized_name[0])
        # With exactly two tokens the join simply yields the second token.
        right_names.append(''.join(tokenized_name[1:]))
    return left_names, right_names

In [38]:
def fs_to_zipped(firsts: list[str], seconds: list[str]) -> list[str]:
    """Join the two lists pairwise with '-' and return the results as a new list."""
    return ['-'.join(pair) for pair in zip(firsts, seconds)]

### Top 10 bigrams swap

In [39]:
def top10_bigram_swap(c_hit: dict, topn=10):
    """Print the first `topn` two-token names of a hit next to a shuffled variant.

    Only names tokenized into exactly two tokens are used. The first tokens stay in
    place while the second tokens are shuffled with the module-level `random` state.
    Nothing is printed when the hit has no two-token names.
    """
    name_tuples = get_normalized_tokenized_tuples(c_hit)
    bigrams = [tokens for _, tokens in name_tuples if len(tokens) == 2][:topn]
    if not bigrams:
        # Unzipping an empty list yields nothing to unpack into firsts/seconds,
        # which used to raise ValueError.
        return

    firsts = [first for first, _ in bigrams]
    seconds = [second for _, second in bigrams]
    before = fs_to_zipped(firsts, seconds)
    random.shuffle(seconds)
    after = fs_to_zipped(firsts, seconds)
    show_comparison(before, after)

In [40]:
# fruits
top10_bigram_swap(c_vegefruits)

wax-gourd            --->	wax-anguria
chi-qua              --->	chi-gourd
fresno-chile         --->	fresno-aegyptiaca
blighia-sapida       --->	blighia-chile
asparagus-bean       --->	asparagus-grandis
luffa-aegyptiaca     --->	luffa-peruviana
physalis-peruviana   --->	physalis-sapida
coccinia-grandis     --->	coccinia-qua
cucumis-anguria      --->	cucumis-acutangula
luffa-acutangula     --->	luffa-bean


In [41]:
# dishes
top10_bigram_swap(c_pdishes)

curly-fries          --->	curly-chip
potato-chip          --->	potato-fries
cottage-pie          --->	cottage-browns
french-fries         --->	french-fries
hash-browns          --->	hash-potato
tater-tots           --->	tater-pie
chip-butty           --->	chip-tots
home-fries           --->	home-fries
potato-salad         --->	potato-butty
baked-potato         --->	baked-salad


In [42]:
# lotr
top10_bigram_swap(c_lotr)

tom-bombadil         --->	tom-gamgee
denethor-ii          --->	denethor-baggins
frodo-baggins        --->	frodo-baggins
bilbo-baggins        --->	bilbo-ii
peregrin-took        --->	peregrin-felagund
samwise-gamgee       --->	samwise-bombadil
turin-turambar       --->	turin-turambar
finrod-felagund      --->	finrod-wormtongue
grima-wormtongue     --->	grima-took
thorin-oakenshield   --->	thorin-oakenshield


In [43]:
# snakes
top10_bigram_swap(c_snakes)

tiger-snake          --->	tiger-viper
gaboon-viper         --->	gaboon-krait
twig-snake           --->	twig-insularis
forest-cobra         --->	forest-tigrinus
lachesis-muta        --->	lachesis-snake
jamesons-mamba       --->	jamesons-snake
manybanded-krait     --->	manybanded-cobra
eastern-copperhead   --->	eastern-copperhead
bothrops-insularis   --->	bothrops-muta
rhabdophis-tigrinus  --->	rhabdophis-mamba


In [44]:
# marvel
top10_bigram_swap(c_marvel)

moon-knight          --->	moon-stark
iron-man             --->	iron-bolt
tony-stark           --->	tony-khan
iron-maniac          --->	iron-maniac
black-bolt           --->	black-knight
kamala-khan          --->	kamala-marvel
ms-marvel            --->	ms-rider
peter-parker         --->	peter-parker
doctor-doom          --->	doctor-man
ghost-rider          --->	ghost-doom


### Without skipping non-bigrams (n>2)

In [45]:
def top10_bigram_swap_no_skip(c_hit: dict, topn=10):
    """Like `top10_bigram_swap`, but names with more than two tokens are kept too.

    The split into left/right parts is delegated to `name_tuples_to_bigrams`; the
    right parts are then shuffled with the module-level `random` state and the
    before/after pairs are printed.
    """
    name_tuples = get_normalized_tokenized_tuples(c_hit)
    # Take only the first two returned lists: `name_tuples_to_bigrams` is redefined
    # further down in this notebook to return a third list (unigrams), and unpacking
    # exactly two values would then fail when this cell is re-run.
    firsts, seconds = name_tuples_to_bigrams(name_tuples)[:2]
    firsts, seconds = firsts[:topn], seconds[:topn]
    before = fs_to_zipped(firsts, seconds)
    random.shuffle(seconds)
    after = fs_to_zipped(firsts, seconds)
    show_comparison(before, after)

In [46]:
# fruits
top10_bigram_swap_no_skip(c_vegefruits)

wax-gourd            --->	wax-aegyptiaca
chi-qua              --->	chi-peruviana
fresno-chile         --->	fresno-chile
blighia-sapida       --->	blighia-anguria
asparagus-bean       --->	asparagus-mexicochile
luffa-aegyptiaca     --->	luffa-grandis
new-mexicochile      --->	new-bean
physalis-peruviana   --->	physalis-gourd
coccinia-grandis     --->	coccinia-qua
cucumis-anguria      --->	cucumis-sapida


In [47]:
# dishes
top10_bigram_swap_no_skip(c_pdishes)

curly-fries          --->	curly-fries
potato-chip          --->	potato-fries
cottage-pie          --->	cottage-salad
french-fries         --->	french-pie
hash-browns          --->	hash-fries
tater-tots           --->	tater-potato
chip-butty           --->	chip-butty
home-fries           --->	home-tots
potato-salad         --->	potato-chip
baked-potato         --->	baked-browns


In [48]:
# lotr
top10_bigram_swap_no_skip(c_lotr)

tom-bombadil         --->	tom-thebowman
denethor-ii          --->	denethor-andidril
frodo-baggins        --->	frodo-felagund
bilbo-baggins        --->	bilbo-turambar
peregrin-took        --->	peregrin-gamgee
samwise-gamgee       --->	samwise-took
tuor-andidril        --->	tuor-bombadil
turin-turambar       --->	turin-ii
bard-thebowman       --->	bard-baggins
finrod-felagund      --->	finrod-baggins


In [49]:
# snakes
top10_bigram_swap_no_skip(c_snakes)

tiger-snake          --->	tiger-cobra
gaboon-viper         --->	gaboon-snake
twig-snake           --->	twig-snake
forest-cobra         --->	forest-insularis
lachesis-muta        --->	lachesis-copperhead
brown-treesnake      --->	brown-krait
jamesons-mamba       --->	jamesons-mamba
manybanded-krait     --->	manybanded-treesnake
eastern-copperhead   --->	eastern-muta
bothrops-insularis   --->	bothrops-viper


In [50]:
# marvel
top10_bigram_swap_no_skip(c_marvel)

moon-knight          --->	moon-maniac
iron-man             --->	iron-parker
tony-stark           --->	tony-knight
iron-maniac          --->	iron-stark
adam-andeve          --->	adam-man
black-bolt           --->	black-khan
kamala-khan          --->	kamala-marvel
ms-marvel            --->	ms-andeve
peter-parker         --->	peter-bolt
doctor-doom          --->	doctor-doom


# Further tokenization scrambling

In [51]:
from nltk.corpus import wordnet as wn

from itertools import chain


class LongestBigramTokenizer:
    """Tokenize concatenation of two words from WordNet (minimize abs(len(b1) - len(b2)) for b1, b2 tokenization)."""

    def __init__(self):
        wn.synsets('dog')
        self.min_word_len = 3

    def get_tokenization(self, word: str) -> tuple[str, str] | None:
        if len(word) <= 5:
            return None
        
        for i in self.generate_indices(word):
            prefix_synsets = wn.synsets(word[:i])
            suffix_synsets = wn.synsets(word[i:])
            if prefix_synsets and suffix_synsets:
                return (word[:i], word[i:])
        
        if wn.synsets(word):
            return (word, '')
        return None

    def generate_indices(self, iterable) -> list[int]:
        mid = len(iterable) // 2
        left_indices = list(reversed(range(mid)))
        right_indices = list(range(mid + 1, len(iterable)))
        limit = max(len(left_indices), len(right_indices)) - self.min_word_len
        return list(chain([mid], *zip(right_indices[:limit], left_indices[:limit])))

In [52]:
tokenizer = LongestBigramTokenizer()

In [53]:
tokenizer.get_tokenization('')

In [54]:
tokenizer.get_tokenization('batman')

('bat', 'man')

In [55]:
tokenizer.get_tokenization('eggplant')

('egg', 'plant')

In [56]:
tokenizer.get_tokenization('submariner')

('sub', 'mariner')

In [57]:
tokenizer.get_tokenization('boomslang')

('boom', 'slang')

In [58]:
tokenizer.get_tokenization('daredevil')

('dare', 'devil')

In [59]:
tokenizer.get_tokenization('juggernaut')

('juggernaut', '')

In [60]:
tokenizer.get_tokenization('jaguar')

('jaguar', '')

In [61]:
tokenizer.get_tokenization('bleblebleble')

In [62]:
tokenizer.get_tokenization('anox')

# Methods comparison report

In [63]:
tokenizer = LongestBigramTokenizer()

def name_tuples_to_bigrams(name_tuples: list[tuple[str, tuple]]) -> tuple[list[str], list[str], list[str]]:
    """Split names into left parts, right parts and leftover unigrams.

    This definition replaces the earlier two-list `name_tuples_to_bigrams` of this
    notebook. Single-token names are passed to the notebook-level `tokenizer`; when
    it finds no real split the name is collected as a unigram. Names with two tokens
    are used as they are, names with more tokens get everything after the first
    token concatenated into the right part. Names without tokens are ignored.

    Returns:
        (left_names, right_names, unigrams); the first two lists are parallel.
    """
    lefts, rights, singles = [], [], []
    for name, tokens in name_tuples:
        token_count = len(tokens)
        if token_count == 0:
            continue

        if token_count == 1:
            split = tokenizer.get_tokenization(name)
            if split is None or split == (name, ''):
                # No split into two words was found.
                singles.append(name)
            else:
                lefts.append(split[0])
                rights.append(split[1])
            continue

        lefts.append(tokens[0])
        if token_count == 2:
            rights.append(tokens[1])
        else:
            rights.append(''.join(tokens[1:]))  # todo: is this approach ok?
    return lefts, rights, singles

In [64]:
from typing import Literal
from copy import copy


def run_token_scramble(
    collection_id: str,
    top_n: int,  
    mixing: Literal['left-right-skip', 'left-right-with-unigrams', 'full-shuffle']
):
    """Scramble the tokens of a collection's names into new (left, right) pairs.

    The collection is fetched by id, its names are split into left parts, right parts
    and unigrams, and each of the three lists is cut to `top_n` entries. All
    randomness comes from the module-level `random` state.

    Args:
        collection_id: id of the collection to fetch.
        top_n: maximum number of entries kept from each of the three lists.
        mixing: scrambling strategy:
            'left-right-skip' - keep the left parts, shuffle the right parts;
            'left-right-with-unigrams' - as above, then each pair has a 20% chance of
                getting one side (picked 50/50) replaced by the next unused unigram;
            'full-shuffle' - shuffle all parts and unigrams together and pair up the
                first 2 * `top_n` of them.

    Returns:
        A list of (left, right) tuples.

    Raises:
        ValueError: if `mixing` is not one of the supported values. This is checked
            before the collection is fetched.
    """
    if mixing not in ('left-right-skip', 'left-right-with-unigrams', 'full-shuffle'):
        raise ValueError('Invalid `mixing` value.')

    name_tuples = get_normalized_tokenized_tuples(get_collection_hit_by_id(collection_id))
    left_unigrams, right_unigrams, just_unigrams = name_tuples_to_bigrams(name_tuples)
    left_unigrams = left_unigrams[:top_n]
    right_unigrams = right_unigrams[:top_n]
    just_unigrams = just_unigrams[:top_n]
    original_bigrams = [left + right for left, right in zip(left_unigrams, right_unigrams)]

    def shuffle_right(max_attempts: int = 10):
        """Shuffle the right parts until no pair equals its original, best effort."""
        for _ in range(max_attempts):
            unchanged = any(
                left + right == original
                for left, right, original in zip(left_unigrams, right_unigrams, original_bigrams)
            )
            if not unchanged:
                break
            random.shuffle(right_unigrams)

    if mixing == 'left-right-skip':
        shuffle_right()
        return list(zip(left_unigrams, right_unigrams))

    if mixing == 'left-right-with-unigrams':
        shuffle_right()
        # insert random unigrams
        unigram_prob = 0.2
        for i in range(len(left_unigrams)):
            if not just_unigrams:
                break
            if random.random() < unigram_prob:
                if random.random() < 0.5:
                    left_unigrams[i] = just_unigrams.pop(0)
                else:
                    right_unigrams[i] = just_unigrams.pop(0)
        return list(zip(left_unigrams, right_unigrams))

    # mixing == 'full-shuffle'
    all_unigrams = left_unigrams + right_unigrams + just_unigrams
    random.shuffle(all_unigrams)
    all_unigrams = all_unigrams[:2*top_n]
    return list(zip(all_unigrams[::2], all_unigrams[1::2]))

In [65]:
run_token_scramble('Q371776', top_n=10, mixing='left-right-skip')

[('moon', 'maniac'),
 ('arch', 'mariner'),
 ('sub', 'man'),
 ('iron', 'khan'),
 ('tony', 'angel'),
 ('iron', 'andeve'),
 ('adam', 'bolt'),
 ('black', 'stark'),
 ('dead', 'knight'),
 ('kamala', 'pool')]

In [66]:
run_token_scramble('Q371776', top_n=10, mixing='left-right-with-unigrams')

[('moon', 'pool'),
 ('arch', 'andeve'),
 ('sub', 'angel'),
 ('iron', 'bolt'),
 ('tony', 'knight'),
 ('spiderman', 'khan'),
 ('adam', 'maniac'),
 ('black', 'stark'),
 ('dead', 'mariner'),
 ('kamala', 'man')]

In [67]:
run_token_scramble('Q371776', top_n=10, mixing='full-shuffle')

[('mariner', 'iron'),
 ('iron', 'spiderman'),
 ('satan', 'moon'),
 ('stark', 'namor'),
 ('arch', 'tony'),
 ('angel', 'andeve'),
 ('dead', 'thanos'),
 ('adam', 'pool'),
 ('man', 'maniac'),
 ('hulk', 'utopia')]

In [68]:
import csv
from collections import defaultdict


def load_collection_ids(filename: str = 'test_collections.csv') -> dict:
    """Read collection ids from a CSV file and group them by collection type.

    Column 7 (index 6) holds the collection id and column 8 (index 7) a ', '-separated
    list of types. Rows with an empty id or type, the header row (type column equal
    to 'type') and rows with fewer than 8 columns are skipped.

    Args:
        filename: path of the CSV file to read.

    Returns:
        A mapping from a sorted tuple of unique types to the list of unique ids of
        that type, in order of first appearance.
    """
    res_dict = defaultdict(list)
    # newline='' is what the csv module expects from the file object it reads.
    with open(filename, newline='', encoding='utf-8') as f:
        for row in csv.reader(f, delimiter=','):
            if len(row) < 8:
                continue
            id_, type_ = row[6], row[7]
            if not id_ or not type_ or type_ == 'type':
                continue
            # Sorted so the same set of types always yields the same key, whatever
            # order the types are listed in.
            type_tuple = tuple(sorted(set(type_.strip().split(", "))))
            res_dict[type_tuple].append(id_)

    for type_tuple in res_dict.keys():
        # dict.fromkeys removes duplicates and keeps the first-seen order.
        res_dict[type_tuple] = list(dict.fromkeys(res_dict[type_tuple]))
    return res_dict

In [69]:
def run_comparison() -> str:
    """Build an HTML report comparing the three token-scramble strategies.

    The collections come from `load_collection_ids()` and are grouped by type. For
    each collection the report shows its name, its first 25 names and, for the top 10
    and top 20 names, one table row with the output of every mixing strategy.
    Collections whose lookup raises IndexError (no matching hit) are skipped.
    All text taken from the data is HTML-escaped before it is put into the report.

    Returns:
        The report as a single string, one HTML fragment per line.
    """
    from html import escape  # stdlib; imported here so the cell stays self-contained

    def bigrams_to_str(bgrms: list[tuple[str, str]]) -> str:
        """Render bigrams as escaped 'left-right' items separated by two spaces."""
        return escape('  '.join(f'{pair[0]}-{pair[1]}' for pair in bgrms))

    report = ['<meta charset="UTF-8">']

    collection_type2ids_dict = load_collection_ids()
    for type_tuple in sorted(collection_type2ids_dict.keys()):
        print(f'processing collections of type: {type_tuple} with ids: {collection_type2ids_dict[type_tuple]}')
        report.append(f'<h2>types: {escape(", ".join(type_tuple))}</h2>')
        for c_id in collection_type2ids_dict[type_tuple]:
            try:
                c_name, c_names = get_collection_name_and_names_by_id(c_id)
            except IndexError:
                # No hit for this id - leave the collection out of the report.
                continue
            report.append(f'<h3>{escape(c_name)} ({escape(c_id)})</h3>')
            report.append(f'<br><b>top 25 names in collection:</b> {escape(" ".join(c_names[:25]))}</br>')

            for topn in (10, 20):
                report.append(f'<h4>top {topn} names token-scramble:</h4>')
                report.append('<table>')
                report.append('<thead>')
                report.append('''<tr>
                                <th style="width: 34%">left-right-shuffle</th>
                                <th style="width: 33%">l-r-shuffle-with-unigrams</th>
                                <th style="width: 33%">full-shuffle</th>
                            </tr>''')
                report.append('</thead>')

                bigrams_lr = run_token_scramble(c_id, top_n=topn, mixing='left-right-skip')
                bigrams_lr_unigrams = run_token_scramble(c_id, top_n=topn, mixing='left-right-with-unigrams')
                bigrams_full = run_token_scramble(c_id, top_n=topn, mixing='full-shuffle')

                report.append('<tbody>')
                report.append((f'<tr><td>{bigrams_to_str(bigrams_lr)}</td><td>{bigrams_to_str(bigrams_lr_unigrams)}</td>'
                               f'<td>{bigrams_to_str(bigrams_full)}</td></tr>'))
                report.append('</tbody>')
                report.append('</table>')

    return '\n'.join(report)

In [55]:
html_str = run_comparison()

processing collections of type: ('Aztec deity',) with ids: ['Q3032304']
processing collections of type: ('CubeSat',) with ids: ['Q2976857']
processing collections of type: ('European cuisine',) with ids: ['Q17067317']
processing collections of type: ('ISO standard',) with ids: ['Q749445']
processing collections of type: ('Internet meme',) with ids: ['Q6023923']
processing collections of type: ('Japanese era name',) with ids: ['Q1847640']
processing collections of type: ('Latin phrase',) with ids: ['Q87248', 'Q1435931', 'Q1477505', 'Q3409821']
processing collections of type: ('Mayan deity',) with ids: ['Q129244']
processing collections of type: ('Mediterranean country',) with ids: ['Q4809248']
processing collections of type: ('New Gods',) with ids: ['Q278238']
processing collections of type: ('Notorious Markets',) with ids: ['Q105081725']
processing collections of type: ('United States executive order', 'presidential directive') with ids: ['Q104922747']
processing collections of type: (

In [56]:
# The report declares <meta charset="UTF-8">, so write it as UTF-8 instead of relying
# on the platform's default encoding.
with open('token-scramble-report.html', 'w', encoding='utf-8') as f:
    f.write(html_str)

**Comparison 2**

In [70]:
# One collection id per line; surrounding whitespace (including the newline) is removed.
with open('test_collections_2.txt') as f:
    collections_ids_2 = [line.strip() for line in f.readlines()]

In [71]:
collections_ids_2[:5]

['Q6562472', 'Q8247879', 'Q839198', 'Q6353228', 'Q1845766']

In [75]:
def run_comparison2() -> str:
    """Build the token-scramble HTML report for the ids in `collections_ids_2`.

    For each collection the report shows its name, id and types, its first 25 names
    and, for the top 10 and top 20 names, one table row with the output of every
    mixing strategy. Collections whose lookup raises IndexError (no matching hit) are
    skipped. All text taken from the data is HTML-escaped before it is put into the
    report.

    Returns:
        The report as a single string, one HTML fragment per line.
    """
    from html import escape  # stdlib; imported here so the cell stays self-contained

    def bigrams_to_str(bgrms: list[tuple[str, str]]) -> str:
        """Render bigrams as escaped 'left-right' items separated by two spaces."""
        return escape('  '.join(f'{pair[0]}-{pair[1]}' for pair in bgrms))

    report = ['<meta charset="UTF-8">']

    for c_id in collections_ids_2:
        try:
            c_hit = get_collection_hit_by_id(c_id)
            c_types = get_collection_types_from_hit(c_hit)
            c_name, c_names = get_collection_name_and_names_from_hit(c_hit)
            print(f'processing collection {c_name} with id {c_id}')
        except IndexError:
            # No hit for this id - leave the collection out of the report.
            continue
        report.append(f'<h3>{escape(c_name)} ({escape(c_id)}), type: {escape(", ".join(c_types))}</h3>')
        report.append(f'<br><b>top 25 names in collection:</b> {escape(" ".join(c_names[:25]))}</br>')

        for topn in (10, 20):
            report.append(f'<h4>top {topn} names token-scramble:</h4>')
            report.append('<table>')
            report.append('<thead>')
            report.append('''<tr>
                            <th style="width: 34%">left-right-shuffle</th>
                            <th style="width: 33%">l-r-shuffle-with-unigrams</th>
                            <th style="width: 33%">full-shuffle</th>
                        </tr>''')
            report.append('</thead>')

            bigrams_lr = run_token_scramble(c_id, top_n=topn, mixing='left-right-skip')
            bigrams_lr_unigrams = run_token_scramble(c_id, top_n=topn, mixing='left-right-with-unigrams')
            bigrams_full = run_token_scramble(c_id, top_n=topn, mixing='full-shuffle')

            report.append('<tbody>')
            report.append((f'<tr><td>{bigrams_to_str(bigrams_lr)}</td><td>{bigrams_to_str(bigrams_lr_unigrams)}</td>'
                           f'<td>{bigrams_to_str(bigrams_full)}</td></tr>'))
            report.append('</tbody>')
            report.append('</table>')

    return '\n'.join(report)

In [76]:
html_str_2 = run_comparison2()

processing collection Assemblies of God people with id Q6562472
processing collection American rock guitarists with id Q8247879
processing collection Polish painters with id Q839198
processing collection Footballers with 400 or more La Liga appearances with id Q6353228
processing collection Artists who reached number one in the United States with id Q1845766
processing collection Britney Spears albums with id Q6843306
processing collection Pink Floyd live albums with id Q8763088
processing collection Films based on English-language comics with id Q1741678
processing collection 2012 films with id Q6582902
processing collection Fruits originating in Asia with id Q8475079
processing collection Fruit vegetables with id Q8475074
processing collection Tropical fruit with id Q7461140
processing collection Songs recorded by Britney Spears with id Q1278951
processing collection Bryan Adams songs with id Q8317026
processing collection Iron Man enemies with id Q1660472
processing collection The I

In [77]:
# The report declares <meta charset="UTF-8">, so write it as UTF-8 instead of relying
# on the platform's default encoding.
with open('token-scramble-report2.html', 'w', encoding='utf-8') as f:
    f.write(html_str_2)