In [1]:
from typing import List, Tuple
import random
import pandas as pd
import numpy as np

In [2]:
# Seed handed to random.seed() by the shuffling and redundant-word helpers below,
# so that re-running the notebook reproduces the same word order and extra words.
RANDOM_SEED = 73685

In [3]:
# Letter inventories combined by generate_unique_words() into
# consonant-vowel-consonant words (7 * 5 * 7 = 245 combinations).
CONSONANTS = ['f', 'k', 'm', 'n', 'p', 's', 't']
VOWELS = ['a', 'e', 'i', 'o', 'u']

In [4]:
# How many times the complete word list is generated, shuffled and concatenated.
NUMBER_OF_REPETITIONS = 4
# Number of regular words in each group.
WORDS_PER_GROUP = 4
# Number of extra words appended after the regular words of each group.
REDUNDANT_WORDS_PER_GROUP = 1 # With the values above, every fifth word is an extra word chosen so that it can be ignored later

In [5]:
def generate_unique_words() -> List[str]:
    """Build every consonant-vowel-consonant word from CONSONANTS and VOWELS.

    Returns:
        A new list with one word per (first consonant, vowel, second consonant)
        combination, ordered by first consonant, then vowel, then second consonant.
    """
    return [
        f'{onset}{nucleus}{coda}'
        for onset in CONSONANTS
        for nucleus in VOWELS
        for coda in CONSONANTS
    ]

In [6]:
def sort_randomly(words: List[str]) -> List[str]:
    """Shuffle ``words`` in place into a reproducible order and return the same list.

    A private generator seeded with RANDOM_SEED performs the shuffle. The resulting
    order is the same one ``random.seed(RANDOM_SEED)`` followed by
    ``random.shuffle(words)`` produces, but the module-level ``random`` state is no
    longer reset as a hidden side effect of calling this function.

    NOTE(review): the seed is fixed per call, so equal inputs always come back in
    the same order. When this is passed to generate_all_words together with
    generate_unique_words, every repetition therefore has the identical word
    order -- confirm that this is intended.

    Args:
        words: List to shuffle; it is modified in place.

    Returns:
        The same list object that was passed in, now shuffled.
    """
    rng = random.Random(RANDOM_SEED)
    rng.shuffle(words)
    return words

In [7]:
def generate_all_words(repetitions: int, words_fn, sort_fn) -> List[str]:
    """Concatenate ``repetitions`` freshly generated and ordered word lists.

    Args:
        repetitions: Number of rounds to generate.
        words_fn: Zero-argument callable returning the words for one round.
        sort_fn: Callable that receives one round's words and returns them in
            the order in which they should be appended.

    Returns:
        One flat list holding the words of all rounds, in round order.
    """
    return [
        word
        for _ in range(repetitions)
        for word in sort_fn(words_fn())
    ]

In [8]:
# Build the full word list: NUMBER_OF_REPETITIONS rounds of the unique words,
# each round ordered by sort_randomly.
all_words = generate_all_words(NUMBER_OF_REPETITIONS, generate_unique_words, sort_randomly)
all_words

['sof',
 'nuk',
 'fon',
 'mim',
 'pom',
 'kom',
 'mep',
 'kip',
 'fot',
 'paf',
 'kaf',
 'tok',
 'fis',
 'pam',
 'kak',
 'nef',
 'kek',
 'sos',
 'maf',
 'pan',
 'fup',
 'set',
 'nit',
 'fit',
 'kik',
 'fak',
 'kep',
 'tas',
 'mas',
 'mop',
 'mun',
 'sot',
 'kut',
 'nus',
 'tup',
 'nas',
 'sif',
 'saf',
 'nos',
 'fip',
 'pup',
 'sap',
 'pus',
 'nop',
 'sop',
 'tum',
 'mat',
 'kem',
 'pif',
 'mek',
 'tik',
 'mof',
 'pat',
 'sik',
 'muf',
 'kus',
 'tuk',
 'fus',
 'kes',
 'pep',
 'sum',
 'mik',
 'tet',
 'met',
 'nen',
 'sek',
 'tot',
 'sep',
 'faf',
 'fas',
 'nek',
 'tam',
 'sim',
 'nes',
 'sup',
 'tus',
 'puk',
 'sun',
 'fap',
 'kun',
 'kok',
 'ken',
 'ket',
 'fef',
 'fop',
 'som',
 'tuf',
 'muk',
 'kat',
 'nak',
 'sis',
 'tim',
 'pip',
 'tom',
 'pok',
 'nam',
 'mip',
 'nan',
 'pun',
 'tos',
 'mef',
 'mes',
 'sin',
 'men',
 'sak',
 'sen',
 'nuf',
 'tun',
 'kot',
 'pet',
 'tut',
 'tak',
 'mit',
 'non',
 'mut',
 'map',
 'tip',
 'pit',
 'mem',
 'fin',
 'pos',
 'kim',
 'pik',
 'sok',
 'kis',


In [9]:
# Sanity check: total number of words across all repetitions.
len(all_words)

980

In [10]:
def split_into_groups(words: List[str], words_per_group: int) -> List[List[str]]:
    """Cut ``words`` into consecutive chunks of ``words_per_group`` items.

    The final chunk is shorter when the number of words is not an exact
    multiple of ``words_per_group``.

    Args:
        words: Words to split; the list itself is not modified.
        words_per_group: Size of each chunk.

    Returns:
        A list of new lists, one per chunk, in the original word order.
    """
    chunk_starts = range(0, len(words), words_per_group)
    return [words[start:start + words_per_group] for start in chunk_starts]

In [11]:
# Split the flat word list into consecutive groups of WORDS_PER_GROUP words.
groups_of_words = split_into_groups(all_words, WORDS_PER_GROUP)
groups_of_words

[['sof', 'nuk', 'fon', 'mim'],
 ['pom', 'kom', 'mep', 'kip'],
 ['fot', 'paf', 'kaf', 'tok'],
 ['fis', 'pam', 'kak', 'nef'],
 ['kek', 'sos', 'maf', 'pan'],
 ['fup', 'set', 'nit', 'fit'],
 ['kik', 'fak', 'kep', 'tas'],
 ['mas', 'mop', 'mun', 'sot'],
 ['kut', 'nus', 'tup', 'nas'],
 ['sif', 'saf', 'nos', 'fip'],
 ['pup', 'sap', 'pus', 'nop'],
 ['sop', 'tum', 'mat', 'kem'],
 ['pif', 'mek', 'tik', 'mof'],
 ['pat', 'sik', 'muf', 'kus'],
 ['tuk', 'fus', 'kes', 'pep'],
 ['sum', 'mik', 'tet', 'met'],
 ['nen', 'sek', 'tot', 'sep'],
 ['faf', 'fas', 'nek', 'tam'],
 ['sim', 'nes', 'sup', 'tus'],
 ['puk', 'sun', 'fap', 'kun'],
 ['kok', 'ken', 'ket', 'fef'],
 ['fop', 'som', 'tuf', 'muk'],
 ['kat', 'nak', 'sis', 'tim'],
 ['pip', 'tom', 'pok', 'nam'],
 ['mip', 'nan', 'pun', 'tos'],
 ['mef', 'mes', 'sin', 'men'],
 ['sak', 'sen', 'nuf', 'tun'],
 ['kot', 'pet', 'tut', 'tak'],
 ['mit', 'non', 'mut', 'map'],
 ['tip', 'pit', 'mem', 'fin'],
 ['pos', 'kim', 'pik', 'sok'],
 ['kis', 'naf', 'sit', 'nat'],
 ['fep',

In [12]:
def _pick_redundant_word(pool: List[str], core_words: List[str]) -> str:
    """Draw a random word from ``pool`` that does not occur in ``core_words``."""
    # Without this guard the rejection loop below would never terminate.
    if pool and all(word in core_words for word in pool):
        raise ValueError('No redundant word available that is not already in the group')
    candidate = random.choice(pool)
    while candidate in core_words:
        candidate = random.choice(pool)
    return candidate


def add_redundant_word_to_groups(groups: List[List[str]], words_fn, words_per_group: int, redundant_words_per_group: int) -> List[List[str]]:
    """Add randomly chosen extra words to every group, in place.

    For every group except the last, ``redundant_words_per_group`` words are
    placed at positions ``words_per_group`` onwards; a word already sitting at
    such a position is overwritten, otherwise the word is appended. The last
    group is padded with random words until it holds
    ``words_per_group + redundant_words_per_group`` items.

    A drawn word is rejected and redrawn when it already occurs among the first
    ``words_per_group`` words of the group it is being added to. The previous
    version compared the candidate against ``groups[:words_per_group]`` (a list
    of groups, not of words), so that check could never reject anything.
    Results differ from the previous version only where it would have produced
    such a duplicate.

    The module-level ``random`` generator is re-seeded with RANDOM_SEED, so the
    chosen words are reproducible.

    Args:
        groups: Groups of words; the inner lists are modified in place.
        words_fn: Zero-argument callable returning the pool of candidate words.
        words_per_group: Number of regular words per group.
        redundant_words_per_group: Number of extra words per group.

    Returns:
        The same ``groups`` object that was passed in.

    Raises:
        ValueError: If every word in the pool already occurs among a group's
            regular words, so no valid extra word exists.
    """
    # Nothing to extend; the previous version failed with IndexError here.
    if not groups:
        return groups

    redundant_words = words_fn()
    random.seed(RANDOM_SEED)

    # All groups but the last are expected to be full already.
    for group in groups[:-1]:
        for j in range(redundant_words_per_group):
            candidate = _pick_redundant_word(redundant_words, group[:words_per_group])
            slot = words_per_group + j
            if len(group) > slot:
                # Slot already filled (e.g. the cell was re-run): replace it.
                group[slot] = candidate
            else:
                group.append(candidate)

    # The last group may be short, so pad it up to the full group size.
    last_group = groups[-1]
    missing = words_per_group + redundant_words_per_group - len(last_group)
    for _ in range(missing):
        last_group.append(_pick_redundant_word(redundant_words, last_group[:words_per_group]))
    return groups

In [13]:
# Append the extra words to every group. The helper modifies the inner lists in
# place and returns the list it was given, so groups_of_words and
# groups_of_words_with_extra refer to the same (now extended) object.
groups_of_words_with_extra = add_redundant_word_to_groups(groups_of_words, generate_unique_words, WORDS_PER_GROUP, REDUNDANT_WORDS_PER_GROUP)
groups_of_words_with_extra

[['sof', 'nuk', 'fon', 'mim', 'pas'],
 ['pom', 'kom', 'mep', 'kip', 'kon'],
 ['fot', 'paf', 'kaf', 'tok', 'fat'],
 ['fis', 'pam', 'kak', 'nef', 'tit'],
 ['kek', 'sos', 'maf', 'pan', 'fom'],
 ['fup', 'set', 'nit', 'fit', 'nep'],
 ['kik', 'fak', 'kep', 'tas', 'mot'],
 ['mas', 'mop', 'mun', 'sot', 'kan'],
 ['kut', 'nus', 'tup', 'nas', 'kam'],
 ['sif', 'saf', 'nos', 'fip', 'tif'],
 ['pup', 'sap', 'pus', 'nop', 'min'],
 ['sop', 'tum', 'mat', 'kem', 'kif'],
 ['pif', 'mek', 'tik', 'mof', 'nok'],
 ['pat', 'sik', 'muf', 'kus', 'kop'],
 ['tuk', 'fus', 'kes', 'pep', 'kef'],
 ['sum', 'mik', 'tet', 'met', 'tem'],
 ['nen', 'sek', 'tot', 'sep', 'nin'],
 ['faf', 'fas', 'nek', 'tam', 'sut'],
 ['sim', 'nes', 'sup', 'tus', 'fum'],
 ['puk', 'sun', 'fap', 'kun', 'kap'],
 ['kok', 'ken', 'ket', 'fef', 'nup'],
 ['fop', 'som', 'tuf', 'muk', 'mif'],
 ['kat', 'nak', 'sis', 'tim', 'mup'],
 ['pip', 'tom', 'pok', 'nam', 'fuk'],
 ['mip', 'nan', 'pun', 'tos', 'nif'],
 ['mef', 'mes', 'sin', 'men', 'pum'],
 ['sak', 'se

In [14]:
def print_groups(groups: List[List[str]]) -> str:
    """Render the groups as text, one ``<group>.<position>`` line per word.

    Despite its name this function prints nothing; it returns the text. Each
    line holds the group index, a dot, the word's position in the group, two
    tab characters and the word. Every group is followed by a blank line.

    Args:
        groups: Groups of words to render.

    Returns:
        The rendered listing, or an empty string when there are no groups.
    """
    blocks = []
    for group_number, group in enumerate(groups):
        rows = [
            f'{group_number}.{position}\t\t{word}\n'
            for position, word in enumerate(group)
        ]
        blocks.append(''.join(rows) + "\n")
    return ''.join(blocks)

In [15]:
def write_to_file(filename: str, words_string: str) -> None:
    """Write ``words_string`` to ``filename``, replacing any existing content.

    The file is always written as UTF-8 instead of the platform's default
    encoding, so the result does not depend on the machine the notebook runs on.

    Args:
        filename: Path of the file to create or overwrite.
        words_string: Text to write.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(words_string)

In [17]:
# Render the listing once and reuse it for both the preview and the file,
# so the two can never diverge.
words_listing = print_groups(groups_of_words_with_extra)
print(words_listing)
write_to_file('words.txt', words_listing)

0.0		sof
0.1		nuk
0.2		fon
0.3		mim
0.4		pas

1.0		pom
1.1		kom
1.2		mep
1.3		kip
1.4		kon

2.0		fot
2.1		paf
2.2		kaf
2.3		tok
2.4		fat

3.0		fis
3.1		pam
3.2		kak
3.3		nef
3.4		tit

4.0		kek
4.1		sos
4.2		maf
4.3		pan
4.4		fom

5.0		fup
5.1		set
5.2		nit
5.3		fit
5.4		nep

6.0		kik
6.1		fak
6.2		kep
6.3		tas
6.4		mot

7.0		mas
7.1		mop
7.2		mun
7.3		sot
7.4		kan

8.0		kut
8.1		nus
8.2		tup
8.3		nas
8.4		kam

9.0		sif
9.1		saf
9.2		nos
9.3		fip
9.4		tif

10.0		pup
10.1		sap
10.2		pus
10.3		nop
10.4		min

11.0		sop
11.1		tum
11.2		mat
11.3		kem
11.4		kif

12.0		pif
12.1		mek
12.2		tik
12.3		mof
12.4		nok

13.0		pat
13.1		sik
13.2		muf
13.3		kus
13.4		kop

14.0		tuk
14.1		fus
14.2		kes
14.3		pep
14.4		kef

15.0		sum
15.1		mik
15.2		tet
15.3		met
15.4		tem

16.0		nen
16.1		sek
16.2		tot
16.3		sep
16.4		nin

17.0		faf
17.1		fas
17.2		nek
17.3		tam
17.4		sut

18.0		sim
18.1		nes
18.2		sup
18.3		tus
18.4		fum

19.0		puk
19.1		sun
19.2		fap
19.3		kun
19.4		kap

20.0		kok
20.1		ken
20.2		ket


In [20]:
def group_to_table(group_index: int, group: List[str]):
    """Turn one group into table rows of ``[group_index, position, word]``.

    Args:
        group_index: Index of the group, repeated in every row.
        group: Words of the group.

    Returns:
        One three-item list per word, in the order the words appear in ``group``.
    """
    rows = []
    for position, word in enumerate(group):
        rows.append([group_index, position, word])
    return rows

In [26]:
def merge_group_tables(groups: List[List[str]]):
    """Flatten all groups into one DataFrame with a row per word.

    Args:
        groups: Groups of words; a group's position in the list becomes its index.

    Returns:
        A DataFrame with the columns ``Group`` (group index), ``Word``
        (position of the word inside its group) and ``Value`` (the word itself).
    """
    rows = [
        row
        for group_index, group in enumerate(groups)
        for row in group_to_table(group_index, group)
    ]
    return pd.DataFrame(rows, columns=['Group', 'Word', 'Value'])


Unnamed: 0,Group,Word,Value
0,0,0,sof
1,0,1,nuk
2,0,2,fon
3,0,3,mim
4,0,4,pas
...,...,...,...
1220,244,0,tit
1221,244,1,fat
1222,244,2,kon
1223,244,3,pas


In [28]:
# Flatten the groups into a table and export it; index=False keeps the
# DataFrame's row index out of the CSV file.
groups_df = merge_group_tables(groups_of_words_with_extra)
groups_df.to_csv('all_words.csv', index=False)

In [29]:
def calculate_levenshtein_distance(word_a: str, word_b: str) -> Tuple[int, List[int]]:
    """Count the positions at which two equal-length words differ.

    NOTE: despite the name, this is a position-by-position comparison (a
    Hamming distance), not a Levenshtein edit distance: insertions and
    deletions are not considered.

    Args:
        word_a: First word.
        word_b: Second word; must have the same length as ``word_a``.

    Returns:
        A tuple of the number of differing positions and the ascending list of
        those positions (0-based).

    Raises:
        ValueError: If the two words differ in length.
    """
    if len(word_a) != len(word_b):
        raise ValueError('Words must be of the same length')

    different_positions = [
        position
        for position, (letter_a, letter_b) in enumerate(zip(word_a, word_b))
        if letter_a != letter_b
    ]
    return len(different_positions), different_positions

In [30]:
def find_minimal_pairs(words: List[str]) -> List[Tuple[Tuple[str, str], int]]:
    """Find all pairs of words that differ in exactly one position.

    Every unordered pair is compared once, with the earlier word of ``words``
    first, using calculate_levenshtein_distance.

    Args:
        words: Words to compare; all must have the same length.

    Returns:
        A list of ``((first_word, second_word), position)`` entries, where
        ``position`` is the 0-based index of the single differing letter.
    """
    minimal_pairs: List[Tuple[Tuple[str, str], int]] = []
    for index, first in enumerate(words):
        for second in words[index + 1:]:
            distance, positions = calculate_levenshtein_distance(first, second)
            if distance != 1:
                continue
            minimal_pairs.append(((first, second), positions[0]))
    return minimal_pairs

In [31]:
# Collect every pair of unique words that differs in exactly one letter.
# With 7 consonants and 5 vowels each word has 6 + 4 + 6 = 16 such neighbours,
# so 245 * 16 / 2 = 1960 pairs are expected.
minimal_pairs = find_minimal_pairs(generate_unique_words())
len(minimal_pairs)

1960

In [32]:
# Inspect the pairs: each entry is ((word_a, word_b), index of the differing letter).
minimal_pairs


[(('faf', 'fak'), 2),
 (('faf', 'fam'), 2),
 (('faf', 'fan'), 2),
 (('faf', 'fap'), 2),
 (('faf', 'fas'), 2),
 (('faf', 'fat'), 2),
 (('faf', 'fef'), 1),
 (('faf', 'fif'), 1),
 (('faf', 'fof'), 1),
 (('faf', 'fuf'), 1),
 (('faf', 'kaf'), 0),
 (('faf', 'maf'), 0),
 (('faf', 'naf'), 0),
 (('faf', 'paf'), 0),
 (('faf', 'saf'), 0),
 (('faf', 'taf'), 0),
 (('fak', 'fam'), 2),
 (('fak', 'fan'), 2),
 (('fak', 'fap'), 2),
 (('fak', 'fas'), 2),
 (('fak', 'fat'), 2),
 (('fak', 'fek'), 1),
 (('fak', 'fik'), 1),
 (('fak', 'fok'), 1),
 (('fak', 'fuk'), 1),
 (('fak', 'kak'), 0),
 (('fak', 'mak'), 0),
 (('fak', 'nak'), 0),
 (('fak', 'pak'), 0),
 (('fak', 'sak'), 0),
 (('fak', 'tak'), 0),
 (('fam', 'fan'), 2),
 (('fam', 'fap'), 2),
 (('fam', 'fas'), 2),
 (('fam', 'fat'), 2),
 (('fam', 'fem'), 1),
 (('fam', 'fim'), 1),
 (('fam', 'fom'), 1),
 (('fam', 'fum'), 1),
 (('fam', 'kam'), 0),
 (('fam', 'mam'), 0),
 (('fam', 'nam'), 0),
 (('fam', 'pam'), 0),
 (('fam', 'sam'), 0),
 (('fam', 'tam'), 0),
 (('fan', 