In [1]:
from nltk.corpus import wordnet
# Show every WordNet sense of "rain" as "<synset name> <definition>", one per line.
for syn in wordnet.synsets('rain'):
    print('{} {}'.format(syn.name(), syn.definition()))

rain.n.01 water falling in drops from vapor condensed in the atmosphere
rain.n.02 drops of fresh water that fall as precipitation from clouds
rain.n.03 anything happening rapidly or in quick successive
rain.v.01 precipitate as rain


In [3]:
import itertools

# Maps a word to the list of synsets WordNet returned for it, so that each word
# is looked up in WordNet at most once per kernel session.
SYN_CACHE = {}


def _cached_synsets(word):
    """Return the synsets of ``word``, querying WordNet only on a cache miss."""
    try:
        return SYN_CACHE[word]
    except KeyError:
        synsets = SYN_CACHE[word] = wordnet.synsets(word)
        return synsets


def word_similarity(word1, word2, function=max):
    """Pick the best-scoring pair of senses for two words.

    Every synset of ``word1`` is compared with every synset of ``word2`` using
    ``Synset.wup_similarity``; ``function`` decides which score wins.

    Parameters
    ----------
    word1, word2 : str
        The words to compare. Their synsets are cached in ``SYN_CACHE``.
    function : callable, optional
        Two-argument selector such as ``max`` (default, most similar pair) or
        ``min`` (least similar pair). A candidate score replaces the current
        best when ``function(candidate, best) == candidate``, so on ties the
        pair seen later wins.

    Returns
    -------
    tuple
        ``(similarity, name1, name2)`` where the names are the synset names of
        the selected pair. When no pair has a defined similarity (either word
        has no synsets, or every comparison returned ``None``) the result is
        ``(0, None, None)``.
    """
    best = None
    name1 = name2 = None
    pairs = itertools.product(_cached_synsets(word1), _cached_synsets(word2))
    for syn1, syn2 in pairs:
        similarity = syn1.wup_similarity(syn2)
        if similarity is None:
            # No score for this pair: never let it contribute names, otherwise
            # a score of 0 would be reported for an arbitrary pair.
            continue
        if best is None or function(similarity, best) == similarity:
            best = similarity
            name1 = syn1.name()
            name2 = syn2.name()
    return (0 if best is None else best), name1, name2
    
# Least similar pair of senses for "rain" and "frost" (function=min), shown as
# "<score> ; <definition of sense 1> ; <definition of sense 2>".
sim, name1, name2 = word_similarity('rain', 'frost', function=min)
print(
    sim,
    wordnet.synset(name1).definition(),
    wordnet.synset(name2).definition(),
    sep=" ; ",
)

0.1111111111111111 ; anything happening rapidly or in quick successive ; weather cold enough to cause freezing


In [4]:
# word_similarity stores each word's synsets in SYN_CACHE on first use, so across
# the repeated %timeit loops this mostly measures the pairwise wup_similarity
# comparisons rather than the WordNet synset lookups.
%timeit word_similarity('demolish', 'demolition')

760 µs ± 58.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [7]:
# The Heisig method assigns keywords to Kanji, and groups Kanji into lessons of increasing difficulty.
import io
import csv
import gzip

def tweak(kanji):
    """Return a plain ``dict`` copy of a row with numeric strings turned into ints.

    Parameters
    ----------
    kanji : mapping
        One row, e.g. as produced by ``csv.DictReader``.

    Returns
    -------
    dict
        A new dict with the same keys in the same order. String values made up
        entirely of decimal digits are converted with ``int``; every other
        value is passed through unchanged. The input mapping is not modified.
    """
    def convert(value):
        # isdecimal() accepts exactly the digit characters int() can parse,
        # whereas isdigit() also accepts e.g. superscript or circled digits,
        # on which int() raises ValueError.
        # Non-string values (csv.DictReader yields None for missing fields and
        # a list for surplus ones) are left untouched instead of crashing.
        if isinstance(value, str) and value.isdecimal():
            return int(value)
        return value

    return {key: convert(value) for key, value in kanji.items()}

# Load the gzip-compressed, colon-delimited Heisig table into a list of dicts.
with gzip.GzipFile('heisig-data.txt.gz') as fin:
    # The first line is discarded; csv.DictReader then takes its field names
    # from the next line. (Presumably the skipped line is a preamble/comment --
    # verify against the data file.)
    fin.readline()
    reader = csv.DictReader(
        io.StringIO(fin.read().decode('utf-8')),
        delimiter=':',
    )
    # One tweak()-processed dict per remaining row.
    heisig = list(map(tweak, reader))

In [8]:
# Keywords of the first 100 entries of the Heisig table.
heisig_keywords = [entry['keyword4th-ed'] for entry in heisig[:100]]
# The value of this expression is discarded: a cell only displays its last expression.
len(heisig_keywords)
# (keyword1, keyword2, (similarity, synset name 1, synset name 2)) for every
# unordered pair of keywords.
heisig_sim = [
    pair + (word_similarity(*pair),)
    for pair in itertools.combinations(heisig_keywords, 2)
]
# Ten highest-scoring pairs; ties keep their original relative order.
sorted(heisig_sim, key=lambda item: item[2][0], reverse=True)[:10]

[('sword', 'blade', (1.0, 'sword.n.01', 'sword.n.01')),
 ('round', 'circle', (1.0, 'circle.n.08', 'circle.n.08')),
 ('villain', 'part', (0.96, 'villain.n.02', 'character.n.04')),
 ('round', 'part', (0.9411764705882353, 'turn.n.09', 'part.n.09')),
 ('beginning', 'part', (0.9411764705882353, 'beginning.n.03', 'part.n.09')),
 ('part', 'cut', (0.9230769230769231, 'share.n.01', 'cut.n.01')),
 ('seven', 'nine', (0.9090909090909091, 'seven-spot.n.01', 'nine-spot.n.01')),
 ('seven',
  'eight',
  (0.9090909090909091, 'seven-spot.n.01', 'eight-spot.n.01')),
 ('seven', 'ten', (0.9090909090909091, 'seven-spot.n.01', 'ten-spot.n.01')),
 ('seven', 'five', (0.9090909090909091, 'seven-spot.n.01', 'five-spot.n.03'))]

In [9]:
# Same ranking by similarity score as in the previous cell, extended to the top 100 pairs.
sorted(heisig_sim, key=lambda item: item[2][0], reverse=True)[:100]

[('sword', 'blade', (1.0, 'sword.n.01', 'sword.n.01')),
 ('round', 'circle', (1.0, 'circle.n.08', 'circle.n.08')),
 ('villain', 'part', (0.96, 'villain.n.02', 'character.n.04')),
 ('round', 'part', (0.9411764705882353, 'turn.n.09', 'part.n.09')),
 ('beginning', 'part', (0.9411764705882353, 'beginning.n.03', 'part.n.09')),
 ('part', 'cut', (0.9230769230769231, 'share.n.01', 'cut.n.01')),
 ('seven', 'nine', (0.9090909090909091, 'seven-spot.n.01', 'nine-spot.n.01')),
 ('seven',
  'eight',
  (0.9090909090909091, 'seven-spot.n.01', 'eight-spot.n.01')),
 ('seven', 'ten', (0.9090909090909091, 'seven-spot.n.01', 'ten-spot.n.01')),
 ('seven', 'five', (0.9090909090909091, 'seven-spot.n.01', 'five-spot.n.03')),
 ('seven', 'six', (0.9090909090909091, 'seven-spot.n.01', 'six-spot.n.01')),
 ('nine', 'eight', (0.9090909090909091, 'nine-spot.n.01', 'eight-spot.n.01')),
 ('nine', 'ten', (0.9090909090909091, 'nine-spot.n.01', 'ten-spot.n.01')),
 ('nine', 'five', (0.9090909090909091, 'nine-spot.n.01', 'f