-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
levenshtein.py
153 lines (126 loc) · 5.23 KB
/
levenshtein.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2018 Vit Novotny <witiko@mail.muni.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
This module provides a namespace for functions that use the Levenshtein distance.
"""
from itertools import islice
import logging
from math import floor
from gensim.similarities.termsim import TermSimilarityIndex
logger = logging.getLogger(__name__)
def levdist(t1, t2, max_distance=float("inf")):
"""Get the Levenshtein distance between two terms.
Return the Levenshtein distance between two terms. The distance is a
number between <1.0, inf>, higher is less similar.
Parameters
----------
t1 : {bytes, str, unicode}
The first compared term.
t2 : {bytes, str, unicode}
The second compared term.
max_distance : {int, float}, optional
If you don't care about distances larger than a known threshold, a more
efficient code path can be taken. For terms that are clearly "too far
apart", we will not compute the distance exactly, but we will return
`max(len(t1), len(t2))` more quickly, meaning "more than
`max_distance`".
Default: always compute distance exactly, no threshold clipping.
Returns
-------
int
The Levenshtein distance between `t1` and `t2`.
"""
import Levenshtein
distance = Levenshtein.distance(t1, t2)
if distance > max_distance:
return max(len(t1), len(t2))
return distance
def levsim(t1, t2, alpha=1.8, beta=5.0, min_similarity=0.0):
"""Get the Levenshtein similarity between two terms.
Return the Levenshtein similarity between two terms. The similarity is a
number between <0.0, 1.0>, higher is more similar.
Parameters
----------
t1 : {bytes, str, unicode}
The first compared term.
t2 : {bytes, str, unicode}
The second compared term.
alpha : float, optional
The multiplicative factor alpha defined by Charlet and Damnati (2017).
beta : float, optional
The exponential factor beta defined by Charlet and Damnati (2017).
min_similarity : {int, float}, optional
If you don't care about similarities smaller than a known threshold, a
more efficient code path can be taken. For terms that are clearly "too
far apart", we will not compute the distance exactly, but we will
return zero more quickly, meaning "less than `min_similarity`".
Default: always compute similarity exactly, no threshold clipping.
Returns
-------
float
The Levenshtein similarity between `t1` and `t2`.
Notes
-----
This notion of Levenshtein similarity was first defined in section 2.2 of
`Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3:
Soft-Cosine Semantic Similarity between Questions for Community Question
Answering", 2017 <http://www.aclweb.org/anthology/S/S17/S17-2051.pdf>`_.
"""
assert alpha >= 0
assert beta >= 0
max_lengths = max(len(t1), len(t2))
if max_lengths == 0:
return 1.0
min_similarity = float(max(min(min_similarity, 1.0), 0.0))
max_distance = int(floor(max_lengths * (1 - (min_similarity / alpha) ** (1 / beta))))
distance = levdist(t1, t2, max_distance)
similarity = alpha * (1 - distance * 1.0 / max_lengths)**beta
return similarity
class LevenshteinSimilarityIndex(TermSimilarityIndex):
"""
Computes Levenshtein similarities between terms and retrieves most similar
terms for a given term.
Notes
-----
This is a naive implementation that iteratively computes pointwise Levenshtein similarities
between individual terms. Using this implementation to compute the similarity of all terms in
real-world dictionaries such as the English Wikipedia will take years.
Parameters
----------
dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
A dictionary that specifies the considered terms.
alpha : float, optional
The multiplicative factor alpha defined by Charlet and Damnati (2017).
beta : float, optional
The exponential factor beta defined by Charlet and Damnati (2017).
threshold : float, optional
Only terms more similar than `threshold` are considered when retrieving
the most similar terms for a given term.
See Also
--------
:func:`gensim.similarities.levenshtein.levsim`
The Levenshtein similarity.
:class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
Build a term similarity matrix and compute the Soft Cosine Measure.
"""
def __init__(self, dictionary, alpha=1.8, beta=5.0, threshold=0.0):
self.dictionary = dictionary
self.alpha = alpha
self.beta = beta
self.threshold = threshold
super(LevenshteinSimilarityIndex, self).__init__()
def most_similar(self, t1, topn=10):
similarities = (
(levsim(t1, t2, self.alpha, self.beta, self.threshold), t2)
for t2 in self.dictionary.values()
if t1 != t2
)
most_similar = (
(t2, similarity)
for (similarity, t2) in sorted(similarities, reverse=True)
if similarity > 0
)
return islice(most_similar, topn)