In [2]:
import requests
import json
import pickle
import editdistance
import re
import numpy as np 
from fastDamerauLevenshtein import damerauLevenshtein

Before measuring pairwise similarity across the list of entities, I exclude every entity whose name consists of a single word.

In [3]:
# Load the raw entity names. The context manager closes the file handle as soon
# as unpickling finishes instead of leaving it open for the kernel's lifetime.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# "all_indivs.pickle" if it comes from a trusted source.
with open("all_indivs.pickle", "rb") as pickle_file:
    entities_names = pickle.load(pickle_file)

In [7]:
# Keep only entities whose raw name has more than one whitespace-separated word.
# Each kept name is rebuilt character by character: characters that are neither
# alphanumeric nor a plain space are dropped, the rest are lower-cased.
# NOTE(review): the word-count test runs on the raw name, so a name such as
# "foo -" passes the filter yet cleans down to a single word; confirm intended.
entities_alphnum_only = [
    "".join(map(str.lower, filter(lambda ch: ch.isalnum() or ch == " ", raw_name)))
    for raw_name in entities_names
    if 1 < len(raw_name.split())
]

In [4]:
# Number of entities that survived the multi-word filter and character clean-up.
len(entities_alphnum_only)

32169

In [43]:
def cal_edit_distance(words_1, words_2):
  """
  Greedily pairs the words of two strings and sums the pairs' edit distances.

  Each word of `words_1` (taken left to right) is matched with the closest
  word of `words_2` that has not been matched yet; on a tie the earliest
  remaining word wins. Matching stops as soon as either string runs out of
  words, so surplus words on either side add nothing to the sum. Because the
  matching is greedy and driven by `words_1`, the result is not guaranteed
  to be symmetric in its arguments.

  Args:
      words_1: whitespace-separated string whose words drive the matching.
      words_2: whitespace-separated string whose words are matched against.

  Returns:
    The sum of the matched pairs' edit distances as a plain Python int
    (0 when either string contains no words).
  """

  query_words = words_1.split()
  # Matched words are removed from this list so each one is used at most once.
  remaining_targets = words_2.split()

  min_edit_distance_sum = 0
  for query_word in query_words:
    # Checking before matching also covers a `words_2` with no words at all,
    # which previously crashed when taking the argmin of an empty array.
    if not remaining_targets:
      break

    distances = [editdistance.eval(query_word, w) for w in remaining_targets]

    # min() over indices returns the first smallest entry, the same tie-break
    # as np.argmin, without building an array for every query word.
    best_idx = min(range(len(distances)), key=distances.__getitem__)
    min_edit_distance_sum += distances[best_idx]
    remaining_targets.pop(best_idx)

  return min_edit_distance_sum

In [44]:
def compare_strings_pairwise(string_list):
  """
  Builds the matrix of word-level edit-distance sums for every ordered pair of strings.

  Args:
      string_list: A list of strings to be compared.

  Returns:
      A square NumPy array of dtype int16 with one row and one column per
      string. Entry [i, j] is cal_edit_distance(string_list[i], string_list[j])
      for i != j; the diagonal stays 0.
  """

  count = len(string_list)
  distances = np.zeros((count, count), dtype=np.int16)

  for row in range(count):
    for col in range(count):
      # The diagonal keeps the zero it was initialised with.
      if row == col:
        continue
      distances[row, col] = cal_edit_distance(string_list[row], string_list[col])

  return distances

In [45]:
# Smoke test on four entities (indices 1-4) before the full run.
# The matrix is not symmetric (e.g. 16 vs 15 in the output below): the greedy
# matching in cal_edit_distance depends on which string supplies the query words.
compare_strings_pairwise(entities_alphnum_only[1:5])

array([[ 0, 16, 14, 13],
       [15,  0, 10, 11],
       [14, 10,  0,  8],
       [19, 17, 16,  0]], dtype=int16)

In [47]:
# Full pairwise run over every cleaned entity. compare_strings_pairwise fills an
# n x n int16 matrix with one cal_edit_distance call per off-diagonal cell, so
# both runtime and memory grow quadratically with len(entities_alphnum_only).
# With the 32169 entities reported above that is about 1.03e9 cells (~2 GB),
# assuming that count is still current; confirm before re-running.
all_edit_dist = compare_strings_pairwise(entities_alphnum_only)

In [48]:
# Display the full distance matrix (NumPy abbreviates large arrays with "...").
all_edit_dist

array([[ 0, 14, 12, ...,  8, 13, 11],
       [14,  0, 16, ..., 16, 12, 13],
       [11, 15,  0, ..., 11, 12, 11],
       ...,
       [ 8, 16, 11, ...,  0, 11, 11],
       [10, 12, 12, ..., 11,  0,  9],
       [11, 15, 13, ..., 11,  9,  0]], dtype=int16)

In [49]:
# Saving the matrix to disk is currently disabled.
# NOTE(review): .astype(int) widens the int16 matrix to NumPy's default integer
# (typically 64-bit), roughly quadrupling the saved size; consider saving the
# int16 array as-is if this line is re-enabled.
#np.save("edit_distance_result",all_edit_dist.astype(int) )