This notebook compares how accurately fuzzy string matching and RoBERTa sentence-transformer semantic matching retrieve the correct head entity for a question.



Before running, go to Runtime => Change Runtime type to GPU.


In [None]:
# Mount Google Drive at /content/gdrive; force_remount=True requests a fresh mount even if one already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
%cd gdrive/MyDrive/LibKGE Test/kge/

/content/gdrive/.shortcut-targets-by-id/12lK9g6Ccl-njCvAuL28xCjV1XOyHp9bU/LibKGE Test/kge


In [None]:
! pip install transformers
! pip install path
! pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 3.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [None]:
import os

import numpy as np
import spacy
import torch
from sentence_transformers import SentenceTransformer, util

from kge.model.kge_model import KgeModel
from kge.util.io import load_checkpoint

In [None]:
#generate a dictionary of all of the entities in our knowledge graph.

def generate_choices_dict(del_data_file_path):
  """Build a lookup from lowercased entity names to the stored entity names.

  The file is read line by line; the second tab-separated column of each
  non-empty line is taken as the entity name.

  Args:
    del_data_file_path: path to the tab-separated entity file.

  Returns:
    dict mapping the lowercased name to the name as read from the file. When
    the membership test below reports a clash, a "." is appended to both the
    key and the value so the earlier entry is not overwritten.

  Raises:
    IndexError: if a non-empty line contains no tab character.
  """
  choices = {}
  # The context manager closes the file handle once reading is done.
  with open(del_data_file_path, 'r') as del_file:
    for line in del_file:
      # Skip empty lines (for example a stray newline at the end of the file).
      if not line.rstrip("\n"):
        continue
      # Remove only the line terminator, so a final line that has no trailing
      # newline keeps its last character.
      entity_name = line.split("\t")[1].rstrip("\n")
      # NOTE(review): the original-case name is tested against lowercased keys,
      # so a clash is only detected when the name is itself already lowercase.
      # Left unchanged on purpose: altering which entries get created would
      # change list(choices.values()), which may no longer line up with
      # embeddings cached from an earlier run - confirm before changing.
      if entity_name not in choices:
        choices[str.lower(entity_name)] = entity_name
      else:
        choices[str.lower(entity_name) + "."] = entity_name + "."

  return choices

In [None]:
#half of the knowledge graph
# Lowercased-name -> stored-name lookup built from the "half" entity file.
half_choices_dict = generate_choices_dict("data/fb_natural_language_data_half/entity_ids.del")
# Stored entity names only, in the dict's insertion order.
half_choices_list = list(half_choices_dict.values())

In [None]:
#full knowledge graph
# Lowercased-name -> stored-name lookup built from the "full" entity file.
full_choices_dict = generate_choices_dict("data/fb_natural_language_data_full/entity_ids.del")
# Stored entity names only, in the dict's insertion order.
# NOTE(review): presumably the list that full_choices_embeddings.pt was encoded from - confirm.
full_choices_list = list(full_choices_dict.values())

Roberta Sentence Transformer for Semantic Similarity Matching (Method #1):

In [None]:
#run this the first time to generate the roberta embeddings; when full_choices_embeddings.pt already exists it is simply loaded in the next cell.
# Guarded so that Restart & Run All neither recomputes nor overwrites an existing cache file.
if not os.path.exists('full_choices_embeddings.pt'):
  # The model is created here because this cell runs before the cell that normally initializes it.
  robertamodel = SentenceTransformer('stsb-roberta-large')
  # Encode full_choices_list so that row i of the saved tensor corresponds to full_choices_list[i].
  torch.save(robertamodel.encode(full_choices_list, convert_to_tensor=True), 'full_choices_embeddings.pt')

In [None]:
#load in the roberta embeddings for all of the entities in our full knowledge graph
# map_location places the loaded tensor on the CPU regardless of the device it was saved from.
full_choices_embed = torch.load('full_choices_embeddings.pt', map_location=torch.device('cpu'))

In [None]:
#load in the roberta embeddings of all the entities in our half knowledge graph
# map_location places the loaded tensor on the CPU regardless of the device it was saved from.
# NOTE(review): choices_embed is only displayed here; no later cell visible in this notebook reads it.
choices_embed = torch.load('choices_embeddings.pt', map_location=torch.device('cpu'))
choices_embed

tensor([[-0.7838, -0.5647,  0.2499,  ..., -0.4816,  0.0774,  0.4452],
        [ 0.9299, -0.0585, -0.1883,  ..., -0.3562, -0.8165, -0.5405],
        [ 0.1881, -0.3611, -0.6133,  ..., -0.1892, -0.2793,  0.7399],
        ...,
        [ 0.9075, -1.7412, -0.0369,  ..., -0.9934,  0.4677, -0.2266],
        [-0.3334,  0.0546, -0.1445,  ..., -0.9971, -0.0110,  0.2426],
        [ 1.0035, -0.0526,  0.3036,  ...,  0.1418, -1.1209,  0.3736]])

In [None]:
#initialize the roberta model sentence transformer
# Module-level model instance; roberta_closest_strings reads this global to encode the query string.
robertamodel = SentenceTransformer('stsb-roberta-large')

In [None]:
#returns the closest k strings based on roberta similarity matching
def roberta_closest_strings(target_string, top_k, choices_list, choices_embeddings):
    """Return the candidates most similar to ``target_string`` by cosine similarity.

    Args:
        target_string: text to encode with the global ``robertamodel``.
        top_k: maximum number of candidates to return.
        choices_list: candidate strings; ``choices_list[i]`` must correspond to
            row ``i`` of ``choices_embeddings``.
        choices_embeddings: embeddings of the candidates, compared on the CPU.

    Returns:
        dict mapping candidate string -> cosine similarity (float), inserted in
        descending order of similarity. Candidates that share the same string
        collapse into a single key, so fewer than ``top_k`` entries may come back.
    """
    string_embedding = robertamodel.encode(target_string, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(string_embedding.to("cpu"), choices_embeddings)[0]

    # Never ask for more results than there are candidates; a non-positive
    # request simply yields no matches.
    k = min(top_k, cos_scores.shape[0])
    if k <= 0:
        return {}

    # torch.topk returns the k largest scores already sorted in descending
    # order, without a round trip through NumPy.
    top_scores, top_indices = torch.topk(cos_scores, k=k)

    closest_k_strings = dict()
    for idx, score in zip(top_indices.tolist(), top_scores.tolist()):
        closest_k_strings[choices_list[idx]] = score

    return closest_k_strings

In [None]:
#example usage
# full_choices_list is the name list built from the full knowledge graph; it is passed together with
# the full-graph embeddings so that indices into one refer to the same entity in the other.
roberta_closest_strings("japon", 10, full_choices_list, full_choices_embed)

{'Birth of Japan': 0.7780697345733643,
 'JYP Nation': 0.8007750511169434,
 'Japan Center': 0.8152464628219604,
 'Japan Japan': 0.8804333806037903,
 'Japan Town': 0.8063554763793945,
 'Japan Yen': 0.8202947378158569,
 'Japanese Empire': 0.7857764363288879,
 'Japanimation': 0.8316668272018433,
 'Japón': 0.7905580997467041,
 'My Japan': 0.8157517910003662}

Fuzzy Matching for Pattern Similarity Matching (Method #2):

In [None]:
! pip install rapidfuzz
from rapidfuzz import process, fuzz

def fuzz_get_closest_strings(string, n, choices_dict=None):
  """Return the stored names of the ``n`` entities that best match ``string``.

  Matching uses ``fuzz.partial_ratio`` against the keys of ``choices_dict``.

  Args:
    string: query text.
    n: maximum number of matches to return.
    choices_dict: lookup of match key -> stored entity name. Defaults to
      ``full_choices_dict``.

  Returns:
    list of the values of ``choices_dict`` for the best-matching keys, in the
    order ``process.extract`` returns them.
  """
  # NOTE(review): this function used to read a global named `choices` that no
  # cell in this notebook defines. full_choices_dict is used as the default so
  # the fuzzy matcher searches the same entity set as the RoBERTa matcher -
  # confirm this is the set that produced the recorded scores.
  if choices_dict is None:
    choices_dict = full_choices_dict
  matches = process.extract(string, choices_dict.keys(), scorer=fuzz.partial_ratio, limit=n)
  # Each match starts with the matched key; map it back to the stored name.
  return [choices_dict[match[0]] for match in matches]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rapidfuzz
  Downloading rapidfuzz-2.0.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 5.1 MB/s 
[?25hCollecting jarowinkler<1.1.0,>=1.0.2
  Downloading jarowinkler-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (103 kB)
[K     |████████████████████████████████| 103 kB 56.9 MB/s 
[?25hInstalling collected packages: jarowinkler, rapidfuzz
Successfully installed jarowinkler-1.0.2 rapidfuzz-2.0.11


In [None]:
#question: question you want to extract entities from
#k_closest_per_entity: the number of closest entities from the KG you want per extracted entity from question

import spacy
# Loaded spaCy pipelines, keyed by model name, so the model is read from disk
# once instead of on every question.
_SPACY_PIPELINE_CACHE = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINE_CACHE:
        _SPACY_PIPELINE_CACHE[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINE_CACHE[model_name]


def get_head_entities_fuzz(question, k_closest_per_entity):
    """Return fuzzy-matched candidates for every named entity in ``question``.

    Args:
        question: question text to run named-entity recognition on.
        k_closest_per_entity: number of candidates requested per recognized entity.

    Returns:
        list of candidate names from ``fuzz_get_closest_strings``, concatenated
        over all recognized entities (empty when none is recognized).
    """
    head_entities = []
    for entity in ner(question):
        head_entities.extend(fuzz_get_closest_strings(str(entity), k_closest_per_entity))
    return head_entities


def get_head_entities_roberta(question, k_closest_per_entity):
    """Return RoBERTa-matched candidates for every named entity in ``question``.

    Args:
        question: question text to run named-entity recognition on.
        k_closest_per_entity: number of candidates requested per recognized entity.

    Returns:
        list of candidate names concatenated over all recognized entities
        (empty when none is recognized).
    """
    head_entities = []
    for entity in ner(question):
        # full_choices_list is passed together with full_choices_embed so that
        # indices into the embeddings refer to names in the same list.
        closest = roberta_closest_strings(
            str(entity), k_closest_per_entity, full_choices_list, full_choices_embed
        )
        # roberta_closest_strings returns {name: score}; only the names are kept.
        head_entities.extend(closest.keys())
    return head_entities


def ner(question):
    """Return the named-entity spans (``doc.ents``) spaCy finds in ``question``."""
    doc = _get_spacy_pipeline()(question)
    return doc.ents

In [None]:
#load in test data 
# One list element per line of the file; each element keeps its trailing newline.
with open('qa_test_nl.txt') as f:  
  qa_test_lines = f.readlines()

In [None]:
#count the proportion where the correct head entity in the test data is in the list of k similar words returned by the roberta or fuzz closest strings.
def _head_entity_hit_rate(qa_test_lines, top_k, get_head_entities):
  """Share of labelled questions whose head entity is among the candidates.

  Each non-blank line holds the question text followed by the correct head
  entity in square brackets; the text before the first " [" is the question
  and the text between the first "[" and the next "]" is the label.

  Args:
    qa_test_lines: iterable of test lines.
    top_k: number of candidates requested per recognized entity.
    get_head_entities: callable ``(question, top_k) -> collection of names``.

  Returns:
    hits / number of non-blank lines, or 0.0 when there are no such lines.

  Raises:
    ValueError: if a non-blank line has no "[" marking the head entity.
  """
  total = 0
  hits = 0
  for line_number, line in enumerate(qa_test_lines, start=1):
    # Blank lines carry no question; skip them instead of failing on the split.
    if not line.strip():
      continue
    if '[' not in line:
      raise ValueError(
          "test line %d has no '[head entity]' annotation: %r" % (line_number, line))
    question = line.split(' [')[0]
    correct_head_entity = line.split('[', 1)[1].split(']')[0]
    total += 1
    if correct_head_entity in get_head_entities(question, top_k):
      hits += 1
  # Avoid dividing by zero when there was nothing to evaluate.
  return hits / total if total else 0.0


def count_correct_roberta(qa_test_lines, top_k):
  """Hit rate of the RoBERTa matcher (``get_head_entities_roberta``) on the test lines."""
  # The lambda defers the lookup of the matcher until a question is actually evaluated.
  return _head_entity_hit_rate(
      qa_test_lines, top_k, lambda question, k: get_head_entities_roberta(question, k))


def count_correct_fuzz(qa_test_lines, top_k):
  """Hit rate of the fuzzy matcher (``get_head_entities_fuzz``) on the test lines."""
  # The lambda defers the lookup of the matcher until a question is actually evaluated.
  return _head_entity_hit_rate(
      qa_test_lines, top_k, lambda question, k: get_head_entities_fuzz(question, k))

In [None]:
# Proportion of test questions whose labelled head entity is among the RoBERTa candidates
# (10 candidates requested per entity recognized in the question).
count_correct_roberta(qa_test_lines, 10)

0.3493150684931507

In [None]:
# Proportion of test questions whose labelled head entity is among the fuzzy-matching candidates
# (10 candidates requested per entity recognized in the question).
count_correct_fuzz(qa_test_lines, 10)

0.363013698630137

The two methods perform similarly: the correct head entity appears in the returned candidate list for about 36.3% of the test questions with fuzzy matching and about 34.9% with RoBERTa matching.

Testing fuzz.ratio vs fuzz.partial_ratio for the fuzzy matching scoring parameter:

In [None]:
! pip install rapidfuzz
from rapidfuzz import process, fuzz

def fuzz_get_closest_strings(string, n, choices_dict=None):
  """Return the stored names of the ``n`` entities that best match ``string``.

  Matching uses ``fuzz.ratio`` against the keys of ``choices_dict``.

  Args:
    string: query text.
    n: maximum number of matches to return.
    choices_dict: lookup of match key -> stored entity name. Defaults to
      ``full_choices_dict``.

  Returns:
    list of the values of ``choices_dict`` for the best-matching keys, in the
    order ``process.extract`` returns them.
  """
  # NOTE(review): this function used to read a global named `choices` that no
  # cell in this notebook defines. full_choices_dict is used as the default so
  # the fuzzy matcher searches the same entity set as the RoBERTa matcher -
  # confirm this is the set that produced the recorded scores.
  if choices_dict is None:
    choices_dict = full_choices_dict
  matches = process.extract(string, choices_dict.keys(), scorer=fuzz.ratio, limit=n)
  # Each match starts with the matched key; map it back to the stored name.
  return [choices_dict[match[0]] for match in matches]

def fuzz_get_closest_strings_partial(string, n, choices_dict=None):
  """Return the stored names of the ``n`` entities that best match ``string``.

  Matching uses ``fuzz.partial_ratio`` against the keys of ``choices_dict``.

  Args:
    string: query text.
    n: maximum number of matches to return.
    choices_dict: lookup of match key -> stored entity name. Defaults to
      ``full_choices_dict``.

  Returns:
    list of the values of ``choices_dict`` for the best-matching keys, in the
    order ``process.extract`` returns them.
  """
  # NOTE(review): this function used to read a global named `choices` that no
  # cell in this notebook defines. full_choices_dict is used as the default so
  # the fuzzy matcher searches the same entity set as the RoBERTa matcher -
  # confirm this is the set that produced the recorded scores.
  if choices_dict is None:
    choices_dict = full_choices_dict
  matches = process.extract(string, choices_dict.keys(), scorer=fuzz.partial_ratio, limit=n)
  # Each match starts with the matched key; map it back to the stored name.
  return [choices_dict[match[0]] for match in matches]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rapidfuzz
  Downloading rapidfuzz-2.0.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 4.9 MB/s 
[?25hCollecting jarowinkler<1.1.0,>=1.0.2
  Downloading jarowinkler-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (103 kB)
[K     |████████████████████████████████| 103 kB 74.5 MB/s 
[?25hInstalling collected packages: jarowinkler, rapidfuzz
Successfully installed jarowinkler-1.0.2 rapidfuzz-2.0.11


In [None]:
#question: question you want to extract entities from
#k_closest_per_entity: the number of closest entities from the KG you want per extracted entity from question

import spacy

# Loaded spaCy pipelines, keyed by model name, so the model is read from disk
# once instead of on every question.
_SPACY_PIPELINE_CACHE = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINE_CACHE:
        _SPACY_PIPELINE_CACHE[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINE_CACHE[model_name]


def get_head_entities_fuzz(question, k_closest_per_entity):
    """Return ``fuzz_get_closest_strings`` candidates for every named entity in ``question``.

    Args:
        question: question text to run named-entity recognition on.
        k_closest_per_entity: number of candidates requested per recognized entity.

    Returns:
        list of candidate names concatenated over all recognized entities
        (empty when none is recognized).
    """
    head_entities = []
    for entity in ner(question):
        head_entities.extend(fuzz_get_closest_strings(str(entity), k_closest_per_entity))
    return head_entities


def get_head_entities_fuzz_partial(question, k_closest_per_entity):
    """Return ``fuzz_get_closest_strings_partial`` candidates for every named entity in ``question``.

    Args:
        question: question text to run named-entity recognition on.
        k_closest_per_entity: number of candidates requested per recognized entity.

    Returns:
        list of candidate names concatenated over all recognized entities
        (empty when none is recognized).
    """
    head_entities = []
    for entity in ner(question):
        head_entities.extend(fuzz_get_closest_strings_partial(str(entity), k_closest_per_entity))
    return head_entities


def ner(question):
    """Return the named-entity spans (``doc.ents``) spaCy finds in ``question``."""
    doc = _get_spacy_pipeline()(question)
    return doc.ents

In [None]:
#load in test data 
# One list element per line of the file; each element keeps its trailing newline.
with open('qa_test_nl.txt') as f:  
  qa_test_lines = f.readlines()

In [None]:
def _head_entity_hit_rate(qa_test_lines, top_k, get_head_entities):
  """Share of labelled questions whose head entity is among the candidates.

  Each non-blank line holds the question text followed by the correct head
  entity in square brackets; the text before the first " [" is the question
  and the text between the first "[" and the next "]" is the label.

  Args:
    qa_test_lines: iterable of test lines.
    top_k: number of candidates requested per recognized entity.
    get_head_entities: callable ``(question, top_k) -> collection of names``.

  Returns:
    hits / number of non-blank lines, or 0.0 when there are no such lines.

  Raises:
    ValueError: if a non-blank line has no "[" marking the head entity.
  """
  total = 0
  hits = 0
  for line_number, line in enumerate(qa_test_lines, start=1):
    # Blank lines carry no question; skip them instead of failing on the split.
    if not line.strip():
      continue
    if '[' not in line:
      raise ValueError(
          "test line %d has no '[head entity]' annotation: %r" % (line_number, line))
    question = line.split(' [')[0]
    correct_head_entity = line.split('[', 1)[1].split(']')[0]
    total += 1
    if correct_head_entity in get_head_entities(question, top_k):
      hits += 1
  # Avoid dividing by zero when there was nothing to evaluate.
  return hits / total if total else 0.0


def count_correct_fuzz(qa_test_lines, top_k):
  """Hit rate of ``get_head_entities_fuzz`` on the test lines."""
  # The lambda defers the lookup of the matcher until a question is actually evaluated.
  return _head_entity_hit_rate(
      qa_test_lines, top_k, lambda question, k: get_head_entities_fuzz(question, k))


def count_correct_fuzz_partial(qa_test_lines, top_k):
  """Hit rate of ``get_head_entities_fuzz_partial`` on the test lines."""
  # The lambda defers the lookup of the matcher until a question is actually evaluated.
  return _head_entity_hit_rate(
      qa_test_lines, top_k, lambda question, k: get_head_entities_fuzz_partial(question, k))

In [None]:
# Hit rate of the fuzzy matcher as redefined in this section, i.e. scored with fuzz.ratio
# (10 candidates requested per entity recognized in the question).
count_correct_fuzz(qa_test_lines, 10)

0.22602739726027396

In [None]:
# Hit rate of the fuzzy matcher scored with fuzz.partial_ratio
# (10 candidates requested per entity recognized in the question).
count_correct_fuzz_partial(qa_test_lines, 10)

0.363013698630137

Scoring with fuzz.partial_ratio retrieves the correct head entity more often than scoring with fuzz.ratio (about 36.3% versus 22.6% of the test questions).