# Edit Distance for Spelling Correction

In [1]:
# Student identity: full name and NPM (student ID number).
# The NPM is kept as a string here; it is converted with int() where a
# numeric seed is needed.
nama: str = 'Mohamad Arvin Fadriansyah'
npm: str = '2106708311'

In [2]:
# Import library yang diperlukan
import random

from utils import load_json, format_report
from trie_structure.levenshtein_trie import LevenshteinTrie
from trie_structure.damerau_levenshtein_trie import DamerauLevenshteinTrie
from dict_structure.levenshtein_dict import LevenshteinDict
from dict_structure.damerau_levenshtein_dict import DamerauLevenshteinDict
from performance import Performance

# 1. Dataset

In [3]:
# Locations of the two input files, relative to the notebook's working directory:
#   KBBI_PATH - the KBBI (Indonesian dictionary) word list
#   TYPO_PATH - the non-word error (typo) dataset in JSON form
KBBI_PATH: str = 'bahasa-indonesia-dictionary.txt'
TYPO_PATH: str = 'saltik.json'

In [4]:
# Read the full typo dataset from disk.
typo_dataset = load_json(TYPO_PATH)

# Seed the global RNG with the NPM so the drawn sample is reproducible,
# then pick `sample_size` keys and keep only their entries.
random.seed(int(npm))
sample_size = 5
typo_sample_keys = random.sample(list(typo_dataset.keys()), sample_size)
typo_sample_dataset = dict(
    (key, typo_dataset[key]) for key in typo_sample_keys
)


# 2. Algoritma Edit Distance

In [5]:
# Build one model per (algorithm, data structure) combination, all from the
# same dictionary file. Construction order is unchanged: trie-backed first,
# then dict-backed.

# Trie-backed models
lev_trie, dalev_trie = (
    LevenshteinTrie(dict_path=KBBI_PATH),
    DamerauLevenshteinTrie(dict_path=KBBI_PATH),
)

# Dictionary-backed models
lev_dict, dalev_dict = (
    LevenshteinDict(dict_path=KBBI_PATH),
    DamerauLevenshteinDict(dict_path=KBBI_PATH),
)

# 3. Performa

In [6]:
# Reshape the sampled data from a dictionary into a 2-D list in which each row
# is [sample key, list of that key's typo strings].
#
# The rows are derived from `typo_dataset` and `typo_sample_keys`, which are
# never overwritten, instead of from `typo_sample_dataset` itself. This keeps
# the cell idempotent: previously a second run failed with AttributeError,
# because `typo_sample_dataset` had already been rebound to a list and no
# longer had `.items()`. Row order is the order of `typo_sample_keys`, which is
# the same order the sampled dictionary was built in.
typo_sample_dataset_modified = [
    [key, [typo_data['typo'] for typo_data in typo_dataset[key]]]
    for key in typo_sample_keys
]

# Downstream cells read the list form under the name `typo_sample_dataset`,
# so the name is still rebound here.
typo_sample_dataset = typo_sample_dataset_modified
print(typo_sample_dataset)

[['garis', ['uaris', 'tyaris', 'rtgaris', 'agis', 'yris', 'qgris', 'tris', 'gis', 'yaris', 'qagris', 'gtaris', 'argis', 'tgris', 'rgis', 'ayris', 'gris', 'tgais', 'rgaris', 'agris', 'tgaris']], ['karyawan', ['qakryawan', 'uiaryawan', 'iryawan', 'uikaryawan', 'arkyawan', 'ikaryawan', 'qryawan', 'uaryawan', 'aryawan', 'kryawan', 'ikryawan', 'qkryawan', 'kiaryawan', 'rayawan', 'iaryawan', 'akryawan', 'qaryawan', 'ukaryawan', 'airyawan', 'ryawan']], ['evolusi', ['qwvolusi', 'fvolusi', 'olusi', 'wvolusi', 'volusi', 'veolusi', 'eolusi', 'vwolusi', 'voelusi', 'qvolusi', 'wolusi', 'fveolusi', 'feolusi', 'weolusi', 'wevolusi', 'qevolusi', 'folusi', 'qwevolusi', 'ovlusi', 'ewvolusi']], ['ibadah', ['uadah', 'uibadah', 'iadah', 'iubadah', 'yubadah', 'biadah', 'abdah', 'giadah', 'buadah', 'hadah', 'bdah', 'gbiadah', 'yuibadah', 'badah', 'uiadah', 'baidah', 'ybadah', 'yibadah', 'gbadah', 'ubadah']], ['departemen', ['epartemen', 'eeartemen', 'wpartemen', 'deepartemen', 'edepartemen', 'wedepartemen', 

In [7]:
# Wrap each model, together with the sampled typo dataset, in a Performance
# object. Models are wrapped in the same order as before:
# lev_trie, dalev_trie, lev_dict, dalev_dict.
(
    performance_of_lev_trie,
    performance_of_dalev_trie,
    performance_of_lev_dict,
    performance_of_dalev_dict,
) = (
    Performance(model, typo_sample_dataset)
    for model in (lev_trie, dalev_trie, lev_dict, dalev_dict)
)


In [8]:
# Collect the result of calculate_performance() for every model, keyed by the
# model's label, in the form passed to format_report below.
performances = {
    label: evaluator.calculate_performance()
    for label, evaluator in (
        ("lev_trie", performance_of_lev_trie),
        ("dalev_trie", performance_of_dalev_trie),
        ("lev_dict", performance_of_lev_dict),
        ("dalev_dict", performance_of_dalev_dict),
    )
}

In [9]:
# Print the formatted report for the student and all collected model results.
print(
    format_report(nama, npm, performances)
)

Mohamad Arvin Fadriansyah - 2106708311
---------- lev_trie ----------
Best Accuracy: 0.42
Candidate Accuracy: 0.99
Time: 8.76181435585022 seconds
---------- dalev_trie ----------
Best Accuracy: 0.43
Candidate Accuracy: 0.99
Time: 11.338266849517822 seconds
---------- lev_dict ----------
Best Accuracy: 0.42
Candidate Accuracy: 0.99
Time: 10.866144180297852 seconds
---------- dalev_dict ----------
Best Accuracy: 0.43
Candidate Accuracy: 0.99
Time: 120.18550753593445 seconds

