# 0. Imports

In [63]:
import codecs
import os
import re
from collections import Counter

import chardet
import jellyfish
import numpy as np
import pandas as pd
from tqdm import tqdm

# Folder that holds typo-0.2.txt and voc-1bwc.txt.
# Set the NLP_DATA_PATH environment variable to point at your own copy instead of
# editing this cell, e.g.
#   C:\Users\Louis\Documents\University\Masters\A23\NLP\Devoirs\data\hw2
DATA_PATH = os.environ.get("NLP_DATA_PATH", r"C:\Users\barka\Desktop\NLP")

# 1. Get the data
Sanity check: count the rows of each file to make sure Python reads them line by line as expected (it does — see the outputs below).

In [43]:
# Count the rows of the typo corpus. The `with` block closes the file once counting is
# done, and summing over the lines also gives 0 (instead of a NameError) for an empty file.
# utf-8 is the encoding this same file is later read with in the "Get the typos (data)" cell.
with open(os.path.join(DATA_PATH, "typo-0.2.txt"), encoding="utf-8") as typos_file:
    n_rows = sum(1 for _ in typos_file)

print(f"There are {n_rows} rows in the file. There should be 1000. Correct number? {n_rows==1000}")

There are 1000 rows in the file. There should be 1000. Correct number? True


In [12]:
def detect_encoding(file_path):
    """Guess the text encoding of the file at `file_path`.

    The whole file is read as raw bytes and handed to `chardet.detect`;
    the 'encoding' entry of the detection result is returned.
    """
    with open(file_path, 'rb') as raw_file:
        guess = chardet.detect(raw_file.read())
    return guess['encoding']

# Detect the encoding once on the vocabulary file; later cells reuse `encoding` when opening files.
# os.path.join keeps the path valid on non-Windows systems too.
encoding = detect_encoding(os.path.join(DATA_PATH, "voc-1bwc.txt"))
print(f"Encoding : {encoding}")

Encoding : utf-8


In [9]:
# Count the rows of the vocabulary file. The `with` block closes the file once counting
# is done, and summing over the lines also gives 0 (instead of a NameError) for an empty file.
with open(os.path.join(DATA_PATH, "voc-1bwc.txt"), encoding=encoding) as voc:
    n_rows = sum(1 for _ in voc)

print(f"There are {n_rows} rows in the file. There should be 201 315. Correct number? {n_rows==201315}")

There are 201315 rows in the file. There should be 201 315. Correct number? True


In [16]:
def get_word_counter(file_path, encoding):
    """Build a word -> frequency Counter from a whitespace-separated vocabulary file.

    Each line is split on whitespace; the second field is the word. Lines with
    fewer than two fields are skipped.

    The first field is used as the word's occurrence count when it parses as an
    integer (assumes the file is in "<count> <word>" form — TODO confirm against
    the corpus description). Previously this field was ignored and every word was
    counted once, which made all word probabilities identical. If the first field
    is not an integer, the word still counts once, as before.

    Parameters
    ----------
    file_path : str
        Path of the vocabulary file.
    encoding : str
        Text encoding used to open the file.

    Returns
    -------
    collections.Counter
        Mapping from word to accumulated count.
    """
    word_counter = Counter()

    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            parts = line.split()
            if len(parts) < 2:
                continue

            count_field, word = parts[0], parts[1]
            try:
                count = int(count_field)
            except ValueError:
                # Not a numeric count: fall back to "seen once".
                count = 1

            # split() already removed surrounding whitespace, so no strip() is needed.
            word_counter[word] += count

    return word_counter

# Load the vocabulary once; every corrector below looks words up in `vocab`.
# os.path.join keeps the path valid on non-Windows systems too.
vocab = get_word_counter(os.path.join(DATA_PATH, "voc-1bwc.txt"), encoding)
print(len(vocab)) # just to verify

201315


# 2. implementing various distances

### a. Edit distance (distance d'édition), adapted from Peter Norvig's blog post "How to Write a Spelling Corrector":

In [18]:
def P(word, N=sum(vocab.values())):
    """Relative frequency of `word`: its count in `vocab` divided by `N`.

    `N` defaults to the total of all counts in `vocab`, computed once when
    this function is defined.
    """
    occurrences = vocab[word]
    return occurrences / N

def correction(word):
    """Return the candidate correction of `word` that scores highest under `P`."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the first non-empty set of plausible corrections for `word`.

    Tried in order: the word itself if it is in the vocabulary, known words one
    edit away, known words two edits away. If all are empty, `[word]` is returned.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away

    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away

    return [word]

def known(words):
    """Return the set of items of `words` that are present in `vocab`."""
    found = set()
    for candidate in words:
        if candidate in vocab:
            found.add(candidate)
    return found

def edits1(word):
    """Return the set of strings reachable from `word` with a single edit.

    An edit is a deletion of one character, a swap of two adjacent characters,
    a replacement of one character by a lowercase ASCII letter, or an insertion
    of a lowercase ASCII letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()

    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion is possible at every cut point, including the very end.
        for ch in alphabet:
            results.add(head + ch + tail)

        if not tail:
            continue

        # Deletion of the character right after the cut.
        results.add(head + tail[1:])

        # Swap of the two characters right after the cut.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])

        # Replacement of the character right after the cut.
        for ch in alphabet:
            results.add(head + ch + tail[1:])

    return results

def edits2(word):
    """Lazily yield every string obtained by applying `edits1` twice to `word`.

    The result is a generator and may contain duplicates.
    """
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))

### b. Jaro distance :

In [23]:
def jaro_correction(word):
    """Return the vocabulary word with the highest Jaro similarity to `word`.

    Every key of `vocab` is scored, so one call is a full pass over the vocabulary.
    On ties the first word encountered wins.

    Returns
    -------
    tuple
        `(best_word, best_score)`; `("", float('-inf'))` if `vocab` is empty.
    """
    # Newer jellyfish releases expose this metric as `jaro_similarity`; `jaro_distance`
    # is the legacy name. Prefer the new name and fall back to the old one.
    similarity = getattr(jellyfish, "jaro_similarity", None) or jellyfish.jaro_distance

    best_word, best_score = "", float('-inf')

    for candidate in vocab:
        score = similarity(word, candidate)
        if score > best_score:
            best_word, best_score = candidate, score

    return best_word, best_score

### c. Jaro-Winkler distance

In [24]:
def jaro_winkler_correction(word):
    """Return the vocabulary word with the highest Jaro-Winkler similarity to `word`.

    Every key of `vocab` is scored, so one call is a full pass over the vocabulary.
    On ties the first word encountered wins.

    Returns
    -------
    tuple
        `(best_word, best_score)`; `("", float('-inf'))` if `vocab` is empty.
    """
    # Newer jellyfish releases expose this metric as `jaro_winkler_similarity`;
    # `jaro_winkler` is the legacy name. Prefer the new name and fall back to the old one.
    similarity = getattr(jellyfish, "jaro_winkler_similarity", None) or jellyfish.jaro_winkler

    best_word, best_score = "", float('-inf')

    for candidate in vocab:
        score = similarity(word, candidate)
        if score > best_score:
            best_word, best_score = candidate, score

    return best_word, best_score

## 3. Testing the correction methods on the same word :

In [21]:
correction('speling') # Edit distance: best known word within one or two edits, ranked by P

'spelling'

In [25]:
jaro_correction('speling') # Jaro: returns (closest vocabulary word, its score)

('sperling', 0.9583333333333334)

In [26]:
jaro_winkler_correction('speling') # Jaro-Winkler: returns (closest vocabulary word, its score)

('spelling', 0.975)

## 4. Running the correction methods on the data

### Get the typos (test):  

In [39]:
# Small hand-written sample in the same markup as the corpus, used to check the regex.
test_text = """
Doesn 't this just hit the very <typo orig="wealthy">wealtohy</typo> who can <typo orig="afford">aford</typo> it ?
Also , <typo orig="Catholic">CatholiaCtholic</typo> school girl outfits .
He wrestled his <typo orig="cousins">coxusins</typo> wearing funny <typo orig="masks">mmasks</typo> worn by the locals .
Chrysler is in the process of seeking approval from a U.S. Bankruptcy Court judge to sell itself <typo orig="to">tos</typo> a new entity jointly owned <typo orig="by">b</typo> Fiat , the United Auto <typo orig="Workers">rokers</typo> and <typo orig="the">tthe</typo> U.S. and Canadian <typo orig="governments">government</typo> .
"""

# Group 1 captures the intended word (the `orig` attribute), group 2 the misspelling in the text.
typo_pattern = r'<typo orig="([^"]+)">([^<]+)</typo>'

# Run the pattern on the sample itself. This cell used to read `text`, which only exists
# once the corpus has been loaded in a later cell, so it failed on a fresh kernel.
# NOTE: dict() keeps one entry per intended word (the last misspelling seen for it).
typos = dict(re.findall(typo_pattern, test_text))

### Get the typos (data):

In [44]:
# Read the whole annotated corpus into memory so the typo regex can scan it in one pass.
# NOTE: `encoding` was detected on voc-1bwc.txt, not on this file.
# os.path.join keeps the path valid on non-Windows systems too.
with open(os.path.join(DATA_PATH, "typo-0.2.txt"), "r", encoding=encoding) as typo_file:
    text = typo_file.read()

In [45]:
# Map each intended word to its misspelling, as annotated in the corpus text.
# NOTE: dict() keeps a single entry per intended word, so when the same word is
# misspelled several times in the corpus only the last misspelling is retained.
typos = dict(re.findall(typo_pattern, text))

In [57]:
# Peek at the first (intended word, misspelling) pair.
list(typos.items())[0]

('wealthy', 'wealtohy')

###  Apply correction methods: 

In [64]:
# intended word -> word proposed by the Jaro-Winkler corrector for its misspelling
corrected_typos = dict()

for orig, typo in tqdm(typos.items(), desc="Correcting Typos"):

    # Do not call this variable `correction`: that would overwrite the edit-distance
    # `correction()` function and break any later call to it.
    best_match = jaro_winkler_correction(typo)[0]  # Jaro-Winkler distance
    corrected_typos[orig] = best_match

Correcting Typos: 100%|████████████████████████████████████████████████████████████| 1601/1601 [14:18<00:00,  1.87it/s]


In [65]:
# Both counts should match: one correction is stored per extracted typo.
print(
    f" Number of corrected typos : {len(corrected_typos)}",
    f" Number of typos : {len(typos)}",
    sep="\n",
)

 Number of corrected typos : 1601
 Number of typos : 1601


### Return the text file with the corrections : 

In [66]:
# TODO: write the corpus back to a text file with each typo replaced by its correction (not implemented yet).