# 0. Initialisation

## 0.1 Imports

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import chardet
import codecs
import jellyfish
import re
from tqdm import tqdm

# Directory holding the data files read below (typo-0.2.txt, voc-1bwc.txt).
# This is a machine-specific absolute Windows path: edit it, or swap in the
# commented alternative, when running on another machine.
DATA_PATH = r"C:\Users\Louis\Documents\University\Masters\A23\NLP\Devoirs\data\hw2"
#DATA_PATH = r"C:\Users\barka\Desktop\NLP"

## 0.2 Functions

In [4]:
def detect_encoding(file_path):
    """Guess the text encoding of the file at `file_path` using chardet.

    The whole file is read as raw bytes and passed to ``chardet.detect``;
    the ``'encoding'`` entry of the detection result is returned.
    """
    with open(file_path, 'rb') as raw_file:
        payload = raw_file.read()
    return chardet.detect(payload)['encoding']

def get_word_counter(file_path, encoding):
    """Tally the second whitespace-separated field of every line in a file.

    Lines with fewer than two fields are skipped. The result is a
    ``Counter`` mapping each second-column token to the number of lines on
    which it appeared.

    NOTE(review): the first field is ignored, so every line contributes
    exactly 1 to its word. If that field holds a corpus frequency, it is
    not being used -- confirm this is intended.
    """
    with open(file_path, 'r', encoding=encoding) as handle:
        rows = (line.split() for line in handle)
        # split() with no separator already drops surrounding whitespace,
        # so the tokens need no further stripping.
        return Counter(fields[1] for fields in rows if len(fields) >= 2)


## 0.3 Check data quality
Make sure Python is reading the lines correctly. And it does!

In [8]:
# Check typos
# The file is opened in a `with` block so the handle is always closed, and
# rows are counted with sum() so an empty file reports 0 instead of leaving
# the counter undefined.
with open(DATA_PATH + r"\typo-0.2.txt") as typos_file:
    n_rows = sum(1 for _ in typos_file)

print(f"There are {n_rows} rows in the file. There should be 1000. Correct number? {n_rows==1000}")

# Check vocabulary

# Detect the vocabulary file's encoding first; it is reused when reading
# the file below and by later cells.
encoding = detect_encoding(DATA_PATH + r"\voc-1bwc.txt") #check encoding
print(f"Encoding : {encoding}")

with open(DATA_PATH + r"\voc-1bwc.txt", encoding=encoding) as voc:
    n_rows = sum(1 for _ in voc)

print(f"There are {n_rows} rows in the file. There should be 201 315. Correct number? {n_rows==201315}")

There are 1000 rows in the file. There should be 1000. Correct number? True
Encoding : utf-8
There are 201315 rows in the file. There should be 201 315. Correct number? True


How many words are in the vocabulary?

In [9]:
# Build the vocabulary counter from the vocabulary file, reusing the
# encoding detected during the data-quality check.
vocab = get_word_counter(
    DATA_PATH + r"\voc-1bwc.txt",
    encoding,
)
# Sanity check: number of distinct words collected.
print(len(vocab))

201315


# 1. Implementing various distances

## 1.1 Distance functions

### 1.1.a - Edit distance 
Taken from Peter Norvig's blog post "How to Write a Spelling Corrector".

In [11]:
def P(word, N=sum(vocab.values())):
    """Return the relative frequency of `word` in `vocab`.

    `N` defaults to the sum of all counts in `vocab`, computed once when
    this function is defined.
    """
    count = vocab[word]
    return count / N

def correction(word):
    """Return the candidate correction for `word` with the highest `P`."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the possible spelling corrections for `word`.

    Preference order: the word itself if known, then known words one edit
    away, then known words two edits away. If none of those exist, the
    word is returned unchanged as the only element of a list.
    """
    exact = known([word])
    if exact:
        return exact

    one_away = known(edits1(word))
    if one_away:
        return one_away

    two_away = known(edits2(word))
    if two_away:
        return two_away

    return [word]

def known(words):
    """Return the set of entries from `words` that are present in `vocab`."""
    return {candidate for candidate in words if candidate in vocab}

def edits1(word, letters='abcdefghijklmnopqrstuvwxyz'):
    """Return the set of all strings exactly one edit away from `word`.

    An edit is a single-character deletion, a transposition of two adjacent
    characters, a replacement of one character, or an insertion of one
    character.

    Args:
        word: The string to perturb.
        letters: Characters used for replacements and insertions. Defaults
            to the lowercase ASCII alphabet; pass a wider alphabet to reach
            words containing other characters (uppercase, digits, accents).

    Returns:
        A set of strings. It may contain `word` itself, since replacing a
        character with the same character counts as an edit.
    """
    # Every way of cutting the word into a (head, tail) pair, including the
    # empty head and the empty tail.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [head + tail[1:] for head, tail in splits if tail]
    transposes = [
        head + tail[1] + tail[0] + tail[2:]
        for head, tail in splits
        if len(tail) > 1
    ]
    replaces = [
        head + char + tail[1:]
        for head, tail in splits
        if tail
        for char in letters
    ]
    inserts = [head + char + tail for head, tail in splits for char in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """Lazily yield every string two edits away from `word`.

    Each one-edit variant of `word` is expanded by one further edit; the
    result is a generator and may contain duplicates.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

### 1.1.b -  Jaro distance

In [18]:
def jaro_correction(word):
    """Find the vocabulary entry most similar to `word` by Jaro similarity.

    Returns a ``(best_word, best_score)`` tuple. When several entries share
    the top score, the one encountered first while iterating `vocab` wins.
    With an empty vocabulary the result is ``("", float('-inf'))``.
    """
    scored = (
        (jellyfish.jaro_similarity(word, entry), entry)
        for entry in vocab
    )
    # max() keeps the first maximal element, matching a strict
    # "greater than" running comparison.
    best_score, best_word = max(
        scored,
        key=lambda pair: pair[0],
        default=(float('-inf'), ""),
    )
    return best_word, best_score

### 1.1.c -  Jaro-Winkler distance

In [19]:
def jaro_winkler_correction(word):
    """Find the vocabulary entry most similar to `word` by Jaro-Winkler similarity.

    Returns a ``(best_word, best_score)`` tuple. When several entries share
    the top score, the one encountered first while iterating `vocab` wins.
    With an empty vocabulary the result is ``("", float('-inf'))``.
    """
    scored = (
        (jellyfish.jaro_winkler_similarity(word, entry), entry)
        for entry in vocab
    )
    # max() keeps the first maximal element, matching a strict
    # "greater than" running comparison.
    best_score, best_word = max(
        scored,
        key=lambda pair: pair[0],
        default=(float('-inf'), ""),
    )
    return best_word, best_score

## 1.2. Testing the correction methods

In [27]:
correction('speling')  # Edit-distance corrector: returns the most probable candidate word

'spelling'

In [28]:
jaro_correction('speling')  # Jaro similarity: returns (best word, similarity score)

('sperling', 0.9583333333333334)

In [29]:
jaro_winkler_correction('speling')  # Jaro-Winkler similarity: returns (best word, similarity score)

('spelling', 0.975)

## 1.3. Running the correction methods on the data

### 1.3.1 - Get the typos:

In [45]:
# Load the whole typo file into memory, decoding it with the encoding that
# was detected for the vocabulary file.
with open(DATA_PATH + r"\typo-0.2.txt", "r", encoding=encoding) as file:
    text = file.read()

# Each <typo orig="...">...</typo> tag yields one entry: the `orig`
# attribute is the key and the tag's inner text is the value.
# NOTE(review): a repeated `orig` keeps only its last inner text -- confirm
# that dropping the earlier occurrences is acceptable.
typo_pattern = r'<typo orig="([^"]+)">([^<]+)</typo>'
typos = {
    match.group(1): match.group(2)
    for match in re.finditer(typo_pattern, text)
}

### 1.3.2 - Apply correction methods: 

In [46]:
# Map each `orig` key to the vocabulary word closest to its typo.
corrected_typos = dict()

for orig, typo in tqdm(typos.items(), desc="Correcting Typos"):
    # The result is bound to `best_match`, not `correction`: reusing that
    # name would overwrite the `correction()` function defined earlier and
    # make any later call to it fail with "'str' object is not callable".
    best_match, _score = jaro_winkler_correction(typo)  # Jaro-Winkler distance
    corrected_typos[orig] = best_match

Correcting Typos:  18%|█▊        | 295/1601 [00:38<02:21,  9.20it/s]

In [None]:
# Report how many entries were corrected next to how many typos were
# extracted; the two numbers should match.
print(
    f" Number of corrected typos : {len(corrected_typos)}",
    f" Number of typos : {len(typos)}",
    sep="\n",
)

 Number of corrected typos : 1601
 Number of typos : 1601


### Return the text file with the corrections : 

In [66]:
# TODO: write the text back out to a file with each typo replaced by its correction.