# 0. Initialisation

## 0.1 Imports

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import chardet
import editdistance
import jellyfish
import re
from tqdm import tqdm
import os

# Folder holding the homework data files. Every input and output path in this
# notebook is built by appending a file name to this constant. The value is
# machine-specific: the commented-out line below is an alternative location.
DATA_PATH = r"C:\Users\Louis\Documents\University\Masters\A23\NLP\Devoirs\data\hw2"
#DATA_PATH = r"C:\Users\barka\Desktop\NLP"

## 0.2 Functions

In [2]:
def detect_encoding(file_path):
    """Guess the text encoding of the file at `file_path`.

    The whole file is read as raw bytes and handed to `chardet.detect`;
    the value stored under the 'encoding' key of its result is returned.
    """
    with open(file_path, 'rb') as raw_file:
        guess = chardet.detect(raw_file.read())
    return guess['encoding']

def get_word_counter(file_path, encoding):
    """Count how often each word appears in the second column of a file.

    Every line is split on whitespace; lines with fewer than two tokens are
    ignored. The second token of each remaining line is the counted word.

    Parameters:
        file_path: Path of the text file to read.
        encoding: Encoding used to open the file.

    Returns:
        A Counter mapping each word to the number of lines it was found on.
    """
    with open(file_path, 'r', encoding=encoding) as handle:
        tokenised_lines = (line.split() for line in handle)
        return Counter(
            tokens[1].strip() for tokens in tokenised_lines if len(tokens) >= 2
        )


## 0.3 Check data quality
Make sure Python reads every line of both files correctly. It does: the row counts printed below match the expected values.

In [3]:
# Check typos
# The file is opened in a `with` block so the handle is closed after counting.
# Rows are counted directly instead of through a leftover loop index, which
# would be undefined (or stale) if the file were empty.
with open(DATA_PATH + r"\typo-0.2.txt") as typos_file:
    typo_rows = sum(1 for _ in typos_file)

print(f"There are {typo_rows} rows in the file. There should be 1000. Correct number? {typo_rows==1000}")

# Check vocabulary

voc_path = DATA_PATH + r"\voc-1bwc.txt"
encoding = detect_encoding(voc_path) #check encoding
print(f"Encoding : {encoding}")

# `encoding` stays defined at module level: later cells reuse it.
with open(voc_path, encoding=encoding) as voc:
    voc_rows = sum(1 for _ in voc)

print(f"There are {voc_rows} rows in the file. There should be 201 315. Correct number? {voc_rows==201315}")

There are 1000 rows in the file. There should be 1000. Correct number? True


Encoding : utf-8
There are 201315 rows in the file. There should be 201 315. Correct number? True


How many words in the vocabulary?

In [4]:
# Build the word -> count table (a Counter) from the vocabulary file, opened
# with the encoding detected in section 0.3. Every correction method below
# reads this module-level `vocab`.
vocab = get_word_counter(DATA_PATH + r"\voc-1bwc.txt", encoding)
print(len(vocab)) # number of distinct words, printed as a sanity check

201315


# 1. Implementing various distances

## 1.1 Distance functions

### 1.1.a - Edit distance 
Adapted from Peter Norvig's blog post "How to Write a Spelling Corrector", edited to return any number of likely words.

In [5]:
def P(word, N=sum(vocab.values())):
    """Relative frequency of `word` in `vocab`.

    `N` defaults to the sum of all counts in `vocab`; that default is
    evaluated once, when this function is defined, not on every call.
    """
    occurrences = vocab[word]
    return occurrences / N

def correction(word):
    """Return the most probable spelling correction for `word`.

    The result is the element of `candidates(word)` with the highest
    probability according to `P`.
    """
    # Generate the candidates a single time; building them twice (once into
    # an unused local) doubled the cost of the edit-distance expansion.
    return max(candidates(word), key=P)

def candidates(word):
    """Generate possible spelling corrections for `word`.

    Tries, in order, the word itself, its one-edit variants and its
    two-edit variants, keeping only those found by `known`. The first
    non-empty set wins; if all are empty, `[word]` is returned unchanged.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away

    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away

    return [word]

def known(words):
    """Return, as a set, the members of `words` that are keys of `vocab`."""
    return {candidate for candidate in words if candidate in vocab}

def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    An edit is a deletion, an adjacent transposition, a replacement by a
    lowercase ASCII letter, or an insertion of a lowercase ASCII letter.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()

    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion is possible at every position, including the very end.
        for letter in alphabet:
            variants.add(head + letter + tail)

        if not tail:
            continue

        rest = tail[1:]
        variants.add(head + rest)                           # deletion
        for letter in alphabet:
            variants.add(head + letter + rest)              # replacement
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])  # transposition

    return variants

def edits2(word):
    """Lazily yield every string obtained by applying `edits1` twice to `word`.

    The result is a generator and may contain duplicates.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

### 1.1.b -  Jaro distance

In [6]:
def jaro_correction(word):
    """Find the word of `vocab` with the highest Jaro similarity to `word`.

    Returns:
        A `(best_word, best_score)` tuple. On ties the first word met while
        iterating `vocab` is kept. For an empty `vocab` the result is
        `("", float('-inf'))`.
    """
    scored = (
        (jellyfish.jaro_similarity(word, candidate), candidate)
        for candidate in vocab
    )
    # max() keeps the first maximal element, like a strict `<` comparison.
    best_score, best_word = max(
        scored, key=lambda pair: pair[0], default=(float('-inf'), "")
    )
    return best_word, best_score

### 1.1.c -  Jaro-Winkler distance

In [7]:
def jaro_winkler_correction(word):
    """Find the word of `vocab` with the highest Jaro-Winkler similarity to `word`.

    Returns:
        A `(best_word, best_score)` tuple. On ties the first word met while
        iterating `vocab` is kept. For an empty `vocab` the result is
        `("", float('-inf'))`.
    """
    scored = (
        (jellyfish.jaro_winkler_similarity(word, candidate), candidate)
        for candidate in vocab
    )
    # max() keeps the first maximal element, like a strict `<` comparison.
    best_score, best_word = max(
        scored, key=lambda pair: pair[0], default=(float('-inf'), "")
    )
    return best_word, best_score

### 1.1.d Generic Distance function

Works with any function that takes two words as an input. 

In [8]:
def generic_distance_correction(word, vocabulary, func, minimum = True, n_neighbors=1):
    '''Rank a vocabulary by its distance to `word` and return the best matches.

    The function `func` is applied between `word` and every word of the
    vocabulary; the n_neighbors best-ranked words are returned.

    Parameters:
        word: The word to find neighbors to.
        vocabulary: A list (or list-like) of the vocabulary.
        func: The distance function to apply, passed without parentheses.
            It is called as func(vocabulary_word, word).
        minimum: Do we want the minimum distance? Boolean. True by default,
            which ranks the smallest values first (distance measures). If
            False, the largest values are ranked first (similarity measures).
        n_neighbors: The number of most similar words to return.

    Returns:
        A dataframe with the columns "words" and "distance" containing the
        n_neighbors best-ranked rows. Rows keep the index they had in the
        vocabulary.
    '''
    vocabulary = pd.DataFrame(vocabulary, columns=["words"])
    vocabulary["distance"] = vocabulary["words"].apply(
        lambda candidate: func(candidate, word)
    )

    # Ascending order puts the smallest distance first; descending order
    # puts the largest similarity first.
    ranked = vocabulary.sort_values("distance", ascending=bool(minimum))

    return ranked.head(n_neighbors)

## 1.2. Testing the correction methods

In [9]:
# Five closest vocabulary words to 'speling' by edit distance; minimum=True
# ranks the smallest distances first.
generic_distance_correction('speling', vocab.keys(), editdistance.eval, True, 5) #Edit distance

Unnamed: 0,words,distance
177635,spewing,1
189762,spelling,1
171215,sperling,1
174478,smelling,2
185401,spilling,2


In [10]:
# Five best matches for 'speling' by Jaro similarity; minimum=False ranks the
# highest scores first.
generic_distance_correction('speling', vocab.keys(), jellyfish.jaro_similarity, False, 5)

Unnamed: 0,words,distance
171215,sperling,0.958333
189762,spelling,0.958333
167278,spellings,0.925926
196330,sleeping,0.910714
126688,sapling,0.904762


In [11]:
# Five best matches for 'speling' by Jaro-Winkler similarity; minimum=False
# ranks the highest scores first.
generic_distance_correction('speling', vocab.keys(), jellyfish.jaro_winkler_similarity, False, 5)

Unnamed: 0,words,distance
189762,spelling,0.975
171215,sperling,0.970833
167278,spellings,0.955556
177635,spewing,0.933333
196330,sleeping,0.919643


In [12]:
# Single best match from the dedicated loop; the result is a (word, score) tuple.
jaro_correction('speling') #Jaro distance

('sperling', 0.9583333333333334)

In [13]:
# Single best match from the dedicated loop; the result is a (word, score) tuple.
jaro_winkler_correction('speling') #Jaro-Winkler distance

('spelling', 0.975)

## 1.3. Running the correction methods on the data

### 1.3.1 - Get the typos:

In [102]:
# Load the annotated text with the encoding detected in section 0.3.
with open(DATA_PATH + r"\typo-0.2.txt", "r", encoding=encoding) as file:
    text = file.read()

# Regex: every <typo orig="ORIGINAL">MISSPELLED</typo> tag yields one
# (original, misspelled) pair.
typo_pattern = r'<typo orig="([^"]+)">([^<]+)</typo>'
# NOTE: dict() keeps one entry per original word, so an original that occurs
# in several tags only keeps the typo of its last tag.
typos = dict(re.findall(typo_pattern, text))
typo_df = pd.DataFrame({
    "Word": list(typos.keys()),
    "Typo": list(typos.values()),
})

### 1.3.2 - Apply correction methods: 


In [108]:
def get_correction_df(path, typos, func, minimum = True, n_neighbors=1):
    """Return the best corrections for every typo, using a CSV file as cache.

    If `path` already exists it is loaded and returned without recomputing
    anything. Otherwise every typo is ranked against the module-level
    `vocab` with `generic_distance_correction`, and the resulting table is
    written to `path` before being returned.

    Parameters:
        path: Location of the CSV cache file.
        typos: Mapping whose values are the misspelled words to correct.
        func: Distance or similarity function taking two words.
        minimum: True when smaller values of `func` mean a better match,
            False when larger values do.
        n_neighbors: Number of corrections to keep for each typo.

    Returns:
        A dataframe with one row per typo (in the iteration order of
        `typos`) and one column per rank, labelled 0 .. n_neighbors - 1.
    """
    if os.path.isfile(path):
        correction_df = pd.read_csv(path, index_col = 0)
        # read_csv returns the header as strings ("0", "1", ...), whereas a
        # freshly computed table has integer labels. Restore the integers so
        # both code paths give the same frame; other labels are left alone.
        correction_df.columns = [
            int(label) if str(label).isdigit() else label
            for label in correction_df.columns
        ]
        return correction_df

    rows = []
    for i, typo in tqdm(enumerate(typos.values()), desc="Correcting Typos", total=len(typos)):
        corrections = generic_distance_correction(typo, vocab.keys(), func, minimum, n_neighbors)
        row = corrections["words"].reset_index(drop=True)
        row.name = i
        rows.append(row)

    # Concatenate once at the end: growing the frame inside the loop copies
    # all previous rows on every iteration.
    if rows:
        correction_df = pd.concat(rows, axis=1).transpose()
    else:
        correction_df = pd.DataFrame()

    correction_df.to_csv(path)

    return correction_df


**Jaro**

In [113]:
# Top-5 Jaro corrections for every typo (loaded from the CSV cache when it
# exists), shown next to the original word and its typo.
# The file name is a raw string: "\j" is not a valid escape sequence.
jaro_correction_df = get_correction_df(DATA_PATH + r"\jaro_correction_df.csv", typos, jellyfish.jaro_similarity, False, 5)

jaro_typo_df = pd.concat([typo_df, jaro_correction_df], axis=1)
jaro_typo_df

Unnamed: 0,Word,Typo,0,1,2,3,4
0,wealthy,wealtohy,wealthy,wealth,welty,healthy,watley
1,afford,aford,alford,afford,axford,ford,watford
2,Catholic,CatholiaCtholic,athol,palaeolithic,pathological,alghaithi,toit
3,cousins,coxusins,cousins,cousin,compulsions,coursing,coxswain
4,masks,mmasks,masks,mask,mass,asks,unmasks
...,...,...,...,...,...,...,...
1596,plummeted,plummete,plummeted,plummet,plummetted,plummets,lumme
1597,posts,psts,pests,posts,pasts,pst,psst
1598,defy,deefy,defy,deep-fry,beefy,dewey,dee
1599,translation,translatmion,translation,translations,translational,translating,transaction


**Jaro-Winkler**

In [124]:
# Top-5 Jaro-Winkler corrections for every typo (loaded from the CSV cache
# when it exists), shown next to the original word and its typo.
# The file name is a raw string: "\j" is not a valid escape sequence.
jw_correction_df = get_correction_df(DATA_PATH + r"\jw_correction_df.csv", typos, jellyfish.jaro_winkler_similarity, False, 5)

jw_typo_df = pd.concat([typo_df, jw_correction_df], axis=1)
jw_typo_df

Correcting Typos:  12%|█▏        | 196/1601 [00:47<05:54,  3.97it/s]

**Edit Distance**

In [49]:
# Top-5 edit-distance corrections for every typo (loaded from the CSV cache
# when it exists), shown next to the original word and its typo.
# The file name is a raw string: "\e" is not a valid escape sequence.
ed_correction_df = get_correction_df(DATA_PATH + r"\ed_correction_df.csv", typos, editdistance.eval, True, 5)

ed_typo_df = pd.concat([typo_df, ed_correction_df], axis=1)
ed_typo_df

Unnamed: 0,Word,Typo,0,1,2,3,4,Typo.1
0,wealthy,wealtohy,wealthy,wealth,weal,wealthtv,welty,wealtohy
1,afford,aford,afford,axford,alford,ford,affords,aford
2,Catholic,CatholiaCtholic,athol,palaeolithic,pathological,alghaithi,toit,CatholiaCtholic
3,cousins,coxusins,cousins,cousin,coxswain,compulsions,coun,coxusins
4,masks,mmasks,masks,mask,mass,asks,mma,mmasks
...,...,...,...,...,...,...,...,...
1596,plummeted,plummete,plummeted,plummet,plummetted,plummets,plume,plummete
1597,posts,psts,pst,pests,pasts,posts,psst,psts
1598,defy,deefy,defy,deep-fry,deery,dee,deeny,deefy
1599,translation,translatmion,translation,translations,translational,translating,transaction,translatmion


## 1.4 Return the text file with the corrections 

### 1.4.1 Functions

In [120]:
def format_correction(typo_row):
    """Build the <correction> tag for one row of a typo/correction table.

    `typo_row` must provide the keys "Word", "Typo" and the integer keys
    0 to 4. The five ranked corrections are written inside the tag,
    separated by single spaces.
    """
    original = typo_row["Word"]
    misspelled = typo_row['Typo']
    neighbours = " ".join(f"{typo_row[rank]}" for rank in range(5))
    return f'<correction orig="{original}" typo="{misspelled}">{neighbours}</correction>'

def format_typo(typo_row):
    """Build the <typo> tag for one row, from its "Word" and "Typo" entries."""
    return '<typo orig="{}">{}</typo>'.format(typo_row["Word"], typo_row['Typo'])

def replace_typos(path, typo_df, typos_str, encoding=None):
    """Replace every <typo> tag of a text by its <correction> tag and save it.

    Parameters:
        path: Output file. It is overwritten if it already exists.
        typo_df: Dataframe with the columns "Word", "Typo" and 0 to 4; each
            row is turned into one <typo> tag and its <correction> tag.
        typos_str: The annotated text containing the <typo> tags.
        encoding: Encoding of the output file. None (the default) keeps the
            platform default used by open().

    Returns:
        None. The rewritten text is written to `path`.
    """
    formatted_corr = typo_df.apply(format_correction, axis=1)
    formatted_typo = typo_df.apply(format_typo, axis=1)

    formatted_dict = dict(zip(formatted_typo, formatted_corr))

    # The tags are literal text, not regular expressions. A plain string
    # replacement avoids mis-matching words that contain regex
    # metacharacters (".", "+", "(", ...) and avoids backslashes in the
    # correction being read as replacement escapes.
    for typo_tag, correction_tag in formatted_dict.items():
        typos_str = typos_str.replace(typo_tag, correction_tag)

    # Mode "w" truncates an existing file, so no prior removal is needed.
    with open(path, "w", encoding=encoding) as out_file:
        out_file.write(typos_str)

### 1.4.2 Replace typos for all distances

In [121]:
# Read the annotated text with the same encoding that was used to extract the
# typos in section 1.3.1, so the tags built from `typo_df` match the text
# exactly. The `with` block closes the file once it has been read.
with open(DATA_PATH + r"\typo-0.2.txt", encoding=encoding) as typos_handle:
    typos_file = typos_handle.read()

**Jaro**

In [122]:
# Raw string for the file name: "\j" is not a valid escape sequence.
replace_typos(DATA_PATH + r"\jaro_corrections-0.2.txt", jaro_typo_df, typos_file)

**Jaro-Winkler**

In [None]:
# Raw string for the file name: "\j" is not a valid escape sequence.
replace_typos(DATA_PATH + r"\jw_corrections-0.2.txt", jw_typo_df, typos_file)

**Edit Distance**

In [None]:
# Raw string for the file name: "\e" is not a valid escape sequence.
replace_typos(DATA_PATH + r"\ed_corrections-0.2.txt", ed_typo_df, typos_file)