# 0. Initialisation

## 0.1 Imports

In [28]:
import pandas as pd
import numpy as np
from collections import Counter
import chardet
import editdistance #
import jellyfish
import re
from tqdm import tqdm
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

DATA_PATH = r"C:\Users\Louis\Documents\University\Masters\A23\NLP\Devoirs\data\hw2"
#DATA_PATH = r"C:\Users\barka\Desktop\NLP"

## 0.2 Functions

In [29]:
def detect_encoding(file_path):

    with open(file_path, 'rb') as file:
        rawdata = file.read()
    result = chardet.detect(rawdata)
    return result['encoding']

def get_word_counter(file_path, encoding):
    word_counter = Counter()

    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            parts = line.split()
            if len(parts) >= 2:
                word = parts[1]
                count = int(parts[0]) 
                word_counter[word.strip()] = count

    return word_counter


def build_unigram_model(word_counter):
    unigram_model = {}

    total_words = sum(word_counter.values())

    for word, count in word_counter.items():
        probability = count / total_words

        unigram_model[word] = probability

    return unigram_model

## 0.3 Check data quality
Make sure Python is reading the lines correctly. And it does!

In [30]:
# Check ypos
typos_file = open(DATA_PATH + r"\typo-0.2.txt")

for i, row in enumerate(typos_file):
    #print(row)
    pass

print(f"There are {i+1} rows in the file. There should be 1000. Correct number? {i+1==1000}")

# Check vocabulary

encoding = detect_encoding(DATA_PATH + r"\voc-1bwc.txt") #check encoding
print(f"Encoding : {encoding}")

voc = open(DATA_PATH + r"\voc-1bwc.txt", encoding=encoding)

for i, row in enumerate(voc):
    #print(row)
    pass

print(f"There are {i+1} rows in the file. There should be 201 315. Correct number? {i+1==201315}")

There are 1000 rows in the file. There should be 1000. Correct number? True
Encoding : utf-8
There are 201315 rows in the file. There should be 201 315. Correct number? True


How many words in the vocabulary?

In [31]:
vocab = get_word_counter(DATA_PATH + r"\voc-1bwc.txt", encoding)
print(len(vocab)) # just to verify

201315


In [32]:
vocab["the"]

41223601

In [33]:
unigram_model = build_unigram_model(vocab)

In [34]:
unigram_model["the"]

0.07755452743807591

# 1. Implementing various distances

### 1.1 Generic Distance function

Works with any function that takes two words as an input. 

In [35]:
'''This function applies a generic function over an entire vocabulary to compare the distance between a given word
and every word inside the vocabulary. It then returns the n_neighbors most similar words.

Parameters:
    word: The word to find neighbors to.
    vocabulary: A list (or list-like) of the vocabulary.
    func: The distance function to apply without arguments (without parentheses).
    minimum: Do we want the minimum distance? Boolean. True by default, will yield the minimum distance. If False, the function will yield the maximum distance. 
    n_neighbors: The number of most similar words to return.

Returns:
    vocabulary.head: A dataframe containing the n_neighbors most similar words to the input word, with the distances. 
'''
def generic_distance_correction(word, vocabulary, func, minimum = True, n_neighbors=1):
    vocabulary = pd.DataFrame(vocabulary, columns=["words"])

    def calculateDistance(series_word):
        return func(series_word, word)
    
    distances = vocabulary["words"].apply(calculateDistance)
    
    vocabulary["distance"] = distances

    vocabulary = vocabulary.sort_values("distance", ascending=minimum)

    return vocabulary.head(n_neighbors)

### Weighted Functions

In [96]:
def edDistance_unigram_correction(word, vocabulary, n_neighbors=1):
    vocabulary = pd.DataFrame(vocabulary, columns=["words"])

    def calculateDistance(series_word):
        return editdistance.eval(series_word, word)
    
    def unigram_weighing(word):
        return unigram_model[word]

    distances = vocabulary["words"].apply(calculateDistance)
    vocabulary["distance"] = distances
    vocabulary["weight"] = vocabulary["words"].apply(unigram_weighing)

    vocabulary = vocabulary.sort_values(["distance", "weight"], ascending=[True, False])

    return vocabulary.head(n_neighbors)

def edDistance_soundex_correction(word, vocabulary, n_neighbors=1):
    vocabulary = pd.DataFrame(vocabulary, columns=["words"])

    def calculateDistance(series_word):
        return editdistance.eval(series_word, word)
    
    distances = vocabulary["words"].apply(calculateDistance)
    vocabulary["distance"] = distances
    vocabulary = vocabulary.sort_values("distance", ascending=True)
    
    # Then, out of the head, sort by soundex
    top_words = vocabulary.head(n_neighbors)
    soundex = []
    word_sdx = jellyfish.soundex(word)
    for word in top_words["words"]:
        corr_sdx = jellyfish.soundex(word)
        sdx_ed_distance = editdistance.eval(word_sdx, corr_sdx)
        soundex.append(sdx_ed_distance)
    top_words["soundex"] = soundex
    top_words = top_words.sort_values(["distance", "soundex"], ascending=[True, True])

    return top_words

In [119]:
def numeric_unigram_correction(word, vocabulary, func, n_neighbors=1):
    vocabulary = pd.DataFrame(vocabulary, columns=["words"])

    def calculateDistance(series_word):
        return func(series_word, word)
    
    def unigram_weighing(word):
        return unigram_model[word]

    distances = vocabulary["words"].apply(calculateDistance)
    vocabulary["distance"] = distances

    # Sort the values using the weight and only keep the head
    top_words = vocabulary.sort_values("distance", ascending=False).head(n_neighbors)

    # Make the weights relative to each other, using only the top words
    top_words["weight"] = top_words["words"].apply(unigram_weighing)
    top_words["weight"] = top_words["weight"] / top_words["weight"].sum()

    # Weight the distance
    top_words["wgt_distance"] = top_words["distance"] * top_words["weight"]

    # Sort the top words using the weighted distance
    top_words = top_words.sort_values("wgt_distance", ascending=False)

    return top_words

def numeric_soundex_correction(word, vocabulary, func, n_neighbors=1):
    vocabulary = pd.DataFrame(vocabulary, columns=["words"])

    def calculateDistance(series_word):
        return func(series_word, word)
    
    distances = vocabulary["words"].apply(calculateDistance)
    vocabulary["distance"] = distances
    vocabulary = vocabulary.sort_values("distance", ascending=False)
    
    # Then, out of the head, sort by soundex
    top_words = vocabulary.head(n_neighbors)
    soundex = []
    word_sdx = jellyfish.soundex(word)
    for word in top_words["words"]:
        corr_sdx = jellyfish.soundex(word)
        sdx_ed_distance = editdistance.eval(word_sdx, corr_sdx)
        soundex.append(sdx_ed_distance)
    top_words["soundex"] = 1 - soundex / np.sum(soundex)
    top_words["wgt_distance"] = top_words["distance"] * top_words["soundex"]
    
    top_words = top_words.sort_values("wgt_distance", ascending=False)

    return top_words

### 1.2. Testing the correction methods

In [106]:
generic_distance_correction('speling', vocab.keys(), editdistance.eval, True, 5) #Edit distance

Unnamed: 0,words,distance
177635,spewing,1
189762,spelling,1
171215,sperling,1
174478,smelling,2
185401,spilling,2


In [107]:
edDistance_unigram_correction("speling", vocab.keys(), 5)

Unnamed: 0,words,distance,weight
189762,spelling,1,6e-06
177635,spewing,1,2e-06
171215,sperling,1,1e-06
200773,spending,2,0.000248
200437,opening,2,0.000163


In [108]:
edDistance_soundex_correction("speling", vocab.keys(), 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["soundex"] = soundex


Unnamed: 0,words,distance,soundex
189762,spelling,1,0
177635,spewing,1,2
171215,sperling,1,2
185401,spilling,2,0
174478,smelling,2,1


In [109]:
generic_distance_correction('speling', vocab.keys(), jellyfish.jaro_similarity, False, 5)

Unnamed: 0,words,distance
171215,sperling,0.958333
189762,spelling,0.958333
167278,spellings,0.925926
196330,sleeping,0.910714
126688,sapling,0.904762


In [117]:
numeric_unigram_correction("speling", vocab.keys(), jellyfish.jaro_similarity, 5)

Unnamed: 0,words,distance,weight,wgt_distance
196330,sleeping,0.910714,0.7191,0.654894
189762,spelling,0.958333,0.203322,0.19485
171215,sperling,0.958333,0.039474,0.037829
167278,spellings,0.925926,0.031436,0.029107
126688,sapling,0.904762,0.006668,0.006033


In [118]:
numeric_soundex_correction("speling", vocab.keys(), jellyfish.jaro_similarity, 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["soundex"] = 1 - soundex / np.sum(soundex)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["wgt_distance"] = top_words["distance"] * top_words["soundex"]


Unnamed: 0,words,distance,soundex,wgt_distance
189762,spelling,0.958333,1.0,0.958333
167278,spellings,0.925926,1.0,0.925926
126688,sapling,0.904762,1.0,0.904762
171215,sperling,0.958333,0.5,0.479167
196330,sleeping,0.910714,0.5,0.455357


In [120]:
generic_distance_correction('speling', vocab.keys(), jellyfish.jaro_winkler_similarity, False, 5)

Unnamed: 0,words,distance
189762,spelling,0.975
171215,sperling,0.970833
167278,spellings,0.955556
177635,spewing,0.933333
196330,sleeping,0.919643


In [121]:
numeric_unigram_correction("speling", vocab.keys(), jellyfish.jaro_winkler_similarity, 5)

Unnamed: 0,words,distance,weight,wgt_distance
196330,sleeping,0.919643,0.682026,0.62722
189762,spelling,0.975,0.19284,0.188019
177635,spewing,0.933333,0.05788,0.054021
171215,sperling,0.970833,0.037439,0.036347
167278,spellings,0.955556,0.029815,0.02849


In [122]:
numeric_soundex_correction("speling", vocab.keys(), jellyfish.jaro_winkler_similarity, 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["soundex"] = 1 - soundex / np.sum(soundex)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["wgt_distance"] = top_words["distance"] * top_words["soundex"]


Unnamed: 0,words,distance,soundex,wgt_distance
189762,spelling,0.975,1.0,0.975
167278,spellings,0.955556,1.0,0.955556
171215,sperling,0.970833,0.666667,0.647222
177635,spewing,0.933333,0.666667,0.622222
196330,sleeping,0.919643,0.666667,0.613095


## 1.3. Running the correction methods on the data

### 1.3.1 - Get the typos:

In [123]:
with open(DATA_PATH + r"\typo-0.2.txt", "r", encoding=encoding) as file:
    text = file.read()

typo_pattern = r'<typo orig="([^"]+)">([^<]+)</typo>'
typos = re.findall(typo_pattern, text)
typos = pd.DataFrame(typos, columns=["Word", "Typo"])


typos.tail(10)

Unnamed: 0,Word,Typo
3173,defy,deefy
3174,translation,translatmion
3175,to,tho
3176,But,ut
3177,in,ini
3178,keep,kpeep
3179,game,gme
3180,winds,wantagh
3181,this,tsi
3182,of,o


In [124]:
s1 = set(typos["Word"])
s2 = set(vocab.keys())
inter = s2.intersection(s1)

print(f"There are {len(typos['Word'].unique())} unique original words and {len(pd.Series(vocab.keys()).unique())} unique words in the vocabulary. Out of these two, {len(inter)} intersect.")


There are 1601 unique original words and 201315 unique words in the vocabulary. Out of these two, 1252 intersect.


### 1.3.2 - Apply correction methods: 


In [130]:
def get_correction_df(path, typos, func, minimum = True, n_neighbors=1):

    if os.path.isfile(path):
        final_df = pd.read_csv(path, index_col = 0)

    else: 
        
        final_df = typos.copy()
        rows = pd.DataFrame() 

        for typo in tqdm(typos["Typo"].tolist(), desc="Correcting Typos", total=len(typos)):

            corrections = generic_distance_correction(typo, vocab.keys(), func, minimum, n_neighbors)

            new_row = corrections.transpose().reset_index(drop=True)
            new_row.columns = [str(i) for i in range(len(new_row.columns))]
            new_row = pd.concat([new_row.iloc[0], new_row.iloc[1]], axis=0).reset_index(drop=True)
            new_row = pd.DataFrame(new_row.values.flatten()).T 
            
            row = pd.DataFrame()
            for col in new_row.columns:
                if int(col) > 4 : 
                    row[f"distance {col-n_neighbors}"] = new_row[col]
                else: 
                    row[f"correction {col}"] = new_row[col]

            rows = pd.concat([rows, row], axis=0).reset_index(drop=True)

        final_df = pd.concat([final_df, rows], axis=1)

        final_df.to_csv(path)
    
    return final_df


def get_weighted_ed_correction_df(path, typos, unigram = True, n_neighbors=1):

    if os.path.isfile(path):
        final_df = pd.read_csv(path, index_col = 0)

    else: 
        
        final_df = typos.copy()
        rows = pd.DataFrame() 

        for typo in tqdm(typos["Typo"].tolist(), desc="Correcting Typos", total=len(typos)):
            
            if unigram:
                corrections = edDistance_unigram_correction(typo, vocab.keys(), n_neighbors)
            else:
                corrections = edDistance_soundex_correction(typo, vocab.keys(), n_neighbors)

            new_row = corrections.transpose().reset_index(drop=True)
            new_row.columns = [str(i) for i in range(len(new_row.columns))]
            new_row = pd.concat([new_row.iloc[0], new_row.iloc[3]], axis=0).reset_index(drop=True)
            new_row = pd.DataFrame(new_row.values.flatten()).T 
            
            row = pd.DataFrame()
            for col in new_row.columns:
                if int(col) > 4 : 
                    row[f"distance {col-n_neighbors}"] = new_row[col]
                else: 
                    row[f"correction {col}"] = new_row[col]

            rows = pd.concat([rows, row], axis=0).reset_index(drop=True)

        final_df = pd.concat([final_df, rows], axis=1)

        final_df.to_csv(path)
    
    return final_df


def get_weighted_numeric_correction_df(path, typos, func, unigram = True, n_neighbors=1):

    if os.path.isfile(path):
        final_df = pd.read_csv(path, index_col = 0)

    else: 
        
        final_df = typos.copy()
        rows = pd.DataFrame() 

        for typo in tqdm(typos["Typo"].tolist(), desc="Correcting Typos", total=len(typos)):
            
            if unigram:
                corrections = numeric_unigram_correction(typo, vocab.keys(), func, n_neighbors)
            else:
                corrections = numeric_soundex_correction(typo, vocab.keys(), func, n_neighbors)

            new_row = corrections.transpose().reset_index(drop=True)
            new_row.columns = [str(i) for i in range(len(new_row.columns))]
            new_row = pd.concat([new_row.iloc[0], new_row.iloc[3]], axis=0).reset_index(drop=True)
            new_row = pd.DataFrame(new_row.values.flatten()).T 
            
            row = pd.DataFrame()
            for col in new_row.columns:
                if int(col) > 4 : 
                    row[f"distance {col-n_neighbors}"] = new_row[col]
                else: 
                    row[f"correction {col}"] = new_row[col]

            rows = pd.concat([rows, row], axis=0).reset_index(drop=True)

        final_df = pd.concat([final_df, rows], axis=1)

        final_df.to_csv(path)
    
    return final_df

**Jaro**

Took 9m21.8s on Louis' machine

In [131]:
jaro_correction_df = get_correction_df(DATA_PATH + "\jaro_correction_df.csv", typos, jellyfish.jaro_similarity, False, 5)
jaro_correction_df

Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,wealthy,wealth,welty,healthy,watley,0.958333,0.916667,0.875000,0.869048,0.861111
1,afford,aford,alford,afford,axford,ford,watford,0.944444,0.944444,0.944444,0.933333,0.904762
2,Catholic,CatholiaCtholic,athol,palaeolithic,pathological,alghaithi,toit,0.777778,0.766667,0.766667,0.765741,0.755556
3,cousins,coxusins,cousins,cousin,compulsions,coursing,coxswain,0.958333,0.916667,0.837121,0.833333,0.833333
4,masks,mmasks,masks,mask,mass,asks,unmasks,0.944444,0.888889,0.888889,0.888889,0.849206
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,peep,keep,upkeep,kpe,peeps,0.933333,0.933333,0.877778,0.866667,0.866667
3179,game,gme,gme,game,gmes,gome,gamel,1.000000,0.916667,0.916667,0.916667,0.866667
3180,winds,wantagh,wantagh,wantage,wata,want,anta,1.000000,0.904762,0.857143,0.857143,0.857143
3181,this,tsi,tsi,atsi,tsoi,gtsi,tasi,1.000000,0.916667,0.916667,0.916667,0.916667


Took 12mins

In [None]:
jaroUni_correction_df = get_weighted_numeric_correction_df(DATA_PATH + "\jaroUni_correction_df.csv", typos, jellyfish.jaro_similarity, True, 5)
jaroUni_correction_df

Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,healthy,wealth,wealthy,watley,welty,0.391646,0.292728,0.218829,0.000785,0.000651
1,afford,aford,ford,afford,watford,alford,axford,0.534238,0.371619,0.024373,0.006421,0.000364
2,Catholic,CatholiaCtholic,pathological,toit,athol,palaeolithic,alghaithi,0.443006,0.226283,0.045403,0.030485,0.018787
3,cousins,coxusins,cousin,cousins,coursing,coxswain,compulsions,0.596112,0.292694,0.024927,0.006355,0.005640
4,masks,mmasks,mass,asks,mask,masks,unmasks,0.600665,0.153793,0.069255,0.068983,0.000240
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,keep,upkeep,peep,peeps,kpe,0.923348,0.004131,0.003012,0.001539,0.000858
3179,game,gme,game,gome,gamel,gme,gmes,0.915274,0.000907,0.000249,0.000155,0.000080
3180,winds,wantagh,want,wantage,wantagh,wata,anta,0.856241,0.000611,0.000204,0.000078,0.000071
3181,this,tsi,tsi,gtsi,tsoi,tasi,atsi,0.478469,0.236842,0.092105,0.078947,0.070175


Took 14m40.0s

In [144]:
jaroSdx_correction_df = get_weighted_numeric_correction_df(DATA_PATH + "\jaroSdx_correction_df.csv", typos, jellyfish.jaro_similarity, False, 5)
jaroSdx_correction_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["soundex"] = 1 - soundex / np.sum(soundex)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["wgt_distance"] = top_words["distance"] * top_words["soundex"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["soundex"] = 1 - soundex / np.sum(soundex)
A value is trying to be se

Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,wealthy,wealth,welty,healthy,watley,0.958333,0.916667,0.875,0.579365,0.287037
1,afford,aford,afford,alford,axford,ford,watford,0.944444,0.755556,0.755556,0.653333,0.633333
2,Catholic,CatholiaCtholic,pathological,athol,palaeolithic,alghaithi,toit,0.702778,0.648148,0.575,0.574306,0.566667
3,cousins,coxusins,cousins,coursing,cousin,coxswain,compulsions,0.766667,0.75,0.733333,0.666667,0.585985
4,masks,mmasks,masks,mask,mass,asks,unmasks,0.944444,0.888889,0.888889,0.666667,0.212302
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,keep,kpe,peep,peeps,upkeep,0.82963,0.77037,0.725926,0.674074,0.585185
3179,game,gme,gme,game,gome,gmes,gamel,1.0,0.916667,0.916667,0.458333,0.433333
3180,winds,wantagh,wantagh,wantage,want,anta,wata,1.0,0.904762,0.714286,0.571429,0.428571
3181,this,tsi,tsi,tsoi,tasi,atsi,gtsi,1.0,0.916667,0.916667,0.458333,0.458333


**Jaro-Winkler**

Took 8m53.0s on Louis' machine

In [145]:
jw_correction_df = get_correction_df(DATA_PATH + "\jw_correction_df.csv", typos, jellyfish.jaro_winkler_similarity, False, 5)
jw_correction_df

Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,wealthy,wealth,weal,wealthtv,welty,0.975000,0.950000,0.900000,0.900000,0.900000
1,afford,aford,afford,axford,alford,ford,affords,0.955556,0.950000,0.950000,0.933333,0.923810
2,Catholic,CatholiaCtholic,athol,palaeolithic,pathological,alghaithi,toit,0.777778,0.766667,0.766667,0.765741,0.755556
3,cousins,coxusins,cousins,cousin,coxswain,compulsions,coun,0.966667,0.933333,0.883333,0.869697,0.866667
4,masks,mmasks,masks,mask,mass,asks,mma,0.950000,0.900000,0.900000,0.888889,0.883333
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,keep,peep,kpe,keeps,kee,0.940000,0.933333,0.906667,0.880000,0.880000
3179,game,gme,gme,gmes,game,gome,gomer,1.000000,0.941667,0.925000,0.925000,0.880000
3180,winds,wantagh,wantagh,wantage,want,wang,wana,1.000000,0.942857,0.914286,0.900000,0.900000
3181,this,tsi,tsi,tsim,tsui,tsai,tsoi,1.000000,0.941667,0.933333,0.933333,0.933333


Took 13m6s

In [146]:
jwUni_correction_df = get_weighted_numeric_correction_df(DATA_PATH + "\jwUni_correction_df.csv", typos, jellyfish.jaro_winkler_similarity, True, 5)
jwUni_correction_df

Correcting Typos:   0%|          | 0/3183 [00:00<?, ?it/s]

Correcting Typos: 100%|██████████| 3183/3183 [13:05<00:00,  4.05it/s]


Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,wealth,wealthy,welty,wealthtv,weal,0.552625,0.405552,0.00122,0.000551,0.000334
1,afford,aford,ford,afford,affords,alford,axford,0.544633,0.383306,0.007396,0.006585,0.000373
2,Catholic,CatholiaCtholic,pathological,toit,athol,palaeolithic,alghaithi,0.443006,0.226283,0.045403,0.030485,0.018787
3,cousins,coxusins,cousin,cousins,coxswain,coun,compulsions,0.620832,0.301992,0.006891,0.006695,0.005994
4,masks,mmasks,mass,asks,mask,masks,mma,0.602861,0.152449,0.069508,0.068783,0.007964
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,keep,keeps,peep,kpe,kee,0.851026,0.079261,0.002756,0.000821,0.000638
3179,game,gme,game,gome,gomer,gme,gmes,0.923628,0.000916,0.000221,0.000155,0.000082
3180,winds,wantagh,want,wang,wana,wantage,wantagh,0.88237,0.029613,0.00104,0.000615,0.000197
3181,this,tsi,tsai,tsui,tsi,tsim,tsoi,0.532955,0.201515,0.142045,0.03879,0.027841


In [147]:
jwSdx_correction_df = get_weighted_numeric_correction_df(DATA_PATH + "\jwSdx_correction_df.csv", typos, jellyfish.jaro_winkler_similarity, False, 5)
jwSdx_correction_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["soundex"] = 1 - soundex / np.sum(soundex)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["wgt_distance"] = top_words["distance"] * top_words["soundex"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_words["soundex"] = 1 - soundex / np.sum(soundex)
A value is trying to be se

**Edit Distance**

Took 19m10.3s on Louis' machine

In [None]:
ed_correction_df = get_correction_df(DATA_PATH + "\ed_correction_df.csv", typos, editdistance.eval, True, 5)
ed_correction_df

Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,wealthy,wealth,healthy,welty,healton,1,2,2,3,3
1,afford,aford,alford,ford,afford,axford,acord,1,1,1,1,1
2,Catholic,CatholiaCtholic,anglo-catholic,paleolithic,pathologic,anti-catholic,catholic-muslim,7,7,7,7,7
3,cousins,coxusins,cousins,cousin,commins,focusing,coulis,1,2,3,3,3
4,masks,mmasks,masks,amass,marks,tasks,masts,1,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,peep,keep,meep,reep,koepp,1,1,2,2,2
3179,game,gme,gme,ime,gue,bme,gfe,0,1,1,1,1
3180,winds,wantagh,wantagh,wantage,wastage,vantage,wattage,0,1,2,2,2
3181,this,tsi,tsi,asi,tti,tst,tsk,0,1,1,1,1


In [None]:
edUni_correction_df = get_weighted_ed_correction_df(DATA_PATH + "\edUni_correction_df.csv", typos, True, 5)
edUni_correction_df

Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,healthy,wealth,wealthy,watley,welty,0.391646,0.292728,0.218829,0.000785,0.000651
1,afford,aford,ford,afford,watford,alford,axford,0.534238,0.371619,0.024373,0.006421,0.000364
2,Catholic,CatholiaCtholic,pathological,toit,athol,palaeolithic,alghaithi,0.443006,0.226283,0.045403,0.030485,0.018787
3,cousins,coxusins,cousin,cousins,coursing,coxswain,compulsions,0.596112,0.292694,0.024927,0.006355,0.005640
4,masks,mmasks,mass,asks,mask,masks,unmasks,0.600665,0.153793,0.069255,0.068983,0.000240
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,keep,upkeep,peep,peeps,kpe,0.923348,0.004131,0.003012,0.001539,0.000858
3179,game,gme,game,gome,gamel,gme,gmes,0.915274,0.000907,0.000249,0.000155,0.000080
3180,winds,wantagh,want,wantage,wantagh,wata,anta,0.856241,0.000611,0.000204,0.000078,0.000071
3181,this,tsi,tsi,gtsi,tsoi,tasi,atsi,0.478469,0.236842,0.092105,0.078947,0.070175


In [None]:
edSdx_correction_df = get_weighted_ed_correction_df(DATA_PATH + "\edSdx_correction_df.csv", typos, False, 5)
edSdx_correction_df

Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,healthy,wealth,wealthy,watley,welty,0.391646,0.292728,0.218829,0.000785,0.000651
1,afford,aford,ford,afford,watford,alford,axford,0.534238,0.371619,0.024373,0.006421,0.000364
2,Catholic,CatholiaCtholic,pathological,toit,athol,palaeolithic,alghaithi,0.443006,0.226283,0.045403,0.030485,0.018787
3,cousins,coxusins,cousin,cousins,coursing,coxswain,compulsions,0.596112,0.292694,0.024927,0.006355,0.005640
4,masks,mmasks,mass,asks,mask,masks,unmasks,0.600665,0.153793,0.069255,0.068983,0.000240
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,keep,upkeep,peep,peeps,kpe,0.923348,0.004131,0.003012,0.001539,0.000858
3179,game,gme,game,gome,gamel,gme,gmes,0.915274,0.000907,0.000249,0.000155,0.000080
3180,winds,wantagh,want,wantage,wantagh,wata,anta,0.856241,0.000611,0.000204,0.000078,0.000071
3181,this,tsi,tsi,gtsi,tsoi,tasi,atsi,0.478469,0.236842,0.092105,0.078947,0.070175


## 1.4 Return the text file with the corrections 

### 1.4.1 Functions

In [None]:
def format_correction(typo_row):
    name = typo_row["Word"]
    typo = typo_row['Typo']
    
    #print(typo_row)
    
    neigh0 = typo_row["correction 0"]
    neigh1 = typo_row["correction 1"]
    neigh2 = typo_row["correction 2"]
    neigh3 = typo_row["correction 3"]
    neigh4 = typo_row["correction 4"]
    return f"<correction orig=\"{name}\" typo=\"{typo}\">{neigh0} {neigh1} {neigh2} {neigh3} {neigh4}</correction>"

def format_typo(typo_row):
    orig = typo_row["Word"]
    typo = typo_row['Typo']
    return f'<typo orig="{orig}">{typo}</typo>'

def replace_typos(path, typo_df, typos_str): 
 
    formatted_corr = typo_df.apply(format_correction, axis=1)
    formatted_typo = typo_df.apply(format_typo, axis=1)

    formatted_dict = dict(zip(formatted_typo, formatted_corr))

    # Erase the contents of the file if it already exists
    if os.path.isfile(path):
        os.remove(path)

    for typo_pattern in formatted_dict.keys():
        correction_pattern = formatted_dict[typo_pattern]
        typos_str = re.sub(typo_pattern, correction_pattern, typos_str)

    # Write it to a .txt file
    with open(path, "a") as out_file:
        out_file.write(typos_str)

### 1.4.2 Replace typos for all distances

In [None]:
typos_file = open(DATA_PATH + r"\typo-0.2.txt").read()

**Jaro**

In [None]:
replace_typos(DATA_PATH + "\jaro_corrections-0.2.txt", jaro_correction_df, typos_file)
replace_typos(DATA_PATH + "\jaroUni_corrections-0.2.txt", jaroUni_correction_df, typos_file)
replace_typos(DATA_PATH + "\jaroSdx_corrections-0.2.txt", jaroSdx_correction_df, typos_file)

**Jaro-Winkler**

In [None]:
replace_typos(DATA_PATH + "\jw_corrections-0.2.txt", jw_correction_df, typos_file)
replace_typos(DATA_PATH + "\jwUni_corrections-0.2.txt", jwUni_correction_df, typos_file)
replace_typos(DATA_PATH + "\jwSdx_corrections-0.2.txt", jwSdx_correction_df, typos_file)

**Edit Distance**

In [None]:
replace_typos(DATA_PATH + "\ed_corrections-0.2.txt", ed_correction_df, typos_file)
replace_typos(DATA_PATH + "\edUni_corrections-0.2.txt", edUni_correction_df, typos_file)
replace_typos(DATA_PATH + "\edSdx_corrections-0.2.txt", edSdx_correction_df, typos_file)

# 2. Create evaluation metrics
Here's a couple of ideas:

- *Hard* accuracy: Does the first word match the original word?
- *Soft* accuracy: Is the original word in one of the neighbors? 

In [64]:
def evaluate_correction(path):
    corrected_file = open(path).read()
    correction_pattern = r'<correction.*?</correction>'
    matches = re.findall(correction_pattern, corrected_file)

    hardacc = []
    softacc = []
    extraction_pattern = '<correction orig="|" typo="|">|</correction>'
    for correction in matches: 
        subbed_corr = re.sub(extraction_pattern, " ", correction) # Remove all the fluff
        extracted_words = subbed_corr.split(" ")[1:-1] # remove the frst and last splits, which will always be empty

        original = extracted_words[0] 
        corrected = extracted_words[2:]

        hardacc.append(original == corrected[0])
        softacc.append(original in corrected)

    hard_accuracy = sum(hardacc) / len(hardacc)
    soft_accuracy = sum(softacc) / len(softacc)

    print(f"Hard accuracy: {round(hard_accuracy, 2)}\nSoft accuracy: {round(soft_accuracy, 2)}")
        
    return hard_accuracy, soft_accuracy

In [None]:
corr_path = DATA_PATH + r"\edSdx_corrections-0.2.txt"
hard, soft = evaluate_correction(corr_path)

Hard accuracy: 0.24
Soft accuracy: 0.38
