# 0. Initialisation

## 0.1 Imports

In [260]:
import pandas as pd
import numpy as np
from collections import Counter
import chardet
import editdistance #
import jellyfish
import re
from tqdm import tqdm
import os
import Levenshtein
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#DATA_PATH = r"C:\Users\Louis\Documents\University\Masters\A23\NLP\Devoirs\data\hw2"
DATA_PATH = r"C:\Users\barka\Desktop\NLP"



## 0.2 Functions

In [281]:
def detect_encoding(file_path):

    with open(file_path, 'rb') as file:
        rawdata = file.read()
    result = chardet.detect(rawdata)
    return result['encoding']

def get_word_counter(file_path, encoding):
    word_counter = Counter()

    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            parts = line.split()
            if len(parts) >= 2:
                word = parts[1]
                count = int(parts[0]) 
                word_counter[word.strip()] = count

    return word_counter


def build_unigram_model(word_counter):
    unigram_model = {}

    total_words = sum(word_counter.values())
    


    for word, count in word_counter.items():
        probability = count / total_words

        unigram_model[word] = probability

    return unigram_model

## 0.3 Check data quality
Make sure Python is reading the lines correctly. And it does!

In [277]:
# Check ypos
typos_file = open(DATA_PATH + r"\typo-0.2.txt")

for i, row in enumerate(typos_file):
    #print(row)
    pass

print(f"There are {i+1} rows in the file. There should be 1000. Correct number? {i+1==1000}")

# Check vocabulary

encoding = detect_encoding(DATA_PATH + r"\voc-1bwc.txt") #check encoding
print(f"Encoding : {encoding}")

voc = open(DATA_PATH + r"\voc-1bwc.txt", encoding=encoding)

for i, row in enumerate(voc):
    #print(row)
    pass

print(f"There are {i+1} rows in the file. There should be 201 315. Correct number? {i+1==201315}")

There are 1000 rows in the file. There should be 1000. Correct number? True
Encoding : utf-8
There are 201315 rows in the file. There should be 201 315. Correct number? True


How many words in the vocabulary?

In [278]:
vocab = get_word_counter(DATA_PATH + r"\voc-1bwc.txt", encoding)
print(len(vocab)) # just to verify

201315


In [279]:
vocab["the"]

41223601

In [282]:
unigram_model = build_unigram_model(vocab)

In [283]:
unigram_model["the"]

0.07755452743807591

# 1. Implementing various distances

### 1.1 Generic Distance function

Works with any function that takes two words as an input. 

In [72]:
'''This function applies a generic function over an entire vocabulary to compare the distance between a given word
and every word inside the vocabulary. It then returns the n_neighbors most similar words.

Parameters:
    word: The word to find neighbors to.
    vocabulary: A list (or list-like) of the vocabulary.
    func: The distance function to apply without arguments (without parentheses).
    minimum: Do we want the minimum distance? Boolean. True by default, will yield the minimum distance. If False, the function will yield the maximum distance. 
    n_neighbors: The number of most similar words to return.

Returns:
    vocabulary.head: A dataframe containing the n_neighbors most similar words to the input word, with the distances. 
'''
def generic_distance_correction(word, vocabulary, func, minimum = True, n_neighbors=1):
    vocabulary = pd.DataFrame(vocabulary, columns=["words"])

    def calculateDistance(series_word):
        return func(series_word, word)
    
    distances = vocabulary["words"].apply(calculateDistance)
    
    vocabulary["distance"] = distances

    if minimum:
        vocabulary = vocabulary.sort_values("distance", ascending=True)
    else: 
        vocabulary = vocabulary.sort_values("distance", ascending=False)

    return vocabulary.head(n_neighbors)

## 1.2. Testing the correction methods

In [73]:
generic_distance_correction('speling', vocab.keys(), editdistance.eval, True, 5) #Edit distance

Unnamed: 0,words,distance
171215,sperling,1
177635,spewing,1
189762,spelling,1
131284,pelling,2
180918,sewing,2


In [74]:
generic_distance_correction('speling', vocab.keys(), jellyfish.jaro_similarity, False, 5)

Unnamed: 0,words,distance
189762,spelling,0.958333
171215,sperling,0.958333
167278,spellings,0.925926
196330,sleeping,0.910714
200045,selling,0.904762


In [75]:
generic_distance_correction('speling', vocab.keys(), jellyfish.jaro_winkler_similarity, False, 5)

Unnamed: 0,words,distance
189762,spelling,0.975
171215,sperling,0.970833
167278,spellings,0.955556
177635,spewing,0.933333
196330,sleeping,0.919643


## 1.3. Running the correction methods on the data

### 1.3.1 - Get the typos:

In [95]:
with open(DATA_PATH + r"\typo-0.2.txt", "r", encoding=encoding) as file:
    text = file.read()


typo_pattern = r'<typo orig="([^"]+)">([^<]+)</typo>'


typos = re.findall(typo_pattern, text)


typos = pd.DataFrame(typos, columns=["Word", "Typo"])


typos.tail(10)

Unnamed: 0,Word,Typo
3173,defy,deefy
3174,translation,translatmion
3175,to,tho
3176,But,ut
3177,in,ini
3178,keep,kpeep
3179,game,gme
3180,winds,wantagh
3181,this,tsi
3182,of,o


### 1.3.2 - Apply correction methods: 


In [227]:
def get_correction_df(path, typos, func, minimum = True, n_neighbors=1):

    if os.path.isfile(path):
        final_df = pd.read_csv(path, index_col = 0)

    else: 
        
        final_df = typos.copy()
        rows = pd.DataFrame() 


        for i, typo in tqdm(enumerate(typos["Typo"].tolist()), desc="Correcting Typos", total=len(typos)):
            
            
            
            corrections = generic_distance_correction(typo, vocab.keys(), func, minimum, n_neighbors)
            new_row = corrections.transpose().reset_index(drop=True)
            
            #print(new_row)
            
            new_row.columns = [str(i) for i in range(len(new_row.columns))]
            
            #print(new_row)
            
            new_row = pd.concat([new_row.iloc[0], new_row.iloc[1]], axis=0).reset_index(drop=True)
            
            #print(new_row)
            
            new_row = pd.DataFrame(new_row.values.flatten()).T
            
            #print(new_row)
            

            
            #print(new_row)
            
            row = pd.DataFrame()
            
            
            
            
            for col in new_row.columns:
                
                if int(col) > 4 : row[f"distance {col-n_neighbors}"] = new_row[col]
                    
                else: row[f"correction {col}"] = new_row[col]
            
            
                
            rows = pd.concat([rows, row], axis=0).reset_index(drop=True)
            
        
      
        
        final_df = pd.concat([final_df, rows], axis=1)
            
        #print(final_df)
        

        final_df.to_csv(path)
    
    return final_df


**Jaro**

In [228]:
jaro_correction_df = get_correction_df(DATA_PATH + "\jaro_correction_df.csv", typos, jellyfish.jaro_similarity, False, 5)
jaro_correction_df

Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,wealthy,wealth,welty,healthy,watley,0.958333,0.916667,0.875000,0.869048,0.861111
1,afford,aford,axford,afford,alford,ford,walford,0.944444,0.944444,0.944444,0.933333,0.904762
2,Catholic,CatholiaCtholic,athol,pathological,palaeolithic,alghaithi,holi,0.777778,0.766667,0.766667,0.765741,0.755556
3,cousins,coxusins,cousins,cousin,compulsions,coxswain,cushions,0.958333,0.916667,0.837121,0.833333,0.833333
4,masks,mmasks,masks,asks,mass,mask,unmasks,0.944444,0.888889,0.888889,0.888889,0.849206
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,keep,peep,upkeep,pee,kee,0.933333,0.933333,0.877778,0.866667,0.866667
3179,game,gme,gme,gmes,gome,game,gamez,1.000000,0.916667,0.916667,0.916667,0.866667
3180,winds,wantagh,wantagh,wantage,wana,want,wang,1.000000,0.904762,0.857143,0.857143,0.857143
3181,this,tsi,tsi,utsi,tfsi,thsi,etsi,1.000000,0.916667,0.916667,0.916667,0.916667


**Jaro-Winkler**

In [225]:
jw_correction_df = get_correction_df(DATA_PATH + "\jw_correction_df.csv", typos, jellyfish.jaro_winkler_similarity, False, 5)
jw_correction_df

Correcting Typos: 100%|████████████████████████████████████████████████████████████| 3183/3183 [14:52<00:00,  3.57it/s]

  correction 0  correction 1  correction 2 correction 3 correction 4  \
0      wealthy        wealth         welty         weal     wealthtv   
1       afford        alford        axford         ford      affords   
2        athol  pathological  palaeolithic    alghaithi         holi   
3      cousins        cousin      coxswain  compulsions     cosiness   
4        masks          mask          mass         asks          mma   

  distance 0 distance 1 distance 2 distance 3 distance 4  
0      0.975       0.95        0.9        0.9        0.9  
1   0.955556       0.95       0.95   0.933333    0.92381  
2   0.777778   0.766667   0.766667   0.765741   0.755556  
3   0.966667   0.933333   0.883333   0.869697   0.866667  
4       0.95        0.9        0.9   0.888889   0.883333  





Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,wealthy,wealth,welty,weal,wealthtv,0.975,0.95,0.9,0.9,0.9
1,afford,aford,afford,alford,axford,ford,affords,0.955556,0.95,0.95,0.933333,0.92381
2,Catholic,CatholiaCtholic,athol,pathological,palaeolithic,alghaithi,holi,0.777778,0.766667,0.766667,0.765741,0.755556
3,cousins,coxusins,cousins,cousin,coxswain,compulsions,cosiness,0.966667,0.933333,0.883333,0.869697,0.866667
4,masks,mmasks,masks,mask,mass,asks,mma,0.95,0.9,0.9,0.888889,0.883333
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,keep,peep,kpe,kee,keeps,0.94,0.933333,0.906667,0.88,0.88
3179,game,gme,gme,gmes,game,gome,gomez,1.0,0.941667,0.925,0.925,0.88
3180,winds,wantagh,wantagh,wantage,want,wana,wang,1.0,0.942857,0.914286,0.9,0.9
3181,this,tsi,tsi,tsim,tsoi,tsai,tsui,1.0,0.941667,0.933333,0.933333,0.933333


**Edit Distance**

In [226]:
ed_correction_df = get_correction_df(DATA_PATH + "\ed_correction_df.csv", typos, editdistance.eval, True, 5)
ed_correction_df

Correcting Typos: 100%|████████████████████████████████████████████████████████████| 3183/3183 [34:41<00:00,  1.53it/s]

     correction 0     correction 1 correction 2 correction 3 correction 4  \
0         wealthy           wealth      healthy     wealthtv       fealty   
1          afford           alford         ford        afore       axford   
2  anglo-catholic  catholic-muslim   pathologic  paleolithic   shopaholic   
3         cousins           cousin      coupons     housings      coggins   
4           masks            basks       flasks         maks       smacks   

  distance 0 distance 1 distance 2 distance 3 distance 4  
0          1          2          2          3          3  
1          1          1          1          1          1  
2          7          7          7          7          7  
3          1          2          3          3          3  
4          1          2          2          2          2  





Unnamed: 0,Word,Typo,correction 0,correction 1,correction 2,correction 3,correction 4,distance 0,distance 1,distance 2,distance 3,distance 4
0,wealthy,wealtohy,wealthy,wealth,healthy,wealthtv,fealty,1,2,2,3,3
1,afford,aford,afford,alford,ford,afore,axford,1,1,1,1,1
2,Catholic,CatholiaCtholic,anglo-catholic,catholic-muslim,pathologic,paleolithic,shopaholic,7,7,7,7,7
3,cousins,coxusins,cousins,cousin,coupons,housings,coggins,1,2,3,3,3
4,masks,mmasks,masks,basks,flasks,maks,smacks,1,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
3178,keep,kpeep,peep,keep,peed,peeps,keeps,1,1,2,2,2
3179,game,gme,gme,gmb,gmh,fme,gmd,0,1,1,1,1
3180,winds,wantagh,wantagh,wantage,wattage,wastage,vantage,0,1,2,2,2
3181,this,tsi,tsi,asi,rsi,tfi,atsi,0,1,1,1,1


## 1.4 Return the text file with the corrections 

### 1.4.1 Functions

In [250]:
def format_correction(typo_row):
    name = typo_row["Word"]
    typo = typo_row['Typo']
    
    #print(typo_row)
    
    neigh0 = typo_row["correction 0"]
    neigh1 = typo_row["correction 1"]
    neigh2 = typo_row["correction 2"]
    neigh3 = typo_row["correction 3"]
    neigh4 = typo_row["correction 4"]
    return f"<correction orig=\"{name}\" typo=\"{typo}\">{neigh0} {neigh1} {neigh2} {neigh3} {neigh4}</correction>"

def format_typo(typo_row):
    orig = typo_row["Word"]
    typo = typo_row['Typo']
    return f'<typo orig="{orig}">{typo}</typo>'

def replace_typos(path, typo_df, typos_str): 
 
    formatted_corr = typo_df.apply(format_correction, axis=1)
    formatted_typo = typo_df.apply(format_typo, axis=1)

    formatted_dict = dict(zip(formatted_typo, formatted_corr))

    # Erase the contents of the file if it already exists
    if os.path.isfile(path):
        os.remove(path)

    for typo_pattern in formatted_dict.keys():
        correction_pattern = formatted_dict[typo_pattern]
        typos_str = re.sub(typo_pattern, correction_pattern, typos_str)

    # Write it to a .txt file
    with open(path, "a") as out_file:
        out_file.write(typos_str)

### 1.4.2 Replace typos for all distances

In [251]:
typos_file = open(DATA_PATH + r"\typo-0.2.txt").read()

**Jaro**

In [252]:
replace_typos(DATA_PATH + "\jaro_corrections-0.2.txt", jaro_correction_df, typos_file)

**Jaro-Winkler**

In [253]:
replace_typos(DATA_PATH + "\jw_corrections-0.2.txt", jw_correction_df, typos_file)

**Edit Distance**

In [254]:
replace_typos(DATA_PATH + "\ed_corrections-0.2.txt", ed_correction_df, typos_file)

# 2. Create evaluation metrics
Here's a couple of ideas:

- *Hard* accuracy: Does the first word match the original word?
- *Soft* accuracy: Is the original word in one of the neighbors? 

In [255]:
def evaluate_correction(path):
    corrected_file = open(path).read()
    correction_pattern = r'<correction.*?</correction>'
    matches = re.findall(correction_pattern, corrected_file)

    hardacc = []
    softacc = []
    extraction_pattern = '<correction orig="|" typo="|">|</correction>'
    for correction in matches: 
        subbed_corr = re.sub(extraction_pattern, " ", correction) # Remove all the fluff
        extracted_words = subbed_corr.split(" ")[1:-1] # remove the frst and last splits, which will always be empty

        original = extracted_words[0] 
        corrected = extracted_words[2:]

        hardacc.append(original == corrected[0])
        softacc.append(original in corrected)

    hard_accuracy = sum(hardacc) / len(hardacc)
    soft_accuracy = sum(softacc) / len(softacc)

    print(f"Hard accuracy: {round(hard_accuracy, 2)}\nSoft accuracy: {round(soft_accuracy, 2)}")
        
    return hard_accuracy, soft_accuracy

In [256]:
corr_path = DATA_PATH + r"\ed_corrections-0.2.txt"
hard, soft = evaluate_correction(corr_path)


Hard accuracy: 0.23
Soft accuracy: 0.38
