# 0. Initialisation

## 0.1 Imports

In [68]:
import pandas as pd
import numpy as np
from collections import Counter
import chardet
import editdistance #
import jellyfish
import re
from tqdm import tqdm
import os

#DATA_PATH = r"C:\Users\Louis\Documents\University\Masters\A23\NLP\Devoirs\data\hw2"
DATA_PATH = r"C:\Users\barka\Desktop\NLP"

## 0.2 Functions

In [69]:
def detect_encoding(file_path):

    with open(file_path, 'rb') as file:
        rawdata = file.read()
    result = chardet.detect(rawdata)
    return result['encoding']

def get_word_counter(file_path, encoding):
    word_counter = Counter()

    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            
            parts = line.split()
            if len(parts) >= 2:
                word = parts[1]
                
                word_counter[word.strip()] += 1

    # Return the Counter containing word frequencies
    return word_counter


## 0.3 Check data quality
Make sure Python is reading the lines correctly. And it does!

In [70]:
# Check ypos
typos_file = open(DATA_PATH + r"\typo-0.2.txt")

for i, row in enumerate(typos_file):
    #print(row)
    pass

print(f"There are {i+1} rows in the file. There should be 1000. Correct number? {i+1==1000}")

# Check vocabulary

encoding = detect_encoding(DATA_PATH + r"\voc-1bwc.txt") #check encoding
print(f"Encoding : {encoding}")

voc = open(DATA_PATH + r"\voc-1bwc.txt", encoding=encoding)

for i, row in enumerate(voc):
    #print(row)
    pass

print(f"There are {i+1} rows in the file. There should be 201 315. Correct number? {i+1==201315}")

There are 1000 rows in the file. There should be 1000. Correct number? True
Encoding : utf-8
There are 201315 rows in the file. There should be 201 315. Correct number? True


How many words in the vocabulary?

In [71]:
vocab = get_word_counter(DATA_PATH + r"\voc-1bwc.txt", encoding)
print(len(vocab)) # just to verify

201315


# 1. Implementing various distances

### 1.1 Generic Distance function

Works with any function that takes two words as an input. 

In [72]:
'''This function applies a generic function over an entire vocabulary to compare the distance between a given word
and every word inside the vocabulary. It then returns the n_neighbors most similar words.

Parameters:
    word: The word to find neighbors to.
    vocabulary: A list (or list-like) of the vocabulary.
    func: The distance function to apply without arguments (without parentheses).
    minimum: Do we want the minimum distance? Boolean. True by default, will yield the minimum distance. If False, the function will yield the maximum distance. 
    n_neighbors: The number of most similar words to return.

Returns:
    vocabulary.head: A dataframe containing the n_neighbors most similar words to the input word, with the distances. 
'''
def generic_distance_correction(word, vocabulary, func, minimum = True, n_neighbors=1):
    vocabulary = pd.DataFrame(vocabulary, columns=["words"])

    def calculateDistance(series_word):
        return func(series_word, word)
    
    distances = vocabulary["words"].apply(calculateDistance)
    
    vocabulary["distance"] = distances

    if minimum:
        vocabulary = vocabulary.sort_values("distance", ascending=True)
    else: 
        vocabulary = vocabulary.sort_values("distance", ascending=False)

    return vocabulary.head(n_neighbors)

## 1.2. Testing the correction methods

In [73]:
generic_distance_correction('speling', vocab.keys(), editdistance.eval, True, 5) #Edit distance

Unnamed: 0,words,distance
171215,sperling,1
177635,spewing,1
189762,spelling,1
131284,pelling,2
180918,sewing,2


In [74]:
generic_distance_correction('speling', vocab.keys(), jellyfish.jaro_similarity, False, 5)

Unnamed: 0,words,distance
189762,spelling,0.958333
171215,sperling,0.958333
167278,spellings,0.925926
196330,sleeping,0.910714
200045,selling,0.904762


In [75]:
generic_distance_correction('speling', vocab.keys(), jellyfish.jaro_winkler_similarity, False, 5)

Unnamed: 0,words,distance
189762,spelling,0.975
171215,sperling,0.970833
167278,spellings,0.955556
177635,spewing,0.933333
196330,sleeping,0.919643


## 1.3. Running the correction methods on the data

### 1.3.1 - Get the typos:

In [95]:
with open(DATA_PATH + r"\typo-0.2.txt", "r", encoding=encoding) as file:
    text = file.read()


typo_pattern = r'<typo orig="([^"]+)">([^<]+)</typo>'


typos = re.findall(typo_pattern, text)


typos = pd.DataFrame(typos, columns=["Word", "Typo"])


typos.tail(10)

Unnamed: 0,Word,Typo
3173,defy,deefy
3174,translation,translatmion
3175,to,tho
3176,But,ut
3177,in,ini
3178,keep,kpeep
3179,game,gme
3180,winds,wantagh
3181,this,tsi
3182,of,o


### 1.3.2 - Apply correction methods: 


In [170]:
def get_correction_df(path, typos, func, minimum = True, n_neighbors=1):

    if os.path.isfile(path):
        final_df = pd.read_csv(path, index_col = 0)

    else: 
        
        final_df = typos.copy()
        rows = pd.DataFrame() 


        for i, typo in tqdm(enumerate(typos["Typo"].tolist()), desc="Correcting Typos", total=len(typos)):
            
            
            
            corrections = generic_distance_correction(typo, vocab.keys(), func, minimum, n_neighbors)
            new_row = corrections.transpose().reset_index(drop=True)
            
            new_row.columns = [str(i) for i in range(len(new_row.columns))]
            
            new_row = pd.concat([new_row.iloc[0], new_row.iloc[1]], axis=0)
            new_row = new_row.reset_index(drop=True)
            
            new_row = pd.DataFrame(new_row)
            
            row = pd.DataFrame()
            
            print(new_row)
            
            for col in new_row.columns:
                
                if int(col) > 4 : row[f"d{col-n_neighbors}"] = new_row[col]
                    
                else: row[col] = new_row[col]
            
            
            if i == 10 : break
                
            rows = pd.concat([rows, row], axis=1)
            
        
        print(rows)
        
        final_df = pd.concat([final_df, rows], axis=0)
            
        print(final_df)
        

        final_df.to_csv(path)
    
    return final_df


**Jaro**

In [171]:
jaro_correction_df = get_correction_df(DATA_PATH + "\jaro_correction_df.csv", typos, jellyfish.jaro_similarity, False, 5)
jaro_correction_df

Correcting Typos:   0%|                                                               | 1/3183 [00:00<14:14,  3.72it/s]

          0
0   wealthy
1    wealth
2     welty
3   healthy
4    watley
5  0.958333
6  0.916667
7     0.875
8  0.869048
9  0.861111
          0
0    axford
1    afford
2    alford
3      ford
4   walford
5  0.944444
6  0.944444
7  0.944444
8  0.933333
9  0.904762


Correcting Typos:   0%|                                                               | 3/3183 [00:00<12:05,  4.38it/s]

              0
0         athol
1  pathological
2  palaeolithic
3     alghaithi
4          holi
5      0.777778
6      0.766667
7      0.766667
8      0.765741
9      0.755556


Correcting Typos:   0%|                                                               | 5/3183 [00:01<11:46,  4.50it/s]

             0
0      cousins
1       cousin
2  compulsions
3     coxswain
4     cushions
5     0.958333
6     0.916667
7     0.837121
8     0.833333
9     0.833333
          0
0     masks
1      asks
2      mass
3      mask
4   unmasks
5  0.944444
6  0.888889
7  0.888889
8  0.888889
9  0.849206


Correcting Typos:   0%|▏                                                              | 7/3183 [00:01<11:03,  4.79it/s]

          0
0       tos
1      tons
2      toms
3      toss
4      taos
5       1.0
6  0.916667
7  0.916667
8  0.916667
9  0.916667
          0
0       bac
1       bsa
2       bro
3       bks
4       ble
5  0.777778
6  0.777778
7  0.777778
8  0.777778
9  0.777778


Correcting Typos:   0%|▏                                                              | 8/3183 [00:01<11:23,  4.65it/s]

          0
0   rockers
1   brokers
2     roker
3   workers
4   yorkers
5  0.952381
6  0.952381
7  0.944444
8  0.896825
9  0.896825


Correcting Typos:   0%|▏                                                              | 9/3183 [00:02<12:20,  4.28it/s]

          0
0      tthe
1     tithe
2     tothe
3       the
4    tithes
5       1.0
6  0.933333
7  0.933333
8  0.916667
9  0.888889


Correcting Typos:   0%|▏                                                             | 10/3183 [00:02<13:27,  3.93it/s]

             0
0   government
1  government-
2  governments
3    governent
4   governemnt
5          1.0
6     0.969697
7     0.969697
8     0.966667
9     0.966667
          0
0      fist
1     feist
2     fists
3     foist
4     first
5       1.0
6  0.933333
7  0.933333
8  0.933333
9  0.933333





InvalidIndexError: Reindexing only valid with uniquely valued Index objects

**Jaro-Winkler**

In [144]:
jw_correction_df = get_correction_df(DATA_PATH + "\jw_correction_df.csv", typos, jellyfish.jaro_winkler_similarity, False, 5)
jw_correction_df

Correcting Typos: 100%|████████████████████████████████████████████████████████████| 3183/3183 [12:28<00:00,  4.25it/s]


Unnamed: 0,Word,Typo,0,1,2,3,4
0,wealthy,wealtohy,wealthy,wealth,welty,weal,wealthtv
1,afford,aford,afford,alford,axford,ford,affords
2,Catholic,CatholiaCtholic,athol,pathological,palaeolithic,alghaithi,holi
3,cousins,coxusins,cousins,cousin,coxswain,compulsions,cosiness
4,masks,mmasks,masks,mask,mass,asks,mma
...,...,...,...,...,...,...,...
3178,keep,kpeep,keep,peep,kpe,kee,keeps
3179,game,gme,gme,gmes,game,gome,gomez
3180,winds,wantagh,wantagh,wantage,want,wana,wang
3181,this,tsi,tsi,tsim,tsoi,tsai,tsui


**Edit Distance**

In [145]:
ed_correction_df = get_correction_df(DATA_PATH + "\ed_correction_df.csv", typos, editdistance.eval, True, 5)
ed_correction_df

Correcting Typos:  22%|█████████████▏                                               | 691/3183 [06:24<23:06,  1.80it/s]


KeyboardInterrupt: 

## 1.4 Return the text file with the corrections 

### 1.4.1 Functions

In [None]:
def format_correction(typo_row):
    name = typo_row["Word"]
    typo = typo_row['Typo']
    neigh0 = typo_row[0]
    neigh1 = typo_row[1]
    neigh2 = typo_row[2]
    neigh3 = typo_row[3]
    neigh4 = typo_row[4]
    return f"<correction orig=\"{name}\" typo=\"{typo}\">{neigh0} {neigh1} {neigh2} {neigh3} {neigh4}</correction>"

def format_typo(typo_row):
    orig = typo_row["Word"]
    typo = typo_row['Typo']
    return f'<typo orig="{orig}">{typo}</typo>'

def replace_typos(path, typo_df, typos_str): 
 
    formatted_corr = typo_df.apply(format_correction, axis=1)
    formatted_typo = typo_df.apply(format_typo, axis=1)

    formatted_dict = dict(zip(formatted_typo, formatted_corr))

    # Erase the contents of the file if it already exists
    if os.path.isfile(path):
        os.remove(path)

    for typo_pattern in formatted_dict.keys():
        correction_pattern = formatted_dict[typo_pattern]
        typos_str = re.sub(typo_pattern, correction_pattern, typos_str)

    # Write it to a .txt file
    with open(path, "a") as out_file:
        out_file.write(typos_str)

### 1.4.2 Replace typos for all distances

In [None]:
typos_file = open(DATA_PATH + r"\typo-0.2.txt").read()

**Jaro**

In [None]:
replace_typos(DATA_PATH + "\jaro_corrections-0.2.txt", jaro_typo_df, typos_file)

**Jaro-Winkler**

In [None]:
replace_typos(DATA_PATH + "\jw_corrections-0.2.txt", jw_typo_df, typos_file)

**Edit Distance**

In [None]:
replace_typos(DATA_PATH + "\ed_corrections-0.2.txt", ed_typo_df, typos_file)