# Data preprocessing

In this notebook, we reshape all our training data into a useful format.

In [None]:
import pandas as pd
import numpy as np
from Levenshtein import distance
import re
import time
import pickle as p
import random
from nltk import ngrams
import sys
#from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences

In [None]:
def lev_norm(str_1, str_2):
    """
    Normalised Levenshtein distance between two strings.

    The raw edit distance is divided by the length of the longer of the
    two strings.

    params:
        str_1: a string
        str_2: a string

    returns:
        0 if both strings are empty, otherwise the edit distance divided by
        the length of the longer string
    """
    edits = distance(str_1, str_2)
    longest = max(len(str_1), len(str_2))

    # Two empty strings would otherwise divide by zero.
    return edits / longest if longest != 0 else 0

In [None]:
def n_gram_regex(word, n = 2, ceiling = 4):
    """
    Creates a regex for blocking the data frame.

    The word is split into consecutive character n-grams. The regex matches
    any string in which one of the first `ceiling` n-grams starts within the
    first `ceiling` + 1 characters. Matching is case-insensitive.

    params:
        word: the string to build the blocking regex from
        n: length of each n-gram. Defaults to 2.
        ceiling: maximum number of leading n-grams to keep. Defaults to 4.

    returns:
        a compiled regex, or the string "no_regex" when no useful regex can
        be built (word too short, no n-grams kept, or the pattern does not
        compile)
    """
    # A word must hold at least one full n-gram; single characters are never
    # used for blocking, whatever n is.
    if n < 1 or len(word) < max(n, 2):
        return "no_regex"

    # Consecutive n-grams, e.g. 'parlil' -> ['pa', 'ar', 'rl', 'li', 'il']
    grams = [word[i:i + n] for i in range(len(word) - n + 1)]

    to_keep = min(ceiling, len(grams)) # Keep ceiling or fewer of the n-grams
    if to_keep < 1:
        # An empty alternation would match every string.
        return "no_regex"

    # Escape each n-gram so characters such as '.', '(' or '?' in the data
    # are matched literally instead of being read as regex syntax.
    kept = '|'.join(re.escape(gram) for gram in grams[:to_keep])
    wrapped = '^.{,' + str(to_keep) + '}(?:' + kept + ')'

    try:
        return re.compile(wrapped, flags = re.IGNORECASE|re.UNICODE)
    except re.error:
        return "no_regex"

In [None]:
# Sanity check: normalised distance between two similar spellings.
lev_norm("parlil","barllil")

## Section 1: Howitt-Fison data

In [None]:
# CSV holding the Howitt-Fison data (path is relative to the notebook).
path = 'from_the_page_out.csv'

# No encoding is passed to open(), so the platform default is used.
with open(path, 'r') as f:
    how_fis = pd.read_csv(f)

In [None]:
# List the distinct values of the Category column.
how_fis['Category'].unique()

In [None]:
# Make list of categories to keep, and drop excess columns.
# 'Places' needs its own entry: without a trailing comma Python joins adjacent
# string literals, which previously produced 'PlacesSocial Category' and so
# dropped every 'Places' row. 'Social Category' is already first in the list.
to_keep = ['Social Category','Cultural/Linguistic Group','Cultural/Linguistic Group|Language Term',
           'Kin Term','Kin Term|Language Term','Language Term','Language Term|People','Language Term|Places',
           'Places']
how_fis = how_fis[how_fis.Category.isin(to_keep)]
# NOTE: this removes the Category column, so the cell cannot be re-run
# without reloading how_fis first.
how_fis = how_fis[['Subject','Text']]

In [None]:
# How many of those pairs have non-identical spelling?
# Subject and Text are compared case-insensitively; exact duplicate rows are
# dropped from the displayed result.
how_fis[how_fis.Subject.str.lower() != how_fis.Text.str.lower()].drop_duplicates()

## Section 2: Austkin data

In [None]:
# CSV holding the Austkin data (path is relative to the notebook).
austkin_path = 'austkin_out.csv'

# This file is read as ISO-8859-2, unlike the UTF-8 Chirila files below.
# NOTE(review): austkin is not referenced again in the cells visible here.
with open(austkin_path, 'r', encoding = 'iso-8859-2') as file:
    austkin = pd.read_csv(file)

## Section 3: Chirila data

In [None]:
# One Chirila CSV per language (paths are relative to the notebook).
gam_path = 'gamilaraay_chirila_out.csv'
kur_path = 'kurnai_chirila_out.csv'

# Both files are read as UTF-8. The functions below use the OriginalForm and
# OriginalGloss columns of these frames.
with open(gam_path, 'r', encoding = 'utf-8') as file:
    gam = pd.read_csv(file)
with open(kur_path, 'r', encoding = 'utf-8') as file:
    kur = pd.read_csv(file)

This data is not already in pairs. We need to generate the pairs...

### Positive pairs: group each dataframe by OriginalGloss, extract all different spellings

NB: sometimes multiple spellings are in a single cell, separated by a comma.

Question: Are all words with the same gloss always the same word? Probably not always...

In [None]:
def generate_chirila_positive(chirila_data_frame, strictness = 0.7):
    """
    Takes a dataframe of chirila data, and generates positive pairs from it.

    Rows are grouped by OriginalGloss. Within each group that has more than
    one row, every OriginalForm is cleaned and split into individual
    spellings, and all pairs of spellings are formed. Pairs are kept when
    their normalised Levenshtein distance is greater than 0 (not identical)
    and less than `strictness`.

    params:
        chirila_data_frame: a pandas DataFrame created from a chirila csv.
            Must have the columns OriginalForm and OriginalGloss.
        strictness: a number between 0 and 1 (exclusive), the threshold for
            the Levenshtein test. Defaults to 0.7. Values outside the range
            are reset to 0.7 with a printed notice.

    returns:
        chirila_positives = a pandas DataFrame of the training pairs, with
        columns anchor, positive and pos_dist, and no duplicate rows

    raises:
        TypeError: if chirila_data_frame is not a pandas DataFrame
    """

    # Fail loudly on bad input. Returning the exception class (as before)
    # only postponed the failure to the first use of the result.
    if not isinstance(chirila_data_frame, pd.DataFrame):
        raise TypeError("Oops! That's not a DataFrame!")

    # Catch error with strictness.
    if not 0 < strictness < 1:
        strictness = 0.7
        print("Strictness set outside allowable range. Reset to 0.7.")

    # Initialise empty list for results:
    positive_pairs = []

    # Iterate over the groups directly. Appending from inside groupby.apply
    # relies on a side effect, and apply is not guaranteed to call the
    # function exactly once per group.
    grouped = chirila_data_frame[['OriginalForm', 'OriginalGloss']].groupby('OriginalGloss')
    for _, group in grouped:

        variants = group['OriginalForm'].tolist()
        if len(variants) == 1:  # If there is only one row for this gloss ...
            continue            # ... skip it

        # Separate all spellings that share a cell.
        spellings = []
        for string in variants:
            if not isinstance(string, str): # e.g. missing values
                continue
            string = re.sub(r'\(.+\)', ' ', string) # Remove parenthetical remarks
            string = re.sub(r'\\|\?|\*|\:|\(|\)', '', string) # Remove \ ? * : and stray parentheses
            string = re.sub(r'\.', ' ', string) # Remove full stops
            for substring in re.split(",|;| or ", string): # Split on commas, semicolons, or 'or'.
                substring = substring.strip()
                # Empty fragments can only form pairs at distance 1, which
                # the strictness filter always rejects, so drop them here.
                if substring:
                    spellings.append(substring)

        # Form all possible pairs within this gloss.
        while len(spellings) > 1:
            next_word = spellings.pop() # Get next word and remove from list
            for word in spellings:      # Now loop over all the other words.
                dist = lev_norm(next_word, word)
                positive_pairs.append({'anchor':next_word, 'positive':word, 'pos_dist':dist})

    # Naming the columns keeps the filter below working when no pairs were found.
    chirila_positives = pd.DataFrame(positive_pairs, columns = ['anchor', 'positive', 'pos_dist'])

    # Filter out unwanted rows: too distant, or identical.
    strict = chirila_positives.pos_dist < strictness
    not_zero = chirila_positives.pos_dist > 0
    chirila_positives = chirila_positives[strict & not_zero]

    # drop_duplicates returns a new frame, so the result must be kept.
    chirila_positives = chirila_positives.drop_duplicates()

    return chirila_positives

In [None]:
# Build the positive pairs for each language with a distance threshold of 0.7.
gam_pos = generate_chirila_positive(gam, 0.7)
kur_pos = generate_chirila_positive(kur, 0.7)

In [None]:
def generate_chirila_negative(data, strictness = 0.5):
    """
    A more efficient algorithm for finding negative training pairs.

    For every OriginalForm that occurs at least twice in `data`, the form is
    cleaned and split into individual spellings (anchors). For each anchor,
    candidate negatives are rows whose gloss does not contain the anchor's
    gloss and whose OriginalForm matches the anchor's blocking regex (see
    n_gram_regex). Candidates are kept when their normalised Levenshtein
    distance to the anchor is above 0.1 and below `strictness`, and are then
    reduced to one row per form and one row per gloss.

    params:
        data: a pandas DataFrame with the columns OriginalForm and
            OriginalGloss. Rows with a missing value in either are ignored.
        strictness: upper bound (exclusive) on the normalised Levenshtein
            distance of a negative match. Defaults to 0.5.

    returns:
        a pandas DataFrame with the columns anchor, anchor_gloss, neg_dist,
        neg_match and neg_gloss

    raises:
        RuntimeError: if the search for an anchor fails; the original
            exception is attached as the cause
    """
    columns = ['anchor','anchor_gloss','neg_dist','OriginalForm','OriginalGloss']

    # Work on an explicit copy, so the assignment below cannot trigger a
    # chained-assignment warning and no pandas option has to be changed.
    data = data[['OriginalForm','OriginalGloss']].dropna().copy()

    # Strip special characters from the gloss column. regex = True is needed
    # explicitly: pandas >= 2.0 treats the pattern as a literal by default.
    data['OriginalGloss'] = data['OriginalGloss'].str.replace(r'\W', '', regex = True)

    # Count each form once up front instead of rescanning the frame per row.
    form_counts = data['OriginalForm'].value_counts()

    # Per-anchor results, concatenated once at the end.
    frames = []

    print("Starting inner loop...")
    tick = time.perf_counter()
    rows = zip(data['OriginalForm'], data['OriginalGloss'])
    for position, (anchor, gloss) in enumerate(rows):

        # Report on row position: index labels have gaps after dropna.
        if position % 500 == 0:
            tock = time.perf_counter()
            minutes = int((tock - tick) / 60)
            seconds = int(tock - tick) % 60
            print(f"Up to row {position}. {minutes} minutes and {seconds} seconds elapsed.")

        # Skip if anchor is unique
        if form_counts[anchor] < 2:
            continue

        # Fix anchor word
        cleaned = re.sub(r'\(.+\)', ' ', anchor) # Remove parenthetical remarks
        cleaned = re.sub(r'\\|\?|\*|\:|\(|\)', '', cleaned) # Remove \ ? * : and stray parentheses
        cleaned = re.sub(r'\.', ' ', cleaned) # Remove full stops
        # Split on commas, semicolons, or 'or'.
        anchor_list = [substring.strip() for substring in re.split(",|;| or ", cleaned)]

        # Keep words with a different gloss. The gloss holds only word
        # characters at this point, so a literal substring test suffices.
        different_gloss = ~data['OriginalGloss'].str.contains(gloss, regex = False)

        for a in anchor_list:
            # Now set up search:
            regex = n_gram_regex(a)
            if not isinstance(regex, re.Pattern): # Skip if no regex found
                continue

            try:
                # Keep words that match the regex
                field = data[different_gloss & data['OriginalForm'].str.contains(regex)]

                # Skip if there are no possible matches
                if field.empty:
                    continue

                # Compute normalised Levenshtein distance with all of them
                field = field.assign(
                    neg_dist = field['OriginalForm'].map(lambda form: lev_norm(a, form))
                )

                # Filter according to Goldilocks
                too_close = field.neg_dist <= 0.1      # Too close and they might actually be the same word
                too_far = field.neg_dist >= strictness # Too far and they will be too easy to distinguish
                field = field[~too_close & ~too_far]
                if field.empty:
                    continue

                # Keep unique forms and glosses
                field = field.groupby('OriginalForm', as_index = False).first()
                field = field.groupby('OriginalGloss', as_index = False).first()
                field['anchor'] = a
                field['anchor_gloss'] = gloss

                frames.append(field)
            except Exception as exc:
                # Callers expect a DataFrame, so fail with context instead of
                # returning debugging state in its place.
                raise RuntimeError(
                    f"Negative search failed for anchor {a!r} (row {position})"
                ) from exc

    tock = time.perf_counter()
    minutes = int((tock - tick) / 60)
    seconds = int(tock - tick) % 60
    print(f"Inner loop complete. It took {minutes} minutes and {seconds} seconds.")

    # One concat at the end: DataFrame.append no longer exists in pandas 2.x
    # and re-copied the accumulator on every iteration.
    if frames:
        out = pd.concat(frames, sort = False)[columns]
    else:
        out = pd.DataFrame(columns = columns)

    out = out.rename(columns = {'OriginalForm':'neg_match', 'OriginalGloss':'neg_gloss'})
    return out

In [None]:
# Negative pairs for Gamilaraay. The function prints its progress and timing.
gam_neg = generate_chirila_negative(gam, strictness = 0.5)

In [None]:
# Negative pairs for Gunnaikurnai. The function prints its progress and timing.
kur_neg = generate_chirila_negative(kur, strictness = 0.5)

In [None]:
# Row counts of the positive and negative frames for each language.
print(f"{gam_pos.shape[0]} positive matches, and {gam_neg.shape[0]} negative matches were found for Gamilaraay.")
print(f"{kur_pos.shape[0]} positive matches, and {kur_neg.shape[0]} negative matches were found for Gunnaikurnai.\n")
# How many matches per anchor on average?
# (mean group size when the negative frame is grouped by anchor)
gam_avg = gam_neg.groupby(by = "anchor").size().mean()
kur_avg = kur_neg.groupby(by = "anchor").size().mean()
print(f"{gam_avg:.2f} negative matches were found for each Gamilaraay anchor on average.")
print(f"{kur_avg:.2f} negative matches were found for each Gunnaikurnai anchor on average.\n")

# Mean normalised distance of the positive and of the negative examples.
print(f"The mean Levenshtein distances (normalised) for Gamillaraay were:\nPositive examples: {gam_pos.pos_dist.mean():.3f}\nNegative examples: {gam_neg.neg_dist.mean():.3f}\n")
print(f"And for Gunnaikurnai:\nPositive examples: {kur_pos.pos_dist.mean():.3f}\nNegative examples: {kur_neg.neg_dist.mean():.3f}")


In [None]:
# Pickle each pair frame to its own file, in the same order as before.
pair_frames = [
    ("gam_pos.p", gam_pos),
    ("gam_neg.p", gam_neg),
    ("kur_pos.p", kur_pos),
    ("kur_neg.p", kur_neg),
]
for file_name, frame in pair_frames:
    with open(file_name, "wb") as f:
        p.dump(frame, f)

The moment of truth. Having generated data frames of positive and negative pairs, the time has come to join them...

In [None]:
def reduce_triples(data_frame, n = 30):
    """
    This function helps keep the size of the training set manageable, by only
    keeping the n-closest negative pairs for each (anchor, positive) pair.

    params:
        data_frame: a pandas DataFrame with the columns anchor, positive and
            neg_dist
        n: maximum number of rows to keep per (anchor, positive) pair, taking
            those with the smallest neg_dist. Defaults to 30.

    returns:
        a pandas DataFrame with the same columns, ordered by anchor, positive
        and then neg_dist, with a fresh 0-based index
    """
    # Sorting first means the first n rows of each group are its n smallest
    # distances. `n` is now honoured; it used to be ignored in favour of a
    # hard-coded 30.
    ordered = data_frame.sort_values(['anchor', 'positive', 'neg_dist'])
    df = ordered.groupby(['anchor', 'positive'], sort = False).head(n)
    df = df.reset_index(drop = True)
    return df

In [None]:
# Join the positive and negative frames on the columns they share, then
# discard every row the outer join left incomplete.
gam_triples = pd.merge(gam_pos, gam_neg, how = 'outer').dropna()

In [None]:
# Save the Gamilaraay triples; the vectorising section reloads this file.
with open("gam_triples.p", "wb") as gam_p:
    p.dump(gam_triples, gam_p)

In [None]:
# Join the positive and negative frames on the columns they share, then
# discard every row the outer join left incomplete.
kur_triples = pd.merge(kur_pos, kur_neg, how = 'outer').dropna()

In [None]:
# Save the Gunnaikurnai triples; the vectorising section reloads this file.
with open("kur_triples.p", "wb") as kur_p:
    p.dump(kur_triples, kur_p)

In [None]:
# How did we do?
# Report the number of rows in each triples frame.
print(f"{len(gam_triples)} training triples were generated for Gamilaraay.")
print(f"{len(kur_triples)} were generated for Gunnaikurnai.")

## Section 4: Vectorise the strings

The final step is to create a vectoriser and vectorise all the strings in the corpus, so that it can be used for machine learning.

In [None]:
# Load data if in a new session
# NOTE: unpickling can execute arbitrary code; only load files that this
# notebook wrote itself.
with open("gam_triples.p", "rb") as gam_p:
    gam_triples = p.load(gam_p)
with open("kur_triples.p", "rb") as kur_p:
    kur_triples = p.load(kur_p)

# Stack both languages into one frame, keeping each frame's own index labels.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
all_triples = pd.concat([gam_triples, kur_triples])

In [None]:
# Keep at most 30 negatives (the closest ones) per (anchor, positive) pair.
triples_reduced = reduce_triples(all_triples)

In [None]:
# The full frame is no longer needed once reduced; drop the name.
# Re-running this cell alone raises NameError until all_triples is rebuilt.
del all_triples

In [None]:
# Initialise a character-level tokenizer.
# NOTE(review): Tokenizer is only imported in a commented-out line of the
# imports cell, so on a fresh kernel this cell raises NameError until that
# import is restored.
one_hot_encoder = Tokenizer(filters = None, char_level = True)

# Fit to corpus: every distinct OriginalForm of both languages, as strings.
all_words = set(gam.OriginalForm.astype('str')).union(set(kur.OriginalForm.astype('str')))

# Fit in sorted order. A set of strings iterates in hash order, which changes
# between Python sessions; presumably that let equally frequent characters
# swap indices from run to run - TODO confirm against the Tokenizer in use.
one_hot_encoder.fit_on_texts(sorted(all_words))

In [None]:
# Convert to sequences and pad
# NOTE(review): pad_sequences is only imported in a commented-out line of the
# imports cell.
def _encode_and_pad(strings):
    """Encode each string with one_hot_encoder, then pad with maxlen = 10 and padding = 'post'."""
    sequences = one_hot_encoder.texts_to_sequences(strings)
    return pad_sequences(sequences, maxlen = 10, padding = 'post')

anchor_seq = _encode_and_pad(triples_reduced.anchor)
positive_seq = _encode_and_pad(triples_reduced.positive)
negative_seq = _encode_and_pad(triples_reduced.neg_match)

In [None]:
# Join sequences into 3d array
# Stacking on axis 1 puts anchor, positive and negative at positions 0, 1, 2
# of the second axis; presumably the result is (n_triples, 3, 10) - the next
# cell displays the actual shape.
data_tensor = np.stack([anchor_seq, positive_seq, negative_seq], axis = 1)

In [None]:
# Display the shape of the stacked array.
data_tensor.shape

In [None]:
# Get character dictionary from encoder
# (the encoder is character-level, so word_index is keyed by character)
char_dict = one_hot_encoder.word_index

In [None]:
# Save into dictionary and export
# The pickle holds the encoded triples under "data" and the character index
# under "char_dict", so the encoding can be reproduced by whoever loads it.
out = {"data":data_tensor, "char_dict":char_dict}

with open("data.p", "wb") as f:
    p.dump(out, f)