# Experimenting with Conditional Random Fields

Let's see how well Dirko Coetsee's `pyhacrf` package does...

In [1]:
# Import necessary libraries

from pyhacrf import StringPairFeatureExtractor, Hacrf
import pandas as pd
import numpy as np
import pickle as p
from sklearn.model_selection import train_test_split

## Section 1: Get the data

In [140]:
# Get data
# Read the four pickled objects in a fixed order and bind them to their names.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
loaded = []
for pickle_name in ("gam_pos.p", "kur_pos.p", "gam_neg.p", "kur_neg.p"):
    with open(pickle_name, "rb") as handle:
        loaded.append(p.load(handle))
gam_pos, kur_pos, gam_neg, kur_neg = loaded

`pyhacrf` requires that the training data come in the form of two lists:

* x = a list of tuples, each of which contains two strings
* y = a list of strings, one per tuple, indicating whether that tuple is a 'match' or a 'non-match'

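To begin with, I reserve the Gamilaraay data as test data, to see if a model trained on one Aboriginal language can do well on another. (Note: as written, the cells below build the training examples from the Gamilaraay frames, `gam_pos` and `gam_neg`.)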

In [40]:
# Helper for keeping a few randomly chosen rows (five by default) from each group.
def keep_n(df, n = 5, random_state = None):
    """
    A little helper function that keeps n randomly selected rows from a data frame.
    Can be used with DataFrame.groupby.apply() to keep n rows from arbitrarily defined
    groups of a data frame.
    
    params:
        df: a Pandas DataFrame
        n: the number of rows to keep; if larger than len(df), every row is kept
           (in random order)
        random_state: optional source of randomness. None (the default) draws from
           NumPy's global random state, so np.random.seed() still controls the result.
           An object with a .choice() method (e.g. np.random.RandomState or
           np.random.Generator) is used directly; anything else is treated as a seed
           for a fresh np.random.RandomState.
    
    returns:
        selection: a shorter DataFrame with the required number of rows
    
    raises:
        ValueError: if n is negative
    """
    
    # A negative n would be passed on to NumPy as a sample size, so reject it up front.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    
    # Reset n if too large for this DataFrame:
    n = min(n, len(df))
    
    # Work out where the random numbers come from
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, "choice"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    
    # Randomly choose which row positions to keep (sampling without replacement)
    rows_to_keep = rng.choice(len(df), n, replace = False)
    
    # Keep them
    selection = df.iloc[rows_to_keep, :]
    
    return selection

In [39]:
# Scratch check: draw a single value from range(3) without replacement.
np.random.choice(range(3), 1, replace = False)

array([1])

In [141]:
# Reshape training data

# --- Positive examples: just get the juiciest ones ---
pos_far_enough = gam_pos.pos_dist > 0.4             # more than .4 apart in normalised Levenshtein (to avoid being too similar)
pos_close_enough = gam_pos.pos_dist < 0.5           # less than .5 apart (to avoid false positives)
pos_short_anchor = gam_pos.anchor.str.len() < 10    # anchor shorter than 10 characters (to avoid junk entries)
gam_pos = gam_pos[pos_far_enough & pos_close_enough & pos_short_anchor]

# Shuffle the surviving rows and renumber them from zero
gam_pos = gam_pos.sample(frac = 1).reset_index(drop = True)

# Plain (anchor, positive) tuples, one per row
pos_iter = gam_pos[['anchor','positive']].itertuples(index = False, name = None)
x = list(pos_iter)
y = ['match'] * len(gam_pos) # one label per positive pair

# --- Negative examples: again, just the juiciest ones ---
neg_far_enough = gam_neg.neg_dist > 0.4     # more than .4 apart to ensure no false negatives
neg_close_enough = gam_neg.neg_dist < 0.44  # less than .44 apart to ensure that they are not too dissimilar
gam_neg = gam_neg[neg_far_enough & neg_close_enough]

# Keep a single random example per anchor word, then shuffle and renumber
gam_neg = gam_neg.groupby('anchor').apply(keep_n, n = 1)
gam_neg = gam_neg.sample(frac = 1).reset_index(drop = True)

# Plain (anchor, neg_match) tuples, appended after the positive pairs
neg_iter = gam_neg[['anchor','neg_match']].itertuples(index = False, name = None)
x.extend(neg_iter)
y.extend(['non-match'] * len(gam_neg)) # one label per negative pair

In [142]:
# Report how many positive and negative training examples survived the filtering;
# the conditions applied above are meant to leave the two classes roughly balanced.
print(f"There are {len(gam_pos)} positive examples and {len(gam_neg)} negative ones.")

There are 881 positive examples and 947 negative ones.


## Section 2: Train the model...

In [143]:
# Create charset
# Glue the two strings of every training pair together
one_list = [''.join(pair) for pair in x]
# Lower-case each glued string
lowered = [joined.lower() for joined in one_list]
# Collect every distinct character that occurs anywhere
charset = {char for text in lowered for char in text}

In [156]:
# Extract features
feature_extractor = StringPairFeatureExtractor(match=True, transition=True, charset = charset)
x_extracted = feature_extractor.fit_transform(x)

# Train-test split
# Data has already been shuffled. x holds every positive example first, followed by
# every negative example, so each class occupies one contiguous range of positions.
pos_end = len(gam_pos)   # first position after the positive examples
neg_end = len(x)         # first position after the negative examples
pos_split = int(np.ceil(pos_end * 0.8))                 # 80% cut-off within the positives
neg_split = pos_end + int(np.ceil(len(gam_neg) * 0.8))  # 80% cut-off within the negatives

# The first 80% of each class goes to training, the remainder to testing
pos_train, neg_train = slice(0, pos_split), slice(pos_end, neg_split)
pos_test, neg_test = slice(pos_split, pos_end), slice(neg_split, neg_end)

x_train = x_extracted[pos_train] + x_extracted[neg_train]
y_train = y[pos_train] + y[neg_train]

x_test = x_extracted[pos_test] + x_extracted[neg_test]
y_test = y[pos_test] + y[neg_test]

# Dimensions
print(f"x_train: {len(x_train)}\ny_train: {len(y_train)}\nx_test: {len(x_test)}\ny_test: {len(y_test)}")

x_train: 1463
y_train: 1463
x_test: 365
y_test: 365


Each training example is now a rank-3 tensor. The rows represent the first string in the training pair, the columns the second string, and each position holds a feature vector (1683-dimensional in the run shown below) representing which characters have been switched for which in each place.

In [157]:
# Shape of the first extracted training example.
x_train[0].shape

(4, 7, 1683)

In [None]:
# Train model
# Build an Hacrf model with l2_regularization=1.0 and fit it on the training split.
# NOTE(review): verbosity = 3 presumably controls how much progress output fit() prints —
# confirm against the pyhacrf documentation.
model = Hacrf(l2_regularization=1.0)
model.fit(x_train, y_train, verbosity = 3)