# Experimenting with Conditional Random Fields

Let's see how well Dirko Coetsee's `pyhacrf` package does...

In [1]:
# Import necessary libraries

from pyhacrf import StringPairFeatureExtractor, Hacrf
import pandas as pd
import numpy as np
import pickle as p
from sklearn.model_selection import train_test_split

## Section 1: Get the data

In [140]:
# Get data
with open("gam_pos.p", "rb") as f:
    gam_pos = p.load(f)
with open("kur_pos.p", "rb") as f:
    kur_pos = p.load(f)
with open("gam_neg.p", "rb") as f:
    gam_neg = p.load(f)
with open("kur_neg.p", "rb") as f:
    kur_neg = p.load(f)

`pyhacrf` requires that the training data come in the form of two lists:

* x = a list of tuples, each of which contains two strings
* y = a list of strings, indicating whether the tuples are a 'match' or 'non-match'

To begin with, I reserve the Gamilaraay data as test data, to see if a model trained on one aboriginal language can do well on another.

In [40]:
# Keep five random negative matches for each anchor.
def keep_n(df, n = 5):
    """
    A little helper function that keeps n randomly selected rows from a data frame.
    Can be used with DataFrame.groupby.apply() to keep n rows from arbitarily defined
    groups of a data frame.
    
    params:
        df: a Pandas DataFrame
        n: the number of rows to keep
    
    returns:
        selection: a shorter DataFrame with the required number of rows
    """
    
    # Reset n if too large for this DataFrame:
    if n > len(df):
        n = len(df)
    
    # Randomly choose which rows to keep
    rows_to_keep = np.random.choice(range(len(df)), n, replace = False)
    
    # Keep them
    selection = df.iloc[rows_to_keep, :]
    
    return selection

In [39]:
np.random.choice(range(3), 1, replace = False)

array([1])

In [141]:
# Reshape training data
x = []
y = []

# Just get juiciest training examples.
gam_pos = gam_pos[
    (gam_pos.pos_dist > 0.4) & # more than .4 apart in normalised Levenshtein (to avoid being too similar)
    (gam_pos.pos_dist < 0.5) & # no more than .5 apart (to avoid false positives)
    (gam_pos.anchor.str.len() < 10) # no more than 10 characters long (to avoid junk entries)
]
gam_pos = gam_pos.sample(frac = 1).reset_index(drop = True) # Shuffle training examples 
pos_iter = gam_pos[['anchor','positive']].itertuples(index = False, name = None) # Create iterator

x += list(pos_iter) # Add to x list
y += ['match' for i in range(len(gam_pos))] # Generate appropriate y labels

# Get the juiciest negative examples
gam_neg = gam_neg[
    (gam_neg.neg_dist > 0.4) & # more than .4 apart to ensure no false negatives
    (gam_neg.neg_dist < 0.44) # no more than .44 apart to ensure that they are no too dissimilar
]
gam_neg = gam_neg.groupby('anchor').apply(keep_n, n = 1) # just keep one random example per anchor word
gam_neg = gam_neg.sample(frac = 1).reset_index(drop = True) # Shuffle training examples 
neg_iter = gam_neg[['anchor','neg_match']].itertuples(index = False, name = None)
x += list(neg_iter) # Add to x list
y += ['non-match' for i in range(len(gam_neg))]

In [142]:
# Applying those conditions has created a roughly equal number of positive and negative training examples:
print(f"There are {len(gam_pos)} positive examples and {len(gam_neg)} negative ones.")

There are 881 positive examples and 947 negative ones.


## Section 2: Train the model...

In [143]:
# Create charset
# Join all the tuples into one list
one_list = [''.join(list(x)) for x in x]
# Put in lower case
lowered = [x.lower() for x in one_list]
# Join into single string and set
charset = set(''.join(lowered))

In [156]:
# Extract features
feature_extractor = StringPairFeatureExtractor(match=True, transition=True, charset = charset)
x_extracted = feature_extractor.fit_transform(x)

# Train-test split
# Data has already been shuffled
pos_split = int(np.ceil(len(gam_pos) * 0.8)) # Where is the 80th percentile in the positive examples?
pos_end = len(gam_pos)
neg_split = int(np.ceil(len(gam_neg) * 0.8)) + len(gam_pos) # Where is the 80th percentile in the negative ones?
neg_end = len(x)

# Take equal portions of the positive and negative examples
x_train = x_extracted[0:pos_split] + x_extracted[pos_end:neg_split]
y_train = y[0:pos_split] + y[pos_end:neg_split]

x_test = x_extracted[pos_split:pos_end] + x_extracted[neg_split:neg_end]
y_test = y[pos_split:pos_end] + y[neg_split:neg_end]

# Dimensions
print(f"x_train: {len(x_train)}\ny_train: {len(y_train)}\nx_test: {len(x_test)}\ny_test: {len(y_test)}")

x_train: 1463
y_train: 1463
x_test: 365
y_test: 365


Now each training example consists of a rank-3 tensor. The rows represent the first string in the training pair, the columns the second string, and in each position is a ~1800-dimensional vector representing which characters have been switched for which in each place.

In [157]:
x_train[0].shape

(4, 7, 1683)

In [158]:
# Train model
model = Hacrf(l2_regularization=1.0)
model.fit(x_train, y_train, verbosity = 3)

Iteration  Log-likelihood |gradient|
         0 -1.014e+03  7.996e+03
         3  -1.01e+03  6.481e+03
         6     -974.3   5.84e+03
         9     -836.6  3.738e+03
        12 -1.032e+03  3.882e+04
        15     -698.5  2.439e+03
        18     -639.6  1.817e+03
        21     -630.3  2.765e+03
        24     -620.4  1.998e+03
        27     -661.0  3.354e+03
        30     -589.5  2.382e+03
        33     -580.0  1.705e+03
        36     -560.5  1.525e+03
        39     -557.6  1.127e+03
        42     -556.0  1.587e+03
        45     -551.5  1.217e+03
        48     -549.8   1.07e+03
        51     -548.1   1.37e+03
        54     -538.4  1.045e+03
        57     -536.9  1.064e+03
        60     -536.1  1.093e+03
        63     -533.4  3.222e+03
        66     -530.5      970.1
        69     -525.4      900.8
        72     -524.3  2.731e+03
        75     -521.4      936.7
        78     -519.4      841.9
        81     -517.8  1.045e+03
        84     -516.1  1.027e+03
      

       744     -473.0      110.7
       747     -473.0      70.55
       750     -473.0      64.18
       753     -473.0      74.72
       756     -473.0      212.5
       759     -472.9      52.81
       762     -472.9      46.94
       765     -473.2  1.039e+03
       768     -472.9      76.17
       771     -472.9      50.43
       774     -472.9      64.99
       777     -472.9      55.74
       780     -472.9      54.95
       783     -472.9      61.52
       786     -472.8      57.44
       789     -472.9      269.4
       792     -472.8      42.19
       795     -472.8      41.12
       798     -475.1  2.622e+03
       801     -472.8      48.39
       804     -472.8      138.4
       807     -472.8      77.53
       810     -472.8       55.6
       813     -472.8      114.9
       816     -472.8      44.36
       819     -472.8      58.29
       822     -472.8      87.95
       825     -472.8      59.05
       828     -472.8      42.89
       831     -472.7      42.99
       834

      1491     -472.4      2.546
      1494     -472.4      2.064
      1497     -472.4      3.388
      1500     -472.4      2.297
      1503     -472.4      2.108
      1506     -472.4      4.142
      1509     -472.4      1.935
      1512     -472.4      1.629
      1515     -472.4      2.096
      1518     -472.4      1.722
      1521     -472.4      3.932
      1524     -472.4       1.81
      1527     -472.4      1.521
      1530     -472.4      1.139
      1533     -472.4      1.336
      1536     -472.4     0.9482
      1539     -472.4      1.012
      1542     -472.4      1.416
      1545     -472.4      1.145
      1548     -472.4      0.848
      1551     -472.4      1.431
      1554     -472.4      1.255
      1557     -472.4     0.8181
      1560     -472.4      1.287
      1563     -472.4      1.201
      1566     -472.4     0.7484
      1569     -472.4     0.9212
      1572     -472.4      1.001
      1575     -472.4     0.7641
      1578     -472.4      6.724
      1581

      2238     -472.4    0.01065
      2241     -472.4     0.0108
      2244     -472.4    0.04786
      2247     -472.4   0.008558
      2250     -472.4    0.01147
      2253     -472.4   0.009365
      2256     -472.4    0.00753
      2259     -472.4   0.009133
      2262     -472.4    0.07507
      2265     -472.4   0.005569
      2268     -472.4     0.1264
      2271     -472.4   0.006693
      2274     -472.4   0.006486
      2277     -472.4   0.004896
      2280     -472.4   0.006474
      2283     -472.4   0.006788
      2286     -472.4    0.00692
      2289     -472.4   0.006815
      2292     -472.4   0.004765
      2295     -472.4   0.007478
      2298     -472.4   0.005512
      2301     -472.4   0.005916
      2304     -472.4   0.003814
      2307     -472.4    0.00396
      2310     -472.4   0.003946
      2313     -472.4   0.003837
      2316     -472.4   0.004067
      2319     -472.4   0.005478
      2322     -472.4   0.005956
      2325     -472.4   0.004003
      2328

<pyhacrf.pyhacrf.Hacrf at 0x1166ec7b8>

In [182]:
# How does the model do on the test set?

# Generate a dictionary mapping class names onto numbers
class_dict = {value:key for key,value in enumerate(model.classes)}

# Get predictions on test set:
test_pred = model.predict_proba(x_test) # Probabilities output by model
test_pred = np.argmax(test_pred, axis = 1) # Max probability = predicted class

# Apply dict to y_test
test_classes = np.array([class_dict[x] for x in y_test], dtype='int64')

# Now compare test data to predictions
correct = test_pred == test_classes
acc = correct.sum() / len(correct)

true_positives = (test_pred == True) & (test_classes == True)
false_positives = (test_pred == True) & (test_classes == False)
false_negatives = (test_pred == False) & (test_classes == True)

# Precision:
precision = true_positives.sum() / (true_positives.sum() + false_positives.sum())
recall = true_positives.sum() / (true_positives.sum() + false_negatives.sum())

# The moment of truth:
print(f"The model has achieved {acc*100:.0f}% accuracy, with a precision of {precision:.2f} and a recall of {recall:.2f}.")

The model has achieved 80% accuracy, with a precision of 0.84 and a recall of 0.75.


In [186]:
gk = feature_extractor.transform([("kookaburra","gooburra")])
model.predict_proba(gk)

array([[0.21493635, 0.78506365]])

In [188]:
# Save the key stuff.
to_save = {'feature_extractor':feature_extractor, 'model':model}

with open('crf_trained.p', 'wb') as file:
    p.dump(to_save, file)