In [2]:
%load_ext autoreload
%autoreload 2

# Long input strings

For certain tasks it might make more sense to tokenize input strings first and then extract features on these string lists rather than on the original character lists.

To demonstrate this I'll take some example strings from [highered](https://github.com/datamade/highered/) and learn models using these two feature extraction techniques.

## Training examples

In [3]:
X = [(u'caring hands a step ahead', u'el valor little tykes ii'),
  (u'dulles', u"chicago public schools o'keeffe, isabell c."),
  (u'erie neighborhood house fcch-carmen l. vega site',
   u'erie neighborhood house fcch-servia galva site'),
  (u'chicago public schools dvorak math & science tech academy, anton',
   u'chicago public schools perez, manuel'),
  (u'v & j day care center', u"henry booth house granny's day care center"),
  (u'home of life community dev. corp. - home of life just for you',
   u'urban family and community centers'),
  (u'carole robertson center for learning fcch-ileana gonzalez',
   u'carole robertson center for learning fcch-rhonda culverson'),
  (u'bethel new life bethel child development',
   u'mary crane league mary crane center (lake & pulaski)'),
  (u'easter seals society of metropolitan chicago - stepping stones early/childhood lear',
   u"marcy newberry association kenyatta's day care"),
  (u'westside holistic family services westside holistic family services',
   u'childserv lawndale'),
  
  (u'higgins', u'higgins'),
  (u'ymca south side', u'ymca of metropolitan chicago - south side ymca'),
  (u'chicago commons association paulo freire',
   u'chicago commons association paulo freire'),
  (u'fresh start daycare, inc.',
   u'easter seals society of metropolitan chicago fresh start day care center'),
  (u'el valor teddy bear 3', u'teddy bear 3'),
  (u'chicago child care society chicago child care society',
   u'chicago child care society-child and family dev center'),
  (u'hull house - uptown', u'uptown family care center')]
Y = [u'distinct',
  u'distinct',
  u'distinct',
  u'distinct',
  u'distinct',
  u'distinct',
  u'distinct',
  u'distinct',
  u'distinct',
  u'distinct',
  u'match',
  u'match',
  u'match',
  u'match',
  u'match',
  u'match',
  u'match']

In [4]:
from pyhacrf import StringPairFeatureExtractor, Hacrf
from scipy.optimize import fmin_l_bfgs_b
import numpy as np

## Character level features

In [5]:
# Extract features
feature_extractor = StringPairFeatureExtractor(match=True, numeric=True)
X_extracted = feature_extractor.fit_transform(X)

In [10]:
#%%timeit -n1 -r1
# Train model
model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 10})
model.fit(X_extracted, Y, verbosity=1)

Iteration  Log-likelihood |gradient|
         0     -11.78      652.6
         1     -609.0  1.575e+03
         2     -54.72  1.571e+03
         3     -11.31      563.1
         4     -10.83      144.8
         5     -10.78      120.7
         6      -10.7      146.0
         7     -10.43      252.0
         8     -10.13      331.2
         9     -9.795      253.3
        10      -9.57      104.9


<pyhacrf.pyhacrf.Hacrf at 0x7fe671b7af28>

In [11]:
#%%timeit -n1 -r1
# Evaluate
from sklearn.metrics import confusion_matrix
predictions = model.predict(X_extracted)
print(confusion_matrix(Y, predictions))
print(model.predict_proba(X_extracted))


[[2 8]
 [3 4]]
[[0.6172364  0.3827636 ]
 [0.31317953 0.68682047]
 [0.62107204 0.37892796]
 [0.85327835 0.14672165]
 [0.43888254 0.56111746]
 [0.85518886 0.14481114]
 [0.66413495 0.33586505]
 [0.59691953 0.40308047]
 [0.9149792  0.0850208 ]
 [0.91642262 0.08357738]
 [0.48684886 0.51315114]
 [0.3672367  0.6327633 ]
 [0.56645306 0.43354694]
 [0.32613658 0.67386342]
 [0.62561551 0.37438449]
 [0.65620343 0.34379657]
 [0.55173875 0.44826125]]


## Token level features

In [12]:
from pyhacrf import PairFeatureExtractor

In [13]:
tokX = [[sentence.split(' ') for sentence in pair] for pair in X]

In [14]:
real = [
    lambda i, j, s1, s2: 1.0,
    lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] else 0.0,
    lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] and len(s1[i]) >= 6 else 0.0,
    lambda i, j, s1, s2: 1.0 if s1[i].isdigit() and s2[j].isdigit() and s1[i] == s2[j] else 0.0,
    lambda i, j, s1, s2: 1.0 if s1[i].isalpha() and s2[j].isalpha() and s1[i] == s2[j] else 0.0,
    lambda i, j, s1, s2: 1.0 if not s1[i].isalpha() and not s2[j].isalpha() else 0.0
]
# Other ideas are:
#  to look up whether words are dictionary words,
#  longest common subsequence,
#  standard edit distance
feature_extractor = PairFeatureExtractor(real=real)
X_extracted = feature_extractor.fit_transform(tokX)

TypeError: <lambda>() missing 2 required positional arguments: 's1' and 's2'

In [None]:
#%%timeit -n1 -r1
# Train model
model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 400})
model.fit(X_extracted, Y, verbosity=10)

In [None]:
%%timeit -n1 -r1
# Evaluate
from sklearn.metrics import confusion_matrix
predictions = model.predict(X_extracted)
print(confusion_matrix(Y, predictions))
print(model.predict_proba(X_extracted))

## Edit distance and word frequency features

Let's also add the the Levenschtein distance as a features. 

When we peek at the training examples, it looks as if less common words should be more informative of a match - let's add a feature for the word frequency as well.

In [None]:
import editdistance

In [None]:
editdistance.eval('cheese', 'kaas')

In [None]:
tokX = [[sentence.split(' ') for sentence in pair] for pair in X]

In [None]:
real = [
    lambda i, j, s1, s2: 1.0,
    lambda i, j, s1, s2: 1.0 if s1[i] == s2[j] else 0.0,
    lambda i, j, s1, s2: 1.0 if s1[i].isdigit() and s2[j].isdigit() and s1[i] == s2[j] else 0.0,
    lambda i, j, s1, s2: 1.0 if not s1[i].isalpha() and not s2[j].isalpha() else 0.0,
    lambda i, j, s1, s2: editdistance.eval(s1[i], s2[j]),
    lambda i, j, s1, s2: np.log(editdistance.eval(s1[i], s2[j]) + 1),
    lambda i, j, s1, s2: (editdistance.eval(s1[i], s2[j])) / max(len(s1[i]), len(s2[j])),
    lambda i, j, s1, s2: 1.0 - (editdistance.eval(s1[i], s2[j])) / max(len(s1[i]), len(s2[j]))
]
# Other ideas are:
#  to look up whether words are dictionary words,
#  longest common subsequence,
#  standard edit distance

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split

In [None]:
# Train model
errors_val = []
errors_train = []
for i, featureset in enumerate([[0, 1],
                                [0, 1, 2],
                                [0, 1, 2, 3],
                                [0, 4], 
                                [0, 1, 4], 
                                [0, 1, 2, 3, 4],
                                [0, 5],
                                [0, 1, 5],
                                [0, 1, 2, 3, 5],
                                [0, 6],
                                [0, 1, 6],
                                [0, 1, 2, 3, 6],
                                [0, 7],
                                [0, 1, 7],
                                [0, 1, 2, 3, 7]]):
    print '{:4}{:18}'.format(i, featureset),
    errs_val = []
    errs_train = []
    for repeat in xrange(15):
        x_train, x_val, y_train, y_val = train_test_split(tokX, Y, test_size=0.2)
        feature_extractor = PairFeatureExtractor(real=[real[f] for f in featureset])
        X_extracted = feature_extractor.fit_transform(x_train)

        model = Hacrf(l2_regularization=1.0, optimizer=fmin_l_bfgs_b, optimizer_kwargs={'maxfun': 400})
        model.fit(X_extracted, y_train)
        
        predictions = model.predict(X_extracted)
        err_train = 1.0 - accuracy_score(y_train, predictions)
        
        X_extracted = feature_extractor.transform(x_val)
        predictions = model.predict(X_extracted)
        err_val = 1.0 - accuracy_score(y_val, predictions)
        if repeat % 10 == 0:
            print '{:.2f}'.format(err_train),
            print '{:.2f}'.format(err_val),
        errs_val.append(err_val)
        errs_train.append(err_train)
    print '  => {:.2f} +- {:.2f} | {:.2f} +- {:.2f}'.format(np.average(errs_train), 
                                                            np.std(errs_train),
                                                            np.average(errs_val), 
                                                            np.std(errs_val))
    errors_train.append(errs_train)
    errors_val.append(errs_val)

## Conclusion

It seems that tokenising the text not only speeds up training and scoring by 40x, it also improves the predictions. We definitely need more data to do this properly though.