## Everything Included in Cli:

In [9]:
import re
import decimal
import pandas as pd

import sklearn_crfsuite # Had to install
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import joblib

import cli
import utils

In [2]:
data_path = "nyt-ingredients-snapshot-2015.csv"

In [3]:
# Generate data
training_size = 20000
testing_size = 2000

X_train, y_train = cli.generate_data(data_path, training_size, 0)
X_test, y_test = cli.generate_data(data_path, testing_size, training_size)

In [4]:
# View example training data point and label
X_train[0]

[{'0': '1$1/4', '1': 'I1', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'cups', '1': 'I2', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'cooked', '1': 'I3', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'and', '1': 'I4', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'pureed', '1': 'I5', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'fresh', '1': 'I6', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'butternut', '1': 'I7', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'squash', '1': 'I8', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': ',', '1': 'I9', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'or', '1': 'I10', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': '1', '1': 'I11', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': '10', '1': 'I12', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'ounce', '1': 'I13', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'package', '1': 'I14', '2': 'L4', '3': 'NoCAP', '4': 'NoPAREN'},
 {'0': 'frozen', '1': 'I15', '2': 'L4

In [6]:
y_train[0]

['B-QTY',
 'B-UNIT',
 'B-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'B-NAME',
 'I-NAME',
 'OTHER',
 'I-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'B-NAME',
 'OTHER',
 'I-COMMENT']

## Training CRF model

This code was adapted from the helpful tutorial at: https://github.com/TeamHG-Memex/sklearn-crfsuite/blob/master/docs/CoNLL2002.ipynb

In [10]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: total: 5.23 s
Wall time: 12.3 s


In [48]:
labels = list(crf.classes_)
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


np.float64(0.8287527222685849)

In [49]:
# group B and I results
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

   B-COMMENT      0.702     0.771     0.735      1536
   I-COMMENT      0.773     0.885     0.825      2999
     B-INDEX      0.000     0.000     0.000         0
      B-NAME      0.836     0.830     0.833      2114
      I-NAME      0.835     0.680     0.749      1474
       B-QTY      0.983     0.978     0.980      1728
       I-QTY      1.000     1.000     1.000         2
 B-RANGE_END      0.933     0.903     0.918        31
      B-UNIT      0.982     0.980     0.981      1480
       OTHER      0.713     0.483     0.576       896

    accuracy                          0.832     12260
   macro avg      0.776     0.751     0.760     12260
weighted avg      0.833     0.832     0.829     12260



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Storing the model using joblib

In [11]:
# Save model
joblib.dump(crf, "crf_model.pkl")

['crf_model.pkl']