# Prepare Data

In [1]:
df_train = pd.read_csv("C:/Users/ac/OneDrive/Desktop/Projects/Project 9/train.csv")
df_test = pd.read_csv("C:/Users/ac/OneDrive/Desktop/Projects/Project 9/test.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
df_train.head()

Unnamed: 0,source,ingredient_id,token_id,token,label
0,ar,0,0,4,QUANTITY
1,ar,0,1,cloves,UNIT
2,ar,0,2,garlic,NAME
3,ar,1,0,2,QUANTITY
4,ar,1,1,tablespoons,UNIT


In [11]:
def data_filename(source, split):
    return f'{source}_{split}.tsv'

Let's combine the training and test data to make the transformation simpler.

In [12]:
df = pd.concat([df_train.assign(split='train'), df_test.assign(split='test')], axis=0, ignore_index=True)
df

Unnamed: 0,source,ingredient_id,token_id,token,label,split
0,ar,0,0,4,QUANTITY,train
1,ar,0,1,cloves,UNIT,train
2,ar,0,2,garlic,NAME,train
3,ar,1,0,2,QUANTITY,train
4,ar,1,1,tablespoons,UNIT,train
...,...,...,...,...,...,...
49909,gk,1703,10,-RRB-,O,test
49910,gk,1704,0,1,QUANTITY,test
49911,gk,1704,1,sprig,O,test
49912,gk,1704,2,celery,NAME,test


In [13]:
assert not df.duplicated(['source', 'ingredient_id', 'token_id', 'split']).any()

To add the blank line between ingredients we'll insert a blank at token_id -1.

In [14]:
df_spacer = (
    df[['source', 'ingredient_id', 'split']]
    .drop_duplicates()
    .assign(token_id=-1,
            label='',
            token='')
)

In [15]:
df_out = (
    pd.concat([df_spacer, df], axis=0, ignore_index=True)
    .sort_values(['source', 'split', 'ingredient_id', 'token_id'])
)

It contains quotes and so CSV quote escaping will do some funny things.

In [16]:
df_out.token.str.contains('"').any()

True

It doesn't contain `_`, so let's use that as the quote character.

In [17]:
df_out.token.str.contains('_').any()

False

In [18]:
for source in ['ar', 'gk']:
    for split in ['train', 'test']:
        (df_out
        .query('split == @split & source == @source')
        .to_csv(data_filename(source, split),
                sep='\t',
                header=False,
                index=False,
                quotechar='_',
                columns=['token', 'label'])
        )

Check output sizes

In [19]:
df_out.groupby(['source', 'split'])['token'].count()

source  split
ar      test      3271
        train     9682
gk      test     11591
        train    34169
Name: token, dtype: int64

In [20]:
!wc -l *.tsv

  3271 ar_test.tsv
  9682 ar_train.tsv
 11591 gk_test.tsv
 34170 gk_train.tsv
 58714 total


Check there are no _ in the output (i.e. quoting wasn't used).

In [21]:
!grep '_' *.tsv

Each line of the file is either a single tab (separating different texts), or a token followed by a tab and then the entity type.

So for example the first ingredient is `4 cloves garlic`, which is a quantity (4) followed by a unit (cloves) and a name (garlic).

In [22]:
!head {data_filename('ar', 'train')} | cat -t

^I
4^IQUANTITY
cloves^IUNIT
garlic^INAME
^I
2^IQUANTITY
tablespoons^IUNIT
vegetable^INAME
oil^INAME
,^IO


# Train NER Model

Now we want to train a Stanford NER model on the new annotations.

First we have to configure it; but there's no information on the paper on how it's configured.
I've copied this template configuration out of the [FAQ](https://nlp.stanford.edu/software/crf-faq.html)
For more information on the parameters you can check the [NERFeatureFactory documentation](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/NERFeatureFactory.html) or the [source](https://github.com/stanfordnlp/CoreNLP/blob/main/src/edu/stanford/nlp/ie/NERFeatureFactory.java).

In [23]:
from typing import List

def ner_prop_str(train_files: List[str], test_files: List[str], output: str) -> str:
    """Returns configuration string to train NER model"""
    train_file_str = ','.join(train_files)
    test_file_str = ','.join(test_files)
    return f"""
trainFileList = {train_file_str}
testFiles = {test_file_str}
serializeTo = {output}
map = word=0,answer=1

useClassFeature=true
useWord=true
useNGrams=true
noMidNGrams=true
maxNGramLeng=6
usePrev=true
useNext=true
useSequences=true
usePrevSequences=true
maxLeft=1
useTypeSeqs=true
useTypeSeqs2=true
useTypeySequences=true
wordShape=chris2useLC
useDisjunctive=true
"""

This is expected to be a file, so let's write a helper that writes it to a file. (An alternative would be to pass these as arguments to the trainer).

In [24]:
def write_ner_prop_file(ner_prop_file: str, train_files: List[str], test_files: List[str], output_file: str) -> None:
    with open(ner_prop_file, 'wt') as f:
        props = ner_prop_str(train_files, test_files, output_file)
        f.write(props)

Stanza doesn't give an interface to train a CRF NER model using Stanford NLP, but we can invoke `edu.stanford.nlp.ie.crf.CRFClassifier` directly.

Let's write a properties file and invoke Java to run the classifier.
It prints a lot of training information, and importantly a summary report at the end which we want to see.

In [25]:
import subprocess
from typing import List

def train_model(model_name, train_files: List[str], test_files: List[str], print_report=True, classpath=classpath) -> str:
    """Trains CRF NER Model using StanfordNLP"""
    model_file = f'{model_name}.model.ser.gz'
    ner_prop_filename = f'{model_name}.model.props'
    write_ner_prop_file(ner_prop_filename, train_files, test_files, model_file)
        
    result = subprocess.run(
                ['java',
                 '-Xmx2g',
                 '-cp', classpath,
                 'edu.stanford.nlp.ie.crf.CRFClassifier',
                 '-prop', ner_prop_filename],
                capture_output=True)
    
    # If there's an error with invocation better log the stacktrace
    if result.returncode != 0:
        print(result.stderr.decode('utf-8'))
    result.check_returncode()
    
    if print_report:
        print(*result.stderr.decode('utf-8').split('\n')[-11:], sep='\n')
        
    return model_file

We can train models on each dataset separately, and all together.
For evaluation we'll use the corresponding test set.

This only takes a few minutes.

In [26]:
%%time

models = {}
for source in ['ar', 'gk', 'ar_gk']:
    print(source)
    train_files = [data_filename(s, 'train') for s in source.split('_')]
    test_files = [data_filename(s, 'test') for s in source.split('_')]
    models[source] = train_model(source, train_files, test_files)
    print()

ar
CRFClassifier tagged 2788 words in 483 documents at 8057.80 words per second.
         Entity	P	R	F1	TP	FP	FN
             DF	1.0000	0.9608	0.9800	49	0	2
           NAME	0.9297	0.9279	0.9288	463	35	36
       QUANTITY	1.0000	0.9962	0.9981	522	0	2
           SIZE	1.0000	1.0000	1.0000	20	0	0
          STATE	0.9601	0.9633	0.9617	289	12	11
           TEMP	0.8750	0.7000	0.7778	7	1	3
           UNIT	0.9819	0.9841	0.9830	434	8	7
         Totals	0.9696	0.9669	0.9682	1784	56	61


gk
CRFClassifier tagged 9886 words in 1705 documents at 10528.22 words per second.
         Entity	P	R	F1	TP	FP	FN
             DF	0.9718	0.9517	0.9617	138	4	7
           NAME	0.9143	0.9026	0.9084	1622	152	175
       QUANTITY	0.9882	0.9870	0.9876	1598	19	21
           SIZE	0.9750	0.9398	0.9571	78	2	5
          STATE	0.9255	0.9503	0.9377	708	57	37
           TEMP	0.8125	0.8125	0.8125	26	6	6
           UNIT	0.9810	0.9729	0.9769	1292	25	36
         Totals	0.9537	0.9501	0.9519	5462	265	287


ar_gk
CRFClassifier tagged 12

The summary report shows for each model and entity type:

* True Positives (TP): The number of times that entity was predicted correctly
* False Positives (FP): The number of times that entity in the text but not predicted correctly
* False Negative (FN): The number of times that entity was not in the text and predicted
* Precision (P): Probability a predicted entity is correct, TP/(TP+FP)
* Recall (R): Probability a correct entity is predicted, TP/(TP+FN)
* F1 Score (F1): Harmonic mean of precision and recall, 2/(1/P + 1/R).

We can compare the F1 Totals to the diagonal of Table IV in the paper

* AllRecipes.com (ar): We get 0.9682, they report 0.9682
* FOOD.com (gk): We get 0.9516, they report 0.9519
* Both (ar_gk): We get 0.9551, they report 0.9611

These are super close.
The furthest is `ar_gk` and in the repository they have a separate `ar_gk_train.tsv`; it would be interesting to check whether using it directly gives a closer result and why there is a difference.

# Running the model in Python

We can now use these trained models in Python by invoking Stanford NLP with Stanza.

We can call StanfordNLP with our custom model by passing the property `ner.model`.

Our test data is already tokenized in a different way to StanfordNLP, so we'll add an option to the [Tokenizer](https://stanfordnlp.github.io/CoreNLP/tokenize.html) to use whitespace tokenization which is easy to invert.

It takes a while to start up the server so we want to annotate a large number of texts at once.

In [27]:
from tqdm.notebook import tqdm
from stanza.server import CoreNLPClient

def annotate_ner(ner_model_file: str, texts: List[str], tokenize_whitespace: bool = True):
    properties = {"ner.model": ner_model_file, "tokenize.whitespace": tokenize_whitespace, "ner.applyNumericClassifiers": False}
    
    annotated = []
    with CoreNLPClient(
         annotators=['tokenize','ssplit','ner'],
         properties=properties,
         timeout=30000,
         be_quiet=True,
        memory='6G') as client:
    
        for text in tqdm(texts):
            annotated.append(client.annotate(text))
    return annotated

We can then get the annotations.

Note that there's a bit of overhead in starting the server, but then the exmaples are annotated quickly.

In [28]:
annotations = annotate_ner(models['ar'],
                           ['1 cup of frozen peas',
                            'A dash of salt . Or to taste',
                           '12 slices pancetta -LRB- Italian unsmoked cured bacon -RRB-',
                           'pumpkin sliced into 3 cm moons'])

  0%|          | 0/4 [00:00<?, ?it/s]

Note here that the word "Italian" has ner "NATIONALITY", which comes from another model (it wasn't in the training set!).

We want to use the `coarseNER`.

In [29]:
annotations[2].sentence[0].token[4]

word: "Italian"
pos: "JJ"
value: "Italian"
originalText: "Italian"
ner: "NATIONALITY"
lemma: "italian"
beginChar: 25
endChar: 32
tokenBeginIndex: 4
tokenEndIndex: 5
hasXmlContext: false
isNewline: false
coarseNER: "O"
fineGrainedNER: "NATIONALITY"
entityMentionIndex: 3
nerLabelProbs: "O=0.870902471545891"

When I didn't set `"ner.applyNumericClassifiers": False` this would come up as a `NUMBER`.

In [30]:
annotations[3].sentence[0].token[3]

word: "3"
pos: "CD"
value: "3"
originalText: "3"
ner: "O"
lemma: "3"
beginChar: 20
endChar: 21
tokenBeginIndex: 3
tokenEndIndex: 4
hasXmlContext: false
isNewline: false
coarseNER: "O"
fineGrainedNER: "O"
nerLabelProbs: "O=0.8599887537555505"

We can then flatten the sentences and extract the NER tokens

In [31]:
from dataclasses import dataclass, asdict

@dataclass
class NERData:
    ner: List[str]
    tokens: List[str]
        
    # Let's use Pandas to make it pretty in a notebook
    def _repr_html_(self):
        return pd.DataFrame(asdict(self)).T._repr_html_()

def extract_ner_data(annotation) -> NERData:
    tokens = [token for sentence in annotation.sentence for token in sentence.token]
    return NERData(tokens=[t.word for t in tokens], ner=[t.coarseNER for t in tokens])

A relatively simple ingredient works well

In [32]:
extract_ner_data(annotations[0])

Unnamed: 0,0,1,2,3,4
ner,QUANTITY,UNIT,O,TEMP,NAME
tokens,1,cup,of,frozen,peas


A more complex sentence does quite badly, perhaps because this kind of thing wasn't seen.

In [33]:
extract_ner_data(annotations[1])

Unnamed: 0,0,1,2,3,4,5,6,7
ner,QUANTITY,UNIT,NAME,NAME,NAME,NAME,O,O
tokens,A,dash,of,salt,.,Or,to,taste


In [34]:
extract_ner_data(annotations[2])

Unnamed: 0,0,1,2,3,4,5,6,7,8
ner,QUANTITY,UNIT,NAME,O,O,O,O,O,O
tokens,12,slices,pancetta,-LRB-,Italian,unsmoked,cured,bacon,-RRB-


We can chain these functions together to get from text to NER

In [35]:
from typing import Dict

def ner_extract(ner_model_file: str, texts: List[str], tokenize_whitespace: bool = True) -> List[Dict[str, List[str]]]:
    annotations = annotate_ner(ner_model_file, texts, tokenize_whitespace)
    return [extract_ner_data(ann) for ann in annotations]

And then for each model, and test data we can calculate the predictions.

Let's convert into a list of list of (token, label) tuples.
(There are simpler ways to do it, but this is how I originally processed the data.)

In [36]:
df_test_short = df_test.groupby(['source', 'ingredient_id']).agg(token=('token', list), label=('label', list))
df_test_short

Unnamed: 0_level_0,Unnamed: 1_level_0,token,label
source,ingredient_id,Unnamed: 2_level_1,Unnamed: 3_level_1
ar,0,"[1/2, large, sweet, red, onion, ,, thinly, sli...","[QUANTITY, SIZE, NAME, NAME, NAME, O, O, STATE]"
ar,1,"[6, ounces, fresh, mushrooms, ,, sliced]","[QUANTITY, UNIT, DF, NAME, O, STATE]"
ar,2,"[4, corn, tortillas]","[QUANTITY, NAME, NAME]"
ar,3,"[1, -LRB-, 9, ounce, -RRB-, package, refrigera...","[QUANTITY, O, QUANTITY, UNIT, O, UNIT, DF, NAM..."
ar,4,"[2, tablespoons, chopped, fresh, basil, leaves]","[QUANTITY, UNIT, STATE, DF, NAME, NAME]"
...,...,...,...
gk,1700,"[3/4, lb, powdered, sugar, -LRB-, Add, more, i...","[QUANTITY, UNIT, STATE, NAME, O, O, O, O, O, O..."
gk,1701,"[1, teaspoon, tender, thyme, leaves]","[QUANTITY, UNIT, NAME, NAME, NAME]"
gk,1702,"[1, tablespoon, chopped, fresh, parsley, leaves]","[QUANTITY, UNIT, STATE, DF, NAME, NAME]"
gk,1703,"[1, cup, Russian, salad, dressing, -LRB-, bott...","[QUANTITY, UNIT, NAME, NAME, O, O, O, O, O, O, O]"


In [37]:
test_data = {'ar': [], 'gk': []}
for (source, idx), row in df_test_short.iterrows():
    test_data[source].append(list(zip(*row)))

In [38]:
test_data['ar'][:2]

[[('1/2', 'QUANTITY'),
  ('large', 'SIZE'),
  ('sweet', 'NAME'),
  ('red', 'NAME'),
  ('onion', 'NAME'),
  (',', 'O'),
  ('thinly', 'O'),
  ('sliced', 'STATE')],
 [('6', 'QUANTITY'),
  ('ounces', 'UNIT'),
  ('fresh', 'DF'),
  ('mushrooms', 'NAME'),
  (',', 'O'),
  ('sliced', 'STATE')]]

Then we can run predictions over it.

In [39]:
preds = {}
for model, modelfile in models.items():
    preds[model] = {}
    for test_source, token_data in test_data.items():
        texts = [' '.join([x[0] for x in text]) for text in token_data]
        preds[model][test_source] = ner_extract(modelfile, texts)

  0%|          | 0/483 [00:00<?, ?it/s]

  0%|          | 0/1705 [00:00<?, ?it/s]

  0%|          | 0/483 [00:00<?, ?it/s]

  0%|          | 0/1705 [00:00<?, ?it/s]

  0%|          | 0/483 [00:00<?, ?it/s]

  0%|          | 0/1705 [00:00<?, ?it/s]

## Sanity checks

Let's check the same tokens come through the model as were input

In [40]:
for test_source, token_data in test_data.items():
    tokens = [[x[0] for x in tokens] for tokens in token_data]
    
    for model in models:
        model_preds = preds[model][test_source]
        
        model_tokens = [p.tokens for p in model_preds]
        
        if tokens != model_tokens:
            raise ValueError("Tokenization issue in %s with model %s" % (test_source, model))

# Evaluating

Now that we have predictions we can evaulate with [seqeval](https://github.com/chakki-works/seqeval).

In [41]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
     |████████████████████████████████| 43 kB 658 kB/s             
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l- \ | done
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=f3610baed1cfe8ba6f307fcc0229689e77e52f28c1a58bb10ec9b7ee6a65b75e
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


Seqeval expects the data to be in one of the following formats:

* IOB1
* IOB2
* IOE1
* IOE2
* IOBES(only in strict mode)
* BILOU(only in strict mode)

These all become important when trying to distinguish distinct entities that are adjacent; these are quite rare in practice.
See Wikipedia for a detailed explanation of [IOB (inside-outside-beginning)](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)).

In this case it's assumed there's only one entity of each type (which can be wrong when multiple names are listing in a single ingredient).
We can easily convert it to IOB1 using this assumption by prefixing every tag other than 'O' with an 'I-'.

In [42]:
def convert_to_iob1(tokens):
    return ['I-' + label if label != 'O' else 'O' for label in tokens]

assert convert_to_iob1(['QUANTITY', 'SIZE', 'NAME', 'NAME', 'O', 'STATE']) == ['I-QUANTITY', 'I-SIZE', 'I-NAME', 'I-NAME', 'O', 'I-STATE']

Let's check the classification report for a single example and compare it to the report from StanfordNER.

The classification report doesn't have the TP, TN and FN, but instead has the support - the number of true entities in the data.
The set of data is equivalent:

* support = TP + FN
* TP = R * support
* FP = TP (1/P - 1)
* FN = support - TP

The results are the same.

In [43]:
from seqeval.metrics import classification_report

test_source = 'ar'
model = 'ar'

actual_ner = [convert_to_iob1([x[1] for x in ann]) for ann in test_data[test_source]]
pred_ner = [convert_to_iob1(p.ner) for p in preds[model][test_source]]

print(classification_report(actual_ner, pred_ner, digits=4))

              precision    recall  f1-score   support

          DF     1.0000    0.9608    0.9800        51
        NAME     0.9297    0.9279    0.9288       499
    QUANTITY     1.0000    0.9962    0.9981       524
        SIZE     1.0000    1.0000    1.0000        20
       STATE     0.9601    0.9633    0.9617       300
        TEMP     0.8750    0.7000    0.7778        10
        UNIT     0.9819    0.9841    0.9830       441

   micro avg     0.9696    0.9669    0.9682      1845
   macro avg     0.9638    0.9332    0.9471      1845
weighted avg     0.9695    0.9669    0.9682      1845



We can get the micro f1-score directly.

In [44]:
from seqeval.metrics import f1_score
'%0.4f' % f1_score(actual_ner, pred_ner)

'0.9682'

We can then try to reproduce Table IV by computing the f1-score for each model and data.

In [45]:
scores = {model: {} for model in models}
for test_source, data in test_data.items():
    actual_ner = [convert_to_iob1([x[1] for x in ann]) for ann in data]
    for model in models:
        pred_ner = [convert_to_iob1(p.ner) for p in preds[model][test_source]]
        scores[model][test_source] = f1_score(actual_ner, pred_ner)

We also need to calculate the scores on the combined test set, by contatenating them

In [46]:
actual_ner = [convert_to_iob1([x[1] for x in ann]) for data in test_data.values() for ann in data]
for model in models:
    pred_ner = [convert_to_iob1(p.ner) for test_source in test_data for p in preds[model][test_source]]
    scores[model]['combined'] = f1_score(actual_ner, pred_ner)

In [47]:
pd.DataFrame(scores).style.format('{:0.4f}')

Unnamed: 0,ar,gk,ar_gk
ar,0.9682,0.9323,0.9704
gk,0.8666,0.9515,0.9493
combined,0.8911,0.9469,0.9544


The results are *slightly* different to those in the paper, but all agree within 0.01 for each row.

So we've successfully reproduced the results in the paper, and shown the evaulation from Stanford NER toolkit is very close to that of seqeval (if you work around hallucinated entities).

In [48]:
reported_scores = pd.DataFrame([[0.9682, 0.9317, 0.9709],
              [0.8672, 0.9519, 0.9498],
              [0.8972, 0.9472, 0.9611]],
             columns = ['AllRecipes', 'FOOD.com', 'BOTH'],
             index = ['AllRecipes', 'FOOD.com', 'BOTH'])
reported_scores

Unnamed: 0,AllRecipes,FOOD.com,BOTH
AllRecipes,0.9682,0.9317,0.9709
FOOD.com,0.8672,0.9519,0.9498
BOTH,0.8972,0.9472,0.9611


Differences

In [49]:
reported_scores - pd.DataFrame(scores).to_numpy()

Unnamed: 0,AllRecipes,FOOD.com,BOTH
AllRecipes,-5e-05,-0.000551,0.000495
FOOD.com,0.000631,0.000406,0.000544
BOTH,0.006084,0.000272,0.00671
