#  CRF + expert.ai edge NL API for named entities recognition


In [16]:
import os
from seqeval.metrics import classification_report

---

## Data Preparation
CoNLL corpus is download and prepared for the training phase

### Methods for processing CoNLL corpus

In [17]:
CONLL_URL_ROOT = "https://raw.githubusercontent.com/nluninja/nlp_datasets/be9fd23409f1443790f6e1eab91d28b105769368/conll2003/data/"

In [18]:
import os
import re
import urllib
import pandas as pd
from math import nan

In [24]:
def load_conll_data(filename, url_root=CONLL_URL_ROOT, 
                    only_tokens=False):
    """
    Take an url to the raw .txt files that you can find the repo linked above,
    load data and save it into a list of tuples data structure.
    
    Those files structure data with a word in each line with word, POS, 
    syntactic tag and entity tag separated by a whitespace. Sentences are 
    separated by an empty line.
    """
    lines = read_raw_conll(url_root, filename)
    X = []
    Y = []
    sentence = []
    labels = []
    output_labels=set()
    for line in lines:
        if line == "\n":
            if(len(sentence) != len(labels)):
                print(f"Error: we have {len(sentence)} words but {len(labels)} labels")
            if sentence and is_real_sentence(only_tokens, sentence):
                X.append(sentence)
                Y.append(labels)
            sentence = []
            labels = []
        else:
            features = line.split()
            tag = features.pop()
            labels.append(tag)
            output_labels.add(tag)
            if only_tokens:
                sentence.append(features.pop(0))
            else:
                sentence.append(tuple(features))
    
    print(f"Read {len(X)} sentences")
    if(len(X) != len(Y)):
        print("ERROR in reading data.")
    return X, Y, output_labels

In [25]:
def read_raw_conll(url_root, filename):
    """Read a file which contains a conll03 dataset"""
    lines = []
    full_url = url_root + filename
    lines = open_read_from_url(full_url)
    return lines[2:]

In [26]:
def open_read_from_url(url):
    """
    Take in input an url to a .txt file and return the list of its raws
    """
    print(f"Read file from {url}")
    file = urllib.request.urlopen(url)
    lines = []
    for line in file:
        lines.append(line.decode("utf-8"))

    return lines

In [27]:
def is_real_sentence(only_token, sentence):
    """Chek if a sentence is a real sentence or a document separator"""
    first_word = ""
    if only_token:
        first_word = sentence[0]
    else:
        first_word = sentence[0][0]

    if '---------------------' in first_word or first_word == '-DOCSTART-':
        return False
    else:
        return True

### Data loading

In [28]:
raw_train, y_train, output_labels = load_conll_data('train.txt', only_tokens=True)
raw_valid, y_valid, _ = load_conll_data('valid.txt', only_tokens=True)
raw_test, y_test, _ = load_conll_data('test.txt', only_tokens=True)

Read file from https://raw.githubusercontent.com/nluninja/nlp_datasets/be9fd23409f1443790f6e1eab91d28b105769368/conll2003/data/train.txt
Read 14028 sentences
Read file from https://raw.githubusercontent.com/nluninja/nlp_datasets/be9fd23409f1443790f6e1eab91d28b105769368/conll2003/data/valid.txt
Read 3250 sentences
Read file from https://raw.githubusercontent.com/nluninja/nlp_datasets/be9fd23409f1443790f6e1eab91d28b105769368/conll2003/data/test.txt
Read 3453 sentences


In [29]:
print(raw_train[0])
print(y_train[0])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


## Feature generation with edge NL API

In [32]:
import os
os.environ["EAI_USERNAME"] = 'andrea.belli@gmail.com'
os.environ["EAI_PASSWORD"] = 'eXpert00!'

from expertai.nlapi.edge.client import ExpertAiClient
client = ExpertAiClient()

### Methods for performing tokenization and features generation 

In [34]:
from tqdm import tqdm, trange

In [39]:
def tokens_to_docs(raw, eai):
    """Analyze a sentence with expertai
    
    Take a list of sentences, where each sentence is a list of token; build a
    string with the sentence and analyze it with expertai.
    
    Params:
        raw: list of lists of tokens
        eai: Expertai instance
    Return:
        docs: list of expertai Document
    """
    docs = []
    for sent in tqdm(raw):
        docs.append(eai.full_analysis(' '.join(sent)))
    return docs

In [None]:
def _get_label(doc, syncon):
    """Extract the knowledge label of a syncon in a document, if any"""
    label = ''
    if hasattr(doc.knowledge, '_k'):
        for ent in doc.knowledge._k:
            if ent['syncon'] == syncon:
                label = ent['label']
                break
        if label and '.' in label:
            label = label.split('.')[-1]
    return label

In [None]:
def features_from_docs (sentences, docs):
    """Extract token features from expertai docs
    
    Given a list of tokenized sentences and the relative expertai docs, 
    create a dictionary for each with the doc features:
        * Word
        * PoS tag;
        * Dep tag;
        * Syncon;
        * Label;
        * Typeclass (a mix of POS and entity);
    Params:
        sentences: list of sentences, that are lists of strings;
        docs: list of expertai Document;
    Returns:
        eai_sents: list of sentences features, that are lists of dictionaries;
    """
    eai_sents = []
    for sent_idx in trange(len(sentences)):
        seek = 0    # Index of the part of the sentence string already read
        eai_tokenlist = []
        for tk_idx in range(len(sentences[sent_idx])):
            # Token text and boundary indexes in doc.content
            token = sentences[sent_idx][tk_idx]
            index_start = docs[sent_idx].content.find(token, seek)
            index_end = index_start + len(token)
            possible_tokens = []
            for t in docs[sent_idx].tokens:
                # If a eai Token contain (part of the) chunk od text, it can be
                # the possible corresponding Token
                if (t.start<=index_start and t.end>=index_end) or \
                (t.start >= index_start and t.start <= index_end) or \
                (t.end >= index_start and t.end <= index_end):
                    possible_tokens.append(t)
            if not possible_tokens:
                print('ERROR: expertai tokenization not found for token', token)
                eai_tokenlist.append(_voidtoken())
            else:
                # Extract information from the eai.Token for the raw token we 
                # are analyzing
                if len(possible_tokens)>1:
                    possible_tokens.sort(key = lambda t: t.syncon, reverse=True)
                new_token = {
                    'word': token,
                    'pos': possible_tokens[0].pos,
                    'syncon': possible_tokens[0].syncon,
                    'ancestor': -1,
                    'label': _get_label(docs[sent_idx], possible_tokens[0].syncon),
                    'dep': possible_tokens[0].dependency.label,
                    'typeclass': possible_tokens[0].typeclass.split('.')
                }
                eai_tokenlist.append(new_token)
            seek = index_end
            while docs[sent_idx].content[seek] == ' ':
                seek += 1
        eai_sents.append(eai_tokenlist)
    return eai_sents


In [None]:
def features_from_word(sentence, idx):
    """Extract features related to a word and its neighbours"""
    token = sentence[idx] 
    
    features = {
        'bias': 1.0,
        'word.lower()': token['word'].lower(),
        'word[-3:]': token['word'][-3:],
        'word[-2:]': token['word'][-2:],
        'word.isupper()': token['word'].isupper(),
        'word.istitle()': token['word'].istitle(),
        'word.isdigit()': token['word'].isdigit(),
        'eai.postag': token['pos'],
        'eai.postag[:2]': token['pos'][:2],
        'eai.deptag': token['dep'],
        'eai.deptag[-2:]': token['dep'][-2:],
        'eai.syncon': -1 if token['syncon'] == -1 else token['syncon'] / 10000.,
        'eai.ancestor': -1 if token['ancestor'] == -1 else token['ancestor'] / 10000.,
        'eai.labels': token['label'],
        'eai.typeclass': token['typeclass'],
    }
    if idx > 0:
        token1 = sentence[idx-1]
        features.update({
            '-1:word.lower()': token1['word'].lower(),
            '-1:word.istitle()': token1['word'].istitle(),
            '-1:word.isupper()': token1['word'].isupper(),
            '-1:eai.postag': token1['pos'],
            '-1:eai.deptag': token1['dep'],
            '-1:eai.labels': token1['label'],
            '-1:eai.typeclass': token1['typeclass'],
        })
    else:
        features['BOS'] = True
        
    if idx < len(sentence)-1:
        token1 = sentence[idx-1]
        features.update({
            '+1:word.lower()': token1['word'].lower(),
            '+1:word.istitle()': token1['word'].istitle(),
            '+1:word.isupper()': token1['word'].isupper(),
            '+1:eai.postag': token1['pos'],
            '+1:eai.deptag': token1['dep'],
            '+1:eai.labels': token1['label'],
            '+1:eai.typeclass': token1['typeclass'],
        })
    else:
        features['EOS'] = True
                
    return features

In [None]:
def features_from_sentence(sentence):
    """Create feature dictionary for a sentence"""
    return tuple(features_from_word(sentence, index) for index in range(len(sentence)))

In [None]:
def _voidtoken():
    """Generate an empty token"""
    t = {
        'word': '',
        'pos': '',
        'syncon': -1,
        'ancestor': -1,
        'dep': '',
        'label': ''
    }
    return t

### Generate tokens and features

In [None]:
train_docs = tokens_to_docs(raw_train, client)
test_docs = tokens_to_docs(raw_test, client)
valid_docs = tokens_to_docs(raw_valid, client)

 72%|██████████████████████████████████████████████████████████████████▉                          | 10101/14028 [24:04<08:42,  7.52it/s]

In [6]:
train = features_from_docs(raw_train, train_docs)
test = features_from_docs(raw_test, test_docs)
valid = features_from_docs(raw_valid, valid_docs)

100%|██████████| 14027/14027 [00:59<00:00, 236.62it/s]
100%|██████████| 3452/3452 [00:12<00:00, 272.75it/s]
100%|██████████| 3249/3249 [00:15<00:00, 215.54it/s]


In [7]:
import pprint
p_idx=2
print(raw_train[p_idx])
print(y_train[p_idx])
print('')
pprint.pprint(train[p_idx])
print('')
pprint.pprint([tk.__dict__ for tk in train_docs[p_idx].tokens])

['BRUSSELS', '1996-08-22']
['B-LOC', 'O']

[{'ancestor': 291029,
  'dep': 'root',
  'label': 'town',
  'pos': 'PROPN',
  'syncon': 38239,
  'typeclass': ['NPR', 'GEO'],
  'word': 'BRUSSELS'},
 {'ancestor': -1,
  'dep': 'nmod',
  'label': '',
  'pos': 'NUM',
  'syncon': -1,
  'typeclass': ['NOU', 'DAT'],
  'word': '1996-08-22'}]

[{'atoms': [],
  'dependency': <expertai.document.Dependency object at 0x000001BC96BAABE0>,
  'end': 8,
  'id': 0,
  'lemma': 'Brussels',
  'morphology': 'Number=Sing',
  'paragraph': 0,
  'phrase': 0,
  'pos': 'PROPN',
  'sentence': 0,
  'start': 0,
  'syncon': 38239,
  'typeclass': 'NPR.GEO'},
 {'atoms': [<expertai.document.Atom object at 0x000001BC96BAA0F0>,
            <expertai.document.Atom object at 0x000001BC96BAA0B8>,
            <expertai.document.Atom object at 0x000001BC96BAA080>,
            <expertai.document.Atom object at 0x000001BC96BAA048>,
            <expertai.document.Atom object at 0x000001BC96BAA128>,
            <expertai.document.Atom o

#### Features Function

In [8]:
X_train = [features_from_sentence(sentence) for sentence in train]
X_test = [features_from_sentence(sentence) for sentence in test]
X_valid = [features_from_sentence(sentence) for sentence in valid]
pprint.pprint(X_train[1])

({'+1:nlpy.deptag': 'root',
  '+1:nlpy.labels': '',
  '+1:nlpy.postag': 'PROPN',
  '+1:nlpy.typeclass': ['NPR', 'NPH'],
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'blackburn',
  'BOS': True,
  'bias': 1.0,
  'nlpy.ancestor': -1,
  'nlpy.deptag': 'root',
  'nlpy.deptag[-2:]': 'ot',
  'nlpy.labels': '',
  'nlpy.postag': 'PROPN',
  'nlpy.postag[:2]': 'PR',
  'nlpy.syncon': -1,
  'nlpy.typeclass': ['NPR', 'NPH'],
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'peter',
  'word[-2:]': 'er',
  'word[-3:]': 'ter'},
 {'-1:nlpy.deptag': 'root',
  '-1:nlpy.labels': '',
  '-1:nlpy.postag': 'PROPN',
  '-1:nlpy.typeclass': ['NPR', 'NPH'],
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:word.lower()': 'peter',
  'EOS': True,
  'bias': 1.0,
  'nlpy.ancestor': -1,
  'nlpy.deptag': 'root',
  'nlpy.deptag[-2:]': 'ot',
  'nlpy.labels': '',
  'nlpy.postag': 'PROPN',
  'nlpy.postag[:2]': 'PR',
  'nlpy.sy

---

## Training the model

In [10]:
fast_mode = True

In [11]:
%%time
crf = None
gs = None

if fast_mode:
    crf = sklearn_crfsuite.CRF(
        algorithm = 'lbfgs',
        c1 = 0.1,
        c2 = 0.5,
        max_iterations = 800,
        all_possible_transitions = True,
        verbose = True
    )
    crf.fit(X_train, y_train, X_dev=X_valid, y_dev=y_valid)
else:
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        all_possible_transitions=True,
        verbose=True
    )
    params_space = {
        'c1': [0, .1, .4],
        'c2': [.1, .5, 1],
        'max_iterations': [200, 400, 800],
    }
    labels = list(output_labels.copy())
    labels.remove('O')
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)
    gs = GridSearchCV(crf, params_space, cv=3, n_jobs=4,
                      verbose=1, scoring=f1_scorer)
    gs = modelutils.get_crf_gridsearch(output_labels)
    gs.fit(X_train, y_train)
    crf = gs.best_estimator_

loading training data to CRFsuite: 100%|██████████| 14027/14027 [00:04<00:00, 2834.98it/s]
loading dev data to CRFsuite:  10%|▉         | 319/3249 [00:00<00:00, 3189.78it/s]




loading dev data to CRFsuite: 100%|██████████| 3249/3249 [00:01<00:00, 2589.06it/s]



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 88392
Seconds required: 1.012

L-BFGS optimization
c1: 0.100000
c2: 0.500000
num_memories: 6
max_iterations: 800
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.58  loss=439086.71 active=88140 precision=0.162  recall=0.212  F1=0.169  Acc(item/seq)=0.579 0.026  feature_norm=1.00
Iter 2   time=0.34  loss=320293.59 active=87072 precision=0.092  recall=0.111  F1=0.101  Acc(item/seq)=0.830 0.193  feature_norm=0.94
Iter 3   time=0.33  loss=144600.42 active=85336 precision=0.140  recall=0.163  F1=0.151  Acc(item/seq)=0.833 0.225  feature_norm=1.76
Iter 4   time=0.61  loss=134997.27 active=87866 precision=0.164  recall=0.137  F1=0.140  Acc(item/seq)=0.837 0.239  feature_norm=1.83
Iter 5   time=0.33  loss=131335.71 active=880

In [12]:
if not fast_mode:
    from joblib import dump
    with open(os.path.join('models', 'crfxnlpy-conll.joblib'), 'wb') as f:
        dump(gs.best_estimator_, f)

---

## Model Evaluation

In [13]:
if not fast_mode:
    print('best params:', gs.best_params_)
    print('best CV score:', gs.best_score_)

In [14]:
print('Model size: {:0.2f}M'.format(crf.size_ / 1000000))

Model size: 1.55M


In [15]:
print(f'Model latency in prediction: {modelutils.compute_prediction_latency(X_test, crf):.3} s')

Model latency in prediction: 0.00029 s


In [16]:
datasets = [('Training Set', X_train, y_train), ('Test Set', X_test, y_test), ('Validation Set', X_valid, y_valid)]

for title, X, Y in datasets:
    Y_pred = crf.predict(X)
    print(title)
    print(classification_report(Y, Y_pred, digits=3))
    print('\n')

Training Set
           precision    recall  f1-score   support

      ORG      0.959     0.946     0.952      6318
      LOC      0.979     0.971     0.975      7140
      PER      0.973     0.981     0.977      6600
     MISC      0.971     0.927     0.949      3438

micro avg      0.971     0.961     0.966     23496
macro avg      0.971     0.961     0.966     23496



Test Set
           precision    recall  f1-score   support

      PER      0.893     0.874     0.883      1616
      LOC      0.891     0.899     0.895      1667
     MISC      0.771     0.723     0.746       701
      ORG      0.822     0.782     0.801      1660

micro avg      0.857     0.835     0.846      5644
macro avg      0.856     0.835     0.846      5644



Validation Set
           precision    recall  f1-score   support

     MISC      0.918     0.818     0.865       922
      PER      0.927     0.922     0.925      1842
      ORG      0.853     0.821     0.837      1340
      LOC      0.936     0.946    

---

## Model Explanation (beta)

In [17]:
import eli5

Using TensorFlow backend.


In [18]:
eli5.show_weights(crf, horizontal_layout=False)



From \ To,O,B-LOC,I-LOC,B-MISC,I-MISC,B-ORG,I-ORG,B-PER,I-PER
O,1.524,2.373,-3.072,1.576,-3.074,2.456,-2.975,3.407,-2.952
B-LOC,0.479,-1.169,5.583,0.52,-1.478,-0.28,-1.506,-0.503,-0.872
I-LOC,-0.102,-0.591,4.78,-0.133,-0.636,-1.109,-0.634,-0.511,-0.345
B-MISC,-0.583,-0.005,-1.268,-0.79,5.7,0.366,-1.828,-0.0,-0.869
I-MISC,-0.512,-0.029,-0.521,-0.116,5.565,0.257,-0.551,-0.898,-0.491
B-ORG,0.402,-1.551,-1.068,-0.72,-1.341,-0.495,7.185,-0.669,-1.68
I-ORG,-0.104,-1.08,-0.773,-0.576,-0.696,-1.11,6.586,-0.661,-0.892
B-PER,0.165,-0.214,-0.627,-1.238,-0.67,-1.373,-0.504,-2.39,5.526
I-PER,0.691,-0.68,-0.374,-0.572,-0.296,-0.622,-0.205,-1.586,3.343

Weight?,Feature
+4.315,bias
+3.019,word[-2:]:0M
+3.012,word[-2:]:5M
+2.831,word.lower():minister
+2.733,nlpy.labels:definite_time
+2.605,BOS
+2.339,word.lower():h1
+2.339,word[-2:]:H1
+2.339,word[-3:]:H1
+2.263,word[-3:]:day

Weight?,Feature
+2.398,word.lower():pakistan
+2.371,word.lower():caribbean
+2.312,-1:word.lower():at
+1.999,+1:nlpy.typeclass:DAT
+1.916,word.lower():england
+1.882,word.lower():china
+1.826,nlpy.labels:country
+1.767,nlpy.typeclass:GEO
+1.752,word.lower():u.s.
+1.735,word.lower():prairies

Weight?,Feature
+1.824,word.lower():hospital
+1.704,nlpy.labels:boundary
+1.680,-1:word.lower():wisc
+1.635,-1:word.lower():colo
+1.514,nlpy.typeclass:GEO
+1.136,-1:word.lower():new
+1.092,+1:word.lower():azad
+1.092,-1:word.lower():azad
+1.092,word.lower():kashmir
+1.086,nlpy.labels:country

Weight?,Feature
+2.391,word.lower():u.s.-led
+2.356,nlpy.labels:nationality
+2.133,nlpy.labels:inhabitant
+2.113,word.lower():euromark
+2.089,word.lower():serb-held
+2.068,word[-3:]:led
+2.036,word[-3:]:sed
+1.949,word.lower():islamists
+1.906,word[-2:]:Rs
+1.906,word.lower():adrs

Weight?,Feature
+1.975,word.lower():open
+1.941,word.lower():cup
+1.873,word.lower():division
+1.787,word.lower():masters
+1.677,word.lower():league
+1.621,-1:word.lower():no
+1.615,word[-3:]:sed
+1.593,word.isdigit()
+1.414,word.lower():day
+1.313,word.lower():nations

Weight?,Feature
+2.709,+1:word.lower():3
+2.613,+1:word.lower():4
+2.587,-1:word.lower():v
+2.572,+1:word.lower():1
+2.470,word.lower():barrick
+2.435,+1:word.lower():2
+2.427,+1:word.lower():7
+2.402,word.lower():senate
+2.397,+1:word.lower():0
+2.365,word.lower():nice

Weight?,Feature
+1.724,word.lower():union
+1.639,word[-3:]:oom
+1.581,+1:word.lower():bj
+1.581,-1:word.lower():bj
+1.445,word.lower():newsroom
+1.403,-1:word.lower():moody
+1.403,+1:word.lower():moody
+1.374,nlpy.labels:institution
+1.373,word.lower():coast
+1.346,+1:word.lower():lloyd

Weight?,Feature
+3.738,word.lower():clinton
+2.310,word.lower():gore
+2.203,word.lower():dole
+2.093,BOS
+1.914,word[-3:]:Haq
+1.914,word.lower():inzamam-ul-haq
+1.859,word[-2:]:ER
+1.769,word.lower():cork
+1.763,word.lower():ata-ur-rehman
+1.689,word.lower():pivotal

Weight?,Feature
+1.522,nlpy.typeclass:NPH
+1.316,-1:word.lower():de
+1.233,+1:word.lower():de
+1.160,word[-3:]:ULO
+1.159,word[-2:]:LO
+1.062,word[-3:]:ton
+1.029,word[-2:]:io
+1.004,-1:word.lower():van
+0.999,-1:nlpy.labels:
+0.990,+1:word.lower():van


In [19]:
eli5.show_weights(crf, top=(10, 10), feature_re='^nlpy\.', horizontal_layout=False)



From \ To,O,B-LOC,I-LOC,B-MISC,I-MISC,B-ORG,I-ORG,B-PER,I-PER
O,1.524,2.373,-3.072,1.576,-3.074,2.456,-2.975,3.407,-2.952
B-LOC,0.479,-1.169,5.583,0.52,-1.478,-0.28,-1.506,-0.503,-0.872
I-LOC,-0.102,-0.591,4.78,-0.133,-0.636,-1.109,-0.634,-0.511,-0.345
B-MISC,-0.583,-0.005,-1.268,-0.79,5.7,0.366,-1.828,-0.0,-0.869
I-MISC,-0.512,-0.029,-0.521,-0.116,5.565,0.257,-0.551,-0.898,-0.491
B-ORG,0.402,-1.551,-1.068,-0.72,-1.341,-0.495,7.185,-0.669,-1.68
I-ORG,-0.104,-1.08,-0.773,-0.576,-0.696,-1.11,6.586,-0.661,-0.892
B-PER,0.165,-0.214,-0.627,-1.238,-0.67,-1.373,-0.504,-2.39,5.526
I-PER,0.691,-0.68,-0.374,-0.572,-0.296,-0.622,-0.205,-1.586,3.343

Weight?,Feature
+2.733,nlpy.labels:definite_time
+2.195,nlpy.postag:PRON
+2.097,nlpy.typeclass:CON
+2.027,nlpy.typeclass:DAT
+1.846,nlpy.labels:graphic_symbol
+1.601,nlpy.typeclass:PRO
+1.420,nlpy.postag[:2]:PR
+1.280,nlpy.deptag:case
+1.257,nlpy.labels:indefinite_quantity
+1.226,nlpy.labels:virtual_object

Weight?,Feature
+1.826,nlpy.labels:country
+1.767,nlpy.typeclass:GEO
+1.294,nlpy.typeclass:ADR
+1.177,nlpy.labels:physical_property
+1.155,nlpy.typeclass:BLD
+1.060,nlpy.typeclass:GEA
+0.911,nlpy.typeclass:MMD
+0.880,nlpy.labels:first-order_administrative_division
+0.831,nlpy.labels:metal
+0.822,nlpy.labels:town

Weight?,Feature
+1.704,nlpy.labels:boundary
+1.514,nlpy.typeclass:GEO
+1.086,nlpy.labels:country
+0.930,nlpy.labels:other
+0.929,nlpy.typeclass:GEA
+0.923,nlpy.labels:geographic_area
+0.902,nlpy.typeclass:PNT
+0.713,nlpy.labels:aristocrat
+0.663,nlpy.labels:physical_property
+0.619,nlpy.typeclass:BLD

Weight?,Feature
+2.356,nlpy.labels:nationality
+2.133,nlpy.labels:inhabitant
+1.612,nlpy.labels:language
+1.523,nlpy.typeclass:MON
+1.458,nlpy.labels:political_association
+1.342,nlpy.typeclass:LEN
+1.339,nlpy.deptag:amod
+1.292,nlpy.labels:sporting_event
+1.284,nlpy.typeclass:DAT
+1.275,nlpy.typeclass:NOU

Weight?,Feature
+1.171,nlpy.labels:inhabitant
+1.070,nlpy.typeclass:EVN
+1.037,nlpy.labels:military_action
+1.020,nlpy.typeclass:WRK
+0.964,nlpy.postag[:2]:NU
+0.964,nlpy.postag:NUM
+0.933,nlpy.labels:separation
+0.881,nlpy.labels:war
+0.878,nlpy.labels:rule
+0.876,nlpy.labels:sporting_event

Weight?,Feature
+2.011,nlpy.labels:sport_team
+1.968,nlpy.labels:second-order_administrative_division
+1.419,nlpy.labels:committee
+1.410,nlpy.labels:color
+1.396,nlpy.labels:bird
+1.326,nlpy.labels:institution
+1.306,nlpy.labels:system_(object)
+1.198,nlpy.labels:mammal
+1.194,nlpy.labels:town
+1.181,nlpy.labels:employee

Weight?,Feature
+1.374,nlpy.labels:institution
+1.296,nlpy.typeclass:MMD
+1.040,nlpy.labels:sport_team
+1.035,nlpy.labels:substance
+0.996,nlpy.labels:document
+0.964,nlpy.labels:vehicle
+0.889,nlpy.typeclass:COM
+0.804,nlpy.labels:mammal
+0.794,nlpy.labels:organization
+0.750,nlpy.labels:commodity

Weight?,Feature
+1.425,nlpy.labels:
+1.216,nlpy.labels:object
+1.146,nlpy.labels:meat
+1.129,nlpy.labels:blood
+1.049,nlpy.labels:product
+1.038,nlpy.labels:worker
+1.030,nlpy.labels:literary_creation
+0.888,nlpy.labels:mammal
+0.848,nlpy.labels:ball
+0.829,nlpy.typeclass:NPH

Weight?,Feature
+1.522,nlpy.typeclass:NPH
+0.659,nlpy.labels:change
+0.652,nlpy.labels:social_relation
+0.644,nlpy.typeclass:ANM
+0.555,nlpy.labels:object_part
+0.544,nlpy.labels:island
+0.507,nlpy.postag:ADP
+0.458,nlpy.labels:substance
+0.446,nlpy.typeclass:PRE
+0.411,nlpy.labels:politician
