## CRF for Aspect Tagger

In [3]:
from lxml import etree
from operator import itemgetter
from collections import Counter

from pycrfsuite import Trainer, Tagger
from pycrfsuite import ItemSequence

from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelBinarizer
from nltk.metrics import ConfusionMatrix

In [4]:
CORPUS_TRAIN = '../corpus/SemEvalABSA2016EnglishRestaurants_train.xml'
CORPUS_TEST = '../corpus/SemEvalABSA2016EnglishRestaurants_test.xml'

In [12]:
# read ReLi sentences with tags
dataset = list()
tree = etree.parse(CORPUS_TRAIN)

tokens_nodes = tree.xpath('.//tokens')
for tokens_node in tokens_nodes:
    sentence = list()
    for word_node in tokens_node:
        if word_node.get('opinion')=='true':
            tag = 'target'
        else:
            tag = '_'
        sentence.append((word_node.get('form').lower(),
                         word_node.get('postag'),
                         word_node.get('head'),
                         word_node.get('deprel'),
                         tag))
    if len(sentence) != 0:
        dataset.append(sentence)        

In [13]:
# change head number by the token under that position
for sentence in dataset:
    for index, item in enumerate(sentence):
        word, postag, head, deprel, tag = item
        head = int(head)
        if head > 0:
            head = sentence[int(head) - 1][0]
        else:
            head = 'ROOT'
        sentence[index] = ((word, postag, head, deprel, tag))        


In [14]:
dataset[0:2]

[[('just', 'RB', 'went', 'advmod', '_'),
  ('went', 'VBD', 'ROOT', 'root', '_'),
  ('here', 'RB', 'went', 'advmod', '_'),
  ('for', 'IN', 'bday', 'case', '_'),
  ('my', 'PRP$', 'bday', 'nmod:poss', '_'),
  ('girlfriends', 'NNS', 'bday', 'compound', '_'),
  ('23rd', 'JJ', 'bday', 'amod', '_'),
  ('bday', 'NN', 'went', 'nmod', '_'),
  ('.', '.', 'went', 'punct', '_')],
 [('if', 'IN', 'river', 'mark', '_'),
  ('you', 'PRP', 'river', 'nsubj', '_'),
  ("'ve", 'VBP', 'river', 'aux', '_'),
  ('ever', 'RB', 'river', 'advmod', '_'),
  ('been', 'VBN', 'river', 'cop', '_'),
  ('along', 'IN', 'river', 'case', '_'),
  ('the', 'DT', 'river', 'det', '_'),
  ('river', 'NN', 'have', 'advcl', '_'),
  ('in', 'IN', 'weehawken', 'case', '_'),
  ('weehawken', 'NNP', 'river', 'nmod', '_'),
  ('you', 'PRP', 'have', 'nsubj', '_'),
  ('have', 'VBP', 'ROOT', 'root', '_'),
  ('an', 'DT', 'idea', 'det', '_'),
  ('idea', 'NN', 'have', 'dobj', '_'),
  ('of', 'IN', 'top', 'case', '_'),
  ('the', 'DT', 'top', 'det', '

In [15]:
def extract_features(dataset):
    feature_list = []
    for sentence in dataset:
        for item in sentence:
            word, postag, head, deprel, tag = item
            features = {}
            features['bias'] = 1
            features['word'] = word
            features['postag'] = postag
            features['head'] = head
            feature_list.append(features)
    return feature_list

In [16]:
def extract_labels(dataset):
    labels = []
    for sentence in dataset:
        for item in sentence:
            word, postag, head, deprel, tag = item
            labels.append(tag)
    return labels

In [17]:
def crf_train(dataset):
    trainer = Trainer(verbose=False)

    # default parameters
    trainer.select(algorithm='lbfgs', type='crf1d')

    # set from examples
    trainer.set_params({
        'c1': 0.1,   # coefficient for L1 penalty
        'c2': 1.0,  # coefficient for L2 penalty
    })

    X_train = ItemSequence(extract_features(dataset))
    Y_train = extract_labels(dataset)

    trainer.append(X_train, Y_train)
    trainer.train('model.crfsuite')


In [18]:
def reports(y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(y_true)
    y_pred_combined = lb.transform(y_pred)

    tagset = set(lb.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    accuracy = accuracy_score(y_true_combined, y_pred_combined) * 100
    print('Accuracy: {:.2f}%\n'.format(accuracy))

    print('Classification Report:\n')
    print(classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset))

    cm = ConfusionMatrix(y_true, y_pred)
    print (cm.pretty_format(sort_by_count=True))

In [19]:
def tagger_reports(tagger):
    info = tagger.info()

    def print_transitions(trans_features):
        for (label_from, label_to), weight in trans_features:
            print("%-8s -> %-8s %0.4f" % (label_from, label_to, weight))

    print("\nTop likely transitions:")
    print_transitions(Counter(info.transitions).most_common(15))
    print("\nTop unlikely transitions:")
    print_transitions(Counter(info.transitions).most_common()[-15:])

    def print_state_features(state_features):
        for (attr, label), weight in state_features:
            print("%0.6f %-6s %s" % (weight, label, attr))

    print("\nTop positive:")
    print_state_features(Counter(info.state_features).most_common(20))

    print("\nTop negative:")
    print_state_features(Counter(info.state_features).most_common()[-20:])


In [20]:
def get_context(sentence, index):
    if index == 0:
        left_word = ''
    else:
        left_word = sentence[index - 1][0]
    if index == len(sentence) - 1:
        right_word = ''
    else:
        right_word = sentence[index + 1][0]
    context_word = left_word + '_' + sentence[index][0] + '_' + right_word
    return context_word

In [21]:
def statistcs(dataset, y_pred, lookup_tag, threshold):
    count = 0
    freq = dict()
    freq_tag = dict()
    correct = dict()
    incorrect = dict()
    misclassified = dict()
    context = dict()

    for sentence in dataset:
        for index, item in enumerate(sentence):
            word, postag, head, deprel, tag = item
            freq[word] = freq.get(word, 0) + 1
            if tag == lookup_tag:
                freq_tag[word] = freq_tag.get(word, 0) + 1
                if word not in context:
                    context[word] = dict()
                context_word = get_context(sentence, index)
                context[word][context_word] = context[word].get(context_word, 0) + 1

                if tag == y_pred[count]:
                    correct[word] = correct.get(word, 0) + 1
                else:
                    incorrect[word] = incorrect.get(word, 0) + 1
            elif y_pred[count] == lookup_tag:
                misclassified[word] = misclassified.get(word, 0) + 1
            count += 1

    print ('tag\tword\t\tfreq\tfreq_tag\tcorrect\tincorrect\tmisclassified')

    for word, _ in sorted(freq_tag.items(), key=itemgetter(1), reverse=True)[:threshold]:
        print ('{}{}{}{}{}{}{}'.format(lookup_tag + '   ',
                                       word + ' ' * (25 - len(word)),
                                       str(freq.get(word, 0)) + '\t',
                                       str(freq_tag.get(word, 0)) + '\t\t',
                                       str(correct.get(word, 0)) + '\t\t',
                                       str(incorrect.get(word, 0)) + '\t\t',
                                       str(misclassified.get(word, 0))))
    # ', '.join([c + '(' + str(f) + ')' for c, f in sorted(context[word].items(), key=itemgetter(1), reverse=True)][:6])

    # statistcs for each postag
    count = 0
    freq = dict()
    freq_tag = dict()
    correct = dict()
    incorrect = dict()
    misclassified = dict()
    context = dict()
    for sentence in dataset:
        for index, item in enumerate(sentence):
            word, postag, head, deprel, tag = item
            freq[postag] = freq.get(postag, 0) + 1
            if tag == lookup_tag:
                freq_tag[postag] = freq_tag.get(postag, 0) + 1
                if tag == y_pred[count]:
                    correct[postag] = correct.get(postag, 0) + 1
                else:
                    incorrect[postag] = incorrect.get(postag, 0) + 1
            elif y_pred[count] == lookup_tag:
                misclassified[postag] = misclassified.get(postag, 0) + 1
            count += 1

    print ('tag\tpos\t\tfreq\tfreq_tag\tcorrect\tincorrect\tmisclassified')

    for postag, _ in sorted(freq.items(), key=itemgetter(1), reverse=True):
        print ('{}{}{}{}{}{}{}'.format(lookup_tag + '\t',
                                       postag.upper() + ' ' * (25 - len(postag)),
                                       str(freq.get(postag, 0)) + '\t\t',
                                       str(freq_tag.get(postag, 0)) + '\t\t',
                                       str(correct.get(postag, 0)) + '\t\t',
                                       str(incorrect.get(postag, 0)) + '\t\t',
                                       str(misclassified.get(postag, 0))))


In [22]:
crf_train(dataset)

In [23]:
# read ReLi sentences with tags
dataset = list()
tree = etree.parse(CORPUS_TEST)

tokens_nodes = tree.xpath('.//tokens')
for tokens_node in tokens_nodes:
    sentence = list()
    for word_node in tokens_node:
        if word_node.get('opinion')=='true':
            tag = 'target'
        else:
            tag = '_'
        sentence.append((word_node.get('form').lower(),
                         word_node.get('postag'),
                         word_node.get('head'),
                         word_node.get('deprel'),
                         tag))
    if len(sentence) != 0:
        dataset.append(sentence)   

In [24]:
# change head number by the token under that position
for sentence in dataset:
    for index, item in enumerate(sentence):
        word, postag, head, deprel, tag = item
        head = int(head)
        if head > 0:
            head = sentence[int(head) - 1][0]
        else:
            head = 'ROOT'
        sentence[index] = ((word, postag, head, deprel, tag))    

In [27]:
testset = dataset
testset[:2]

[[('yum', 'NN', 'ROOT', 'root', '_'), ('!', '.', 'yum', 'punct', '_')],
 [('serves', 'VBZ', 'ROOT', 'root', '_'),
  ('really', 'RB', 'sushi', 'advmod', '_'),
  ('good', 'JJ', 'sushi', 'amod', '_'),
  ('sushi', 'NN', 'serves', 'dobj', 'target'),
  ('.', '.', 'serves', 'punct', '_')]]

In [28]:
tagger = Tagger()
tagger.open('model.crfsuite')

X_test = ItemSequence(extract_features(testset))
y_pred = tagger.tag(X_test)
y_true = extract_labels(testset)

reports(y_true, y_pred)

Accuracy: 93.08%

Classification Report:

             precision    recall  f1-score   support

          _       0.95      0.98      0.96      9015
     target       0.69      0.47      0.56       928

avg / total       0.92      0.93      0.92      9943

       |         t |
       |         a |
       |         r |
       |         g |
       |         e |
       |    _    t |
-------+-----------+
     _ |<8820> 195 |
target |  493 <435>|
-------+-----------+
(row = reference; col = test)



In [29]:
tagger_reports(tagger)


Top likely transitions:
_        -> _        1.2930
target   -> target   -0.2272
target   -> _        -1.1158
_        -> target   -1.1179

Top unlikely transitions:
_        -> _        1.2930
target   -> target   -0.2272
target   -> _        -1.1158
_        -> target   -1.1179

Top positive:
2.944413 target word:decor
2.255491 target word:ambience
2.168018 target word:service
2.130844 target postag:NNS
1.963599 target word:staff
1.887555 _      word:great
1.873808 target postag:NN
1.821644 _      postag:.
1.807634 target word:view
1.770234 target postag:NNP
1.763494 _      word:is
1.674473 target word:portions
1.629019 target word:atmosphere
1.587426 target word:food
1.569808 target head:skiline
1.518295 target word:waiter
1.439020 _      head:restaurants
1.414374 target postag:NNPS
1.367203 _      word:restaurants
1.359148 _      word:quality

Top negative:
-0.952573 _      word:dessert
-0.968065 _      word:meal
-0.977329 _      word:japanese
-1.002008 _      word:indian
-1.01569