# Config

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format="retina"
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

import os
import xmltodict
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import spacy

from lib.loader import load_data, iterate_sentence
from lib.ml import add_feature

pd.set_option("display.max_columns", 100)

DATA_DIR = "./data"
TRAINING_DATA = os.path.join(DATA_DIR, "Laptops_Train_v2.xml")
TESTING_DATA = os.path.join(DATA_DIR, "Laptops_Test_Gold.xml")

In [3]:
nlp = spacy.load('en_core_web_lg')

import regex as re
from spacy.tokenizer import Tokenizer

# this regex is taken from NLTK's WordPunctTokenizer
infix_re = re.compile(r'\w+|[^\w\s]+')

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

nlp.tokenizer = custom_tokenizer(nlp)

# Preparing data

In [4]:
train_full_info = list(iterate_sentence(TRAINING_DATA, nlp))
test_full_info = list(iterate_sentence(TESTING_DATA, nlp))
train_sents = [list(zip(s[0], s[4])) for s in train_full_info]
test_sents = [list(zip(s[0], s[4])) for s in test_full_info]

In [59]:
def word_to_features(token, sent, no_embedding=False):
    loc = token.i
    sent_token_sum = len(sent)
    
    
    contain_upper_fn = lambda t: int(len([c for c in t.text if c.isupper()])!=0)
    
    # current token feature
    features = {
        "bias": 1.0,
        "relative_loc": loc/float(sent_token_sum),
        "len": len(token.text),
        "pos": token.pos_,
        "detailed_pos": token.tag_,
        "dep": token.dep_,
        "vector_l2_norm": token.vector_norm, 
        "like_num": int(token.like_num),
        "is_quote": int(token.is_quote),
        "is_head": int(token.head.text == token.text),
        "is_alpha": int(token.is_alpha),
        "is_digit": int(token.is_alpha),
        "is_contain_upper": contain_upper_fn(token),
        "is_punct": int(token.is_punct),
        "is_end": int(loc == sent_token_sum-1),
        "is_start": int(loc == 0)
    }
    
    # previous token feature
    if loc > 0:
        prev_token = sent[loc-1][0]
        features["prev:pos"] = prev_token.pos_
        features["prev:dep"] = prev_token.dep_
        features["prev:like_num"] = prev_token.like_num
        features["prev:is_quote"] = prev_token.is_quote
        features["prev:is_head"] = int(prev_token.head.text == prev_token.text)
        features["prev:is_contain_upper"] = contain_upper_fn(prev_token)
        features["prev:is_punct"] = prev_token.is_punct
    
    # next token feature
    if loc != sent_token_sum-1:
        next_token = sent[loc+1][0]
        features["next:pos"] = next_token.pos_
        features["next:dep"] = next_token.dep_
        features["next:like_num"] = next_token.like_num
        features["next:is_quote"] = next_token.is_quote
        features["next:is_head"] = int(next_token.head.text == next_token.text)
        features["next:is_contain_upper"] = contain_upper_fn(next_token)
        features["next:is_punct"] = next_token.is_punct
    
    if no_embedding:
        features["lemma"] = token.lemma_
    else:
        for n, dim_val in enumerate(token.vector):
            features["vector-dim-{}".format(n)] = dim_val
    return features

def sent_to_features(sent, no_embedding=False):
    return [word_to_features(token, sent, no_embedding) for token, label in sent]

def sent_to_labels(sent):
    return [label for token, label in sent]

def sent_to_tokens(sent):
    return [token for token, label in sent]

In [60]:
X_train = [sent_to_features(s) for s in train_sents]
y_train = [sent_to_labels(s) for s in train_sents]

X_test = [sent_to_features(s) for s in test_sents]
y_test = [sent_to_labels(s) for s in test_sents]

# Conditional Ransom Field

In [61]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

*** Number of features ***

In [62]:
all_features = set([state for (state, attr) in crf.state_features_])
len(all_features)

441

### Top features

In [63]:
from collections import Counter

def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

In [109]:
from collections import Counter

def print_state_features(state_features, expected_label=None):
    for (attr, label), weight in state_features:
        if expected_label is None or label == expected_label:
            print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(5))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-5:])

Top positive:
3.668192 O        detailed_pos:,
3.546590 O        dep:aux
3.155423 O        is_punct
3.123099 O        detailed_pos:JJR
2.803767 O        pos:ADV

Top negative:
-1.764894 B        prev:dep:pobj
-1.844169 B        next:dep:poss
-1.885828 B        dep:prep
-2.902181 I        detailed_pos:VBZ
-5.505717 I        is_start


#### Top OUTER Features

In [110]:
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(5), expected_label="O")

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-10:], expected_label="O")

Top positive:
3.668192 O        detailed_pos:,
3.546590 O        dep:aux
3.155423 O        is_punct
3.123099 O        detailed_pos:JJR
2.803767 O        pos:ADV

Top negative:
-1.699270 O        prev:dep:oprd


#### Top BEGIN Features 

In [111]:
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(50), expected_label="B")

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-10:], expected_label="B")

Top positive:
2.085285 B        prev:dep:oprd
1.805067 B        detailed_pos:AFX
1.408623 B        vector-dim-158
1.288349 B        prev:dep:relcl
1.240862 B        prev:dep:csubj
1.139142 B        vector-dim-235
1.096165 B        prev:pos:INTJ

Top negative:
-1.604910 B        prev:is_quote
-1.624057 B        dep:punct
-1.632893 B        pos:PUNCT
-1.764894 B        prev:dep:pobj
-1.844169 B        next:dep:poss
-1.885828 B        dep:prep


#### Top INNER Features 

In [112]:
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(50), expected_label="I")

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-10:], expected_label="I")

Top positive:
2.698514 I        detailed_pos:RP
2.061820 I        prev:pos:PRON
1.854688 I        prev:dep:compound
1.659597 I        prev:pos:ADV
1.605263 I        next:dep:expl
1.308960 I        detailed_pos:VB
1.125533 I        vector-dim-158
1.123998 I        vector-dim-119

Top negative:
-1.664534 I        is_end
-2.902181 I        detailed_pos:VBZ
-5.505717 I        is_start


## Evaluation

In [65]:
y_pred = crf.predict(X_test)

In [66]:
def evaluation(y_test, y_pred, target_label):
    def flatten_y(y):
        y_output = set()
        for n_sentence, y_sentence in enumerate(y):
            for n_token, y_token in enumerate(y_sentence):
                if y_token in target_label:
                    y_output.add("{}-{}".format(n_sentence, n_token))
        return y_output
    
    y_test_set = flatten_y(y_test)
    y_pred_set = flatten_y(y_pred)
    y_intersect = y_test_set.intersection(y_pred_set)
    
    precision = len(y_intersect)/ len(y_pred_set) * 100
    recall = len(y_intersect)/ len(y_test_set) * 100
    f1 = 2*precision*recall / (precision+recall)
    
    return precision, recall, f1, len(y_test_set)

def get_result(y_test, y_pred, target_labels=["B", "I"]):
    result_raw = {
        "label": [],
        "precision": [],
        "recall": [],
        "f1": [],
        "support": []
    }
    for label in target_labels+[target_labels]:
        if type(label) is str:
            precision, recall, f1, support = evaluation(y_test, y_pred, [label])
        else:
            precision, recall, f1, support = evaluation(y_test, y_pred, label)
        result_raw["label"].append(label)
        result_raw["precision"].append(precision)
        result_raw["recall"].append(recall)
        result_raw["f1"].append(f1)
        result_raw["support"].append(support)
    return pd.DataFrame(result_raw)[["label", "precision", "recall", "f1", "support"]].set_index("label")
                

In [67]:
def extract_term(tokens, labels, verbose=False):
    if len(tokens) != len(labels):
        raise ValueError("Size of label and token mismatch! {} Vs {}".format(len(tokens), len(labels)))
    aspect_terms = []
    curr_term = ""
    if verbose:
        print("\t", tokens)
        print("\t", labels)
    for n in range(len(tokens)):
        curr_label = labels[n]
        curr_tokens = tokens[n]
        curr_text = curr_tokens.text
            
        if curr_label == "B" or curr_label == "I":
            curr_term += " " + curr_text
        else:
            if len(curr_term) > 0:
                aspect_terms.append(curr_term.strip())
                curr_term = ""
    return aspect_terms

In [68]:
get_result(y_test=y_test, y_pred=y_pred, target_labels=["B", "I"])

Unnamed: 0_level_0,precision,recall,f1,support
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B,83.925234,68.759571,75.589226,653
I,86.337209,63.59743,73.24291,467
"[B, I]",89.761092,70.446429,78.93947,1120


### Confirm result with function from sklearn_crfsuite

In [69]:
from sklearn_crfsuite import metrics

print(metrics.flat_classification_report(
    y_test, y_pred, labels=["B", "I"], digits=5
))

             precision    recall  f1-score   support

          B    0.83925   0.68760   0.75589       653
          I    0.86337   0.63597   0.73243       467

avg / total    0.84931   0.66607   0.74611      1120



In [70]:
def to_bin_label(labels):
    for y_sent in labels:
        yield ["O" if y == "O" else "Target" for y in y_sent]
y_test_bin = list(to_bin_label(y_test))
y_pred_bin = list(to_bin_label(y_pred))
print(metrics.flat_classification_report(
    y_test_bin, y_pred_bin, labels=["Target"], digits=5
))

             precision    recall  f1-score   support

     Target    0.89761   0.70446   0.78939      1120

avg / total    0.89761   0.70446   0.78939      1120



# RandomForest
 - non-sequential classifier
 - scale invariant model

### Prepare data

In [71]:
from pycrfsuite import ItemSequence

In [72]:
FEATURE_INDEX = sorted(list(all_features))

In [73]:
def sent_to_vector(sent, feature_index):
    sent_features = ItemSequence(sent).items()
    output = []
    for token_feature in sent_features:
        output.append([token_feature.get(f, 0) for f in feature_index])
    return output

In [74]:
X_train_vec = [sent_to_vector(x, FEATURE_INDEX) for x in X_train]
X_test_vec = [sent_to_vector(x, FEATURE_INDEX) for x in X_test]

In [75]:
def flatten(sent):
    return [t for tokens in sent for t in tokens]

X_train_vec_flat = flatten(X_train_vec)
y_train_flat = flatten(y_train)

X_test_vec_flat = flatten(X_test_vec)

### Train model

In [76]:
from sklearn.ensemble import RandomForestClassifier
RF_clf = RandomForestClassifier()
RF_clf.fit(X_train_vec_flat, y_train_flat)
y_pred_RF_flat = RF_clf.predict(X_test_vec_flat)

In [77]:
def reshape_y(flat_list, ref_list):
    input_list = list(flat_list)
    output = []
    idx_from = 0
    for n, y_sent in enumerate(ref_list):
        idx_to = idx_from+len(y_sent)
        output.append(input_list[idx_from:idx_to])
        idx_from += len(output[-1])
    return output

In [78]:
RF_feature_importances = list(zip(RF_clf.feature_importances_, FEATURE_INDEX))
RF_feature_importances.sort(reverse=True)

In [113]:
for feature_name, importance in RF_feature_importances[:10]:
    print(feature_name, importance)

0.0533708662614 relative_loc
0.0391753269525 vector-dim-17
0.0254787031233 prev:dep:compound
0.0217325515914 vector-dim-10
0.0182564627003 pos:NOUN
0.0158516118273 vector_l2_norm
0.0144110452866 vector-dim-232
0.0119619876673 vector-dim-9
0.011783297662 prev:pos:NOUN
0.00924481714129 vector-dim-99


### Evaluation RandomForest

In [80]:
y_pred_RF = reshape_y(y_pred_RF_flat, y_test)
get_result(y_test=y_test, y_pred=y_pred_RF, target_labels=["B", "I"])

Unnamed: 0_level_0,precision,recall,f1,support
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B,75.951557,67.228178,71.324127,653
I,77.542373,39.186296,52.062589,467
"[B, I]",88.083538,64.017857,74.146846,1120


In [81]:
print(metrics.flat_classification_report(
    y_test, y_pred_RF, labels=["B", "I"], digits=5
))

             precision    recall  f1-score   support

          B    0.75952   0.67228   0.71324       653
          I    0.77542   0.39186   0.52063       467

avg / total    0.76615   0.55536   0.63293      1120



# CRF without Word Embedding

In [82]:
X_train_no_embedding = [sent_to_features(s, no_embedding=True) for s in train_sents]

In [83]:
crf_no_embedding = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    min_freq=10
)
crf_no_embedding.fit(X_train_no_embedding, y_train)
y_pred_no_embedding = crf_no_embedding.predict(X_test)
get_result(y_test=y_test, y_pred=y_pred_no_embedding, target_labels=["B", "I"])

Unnamed: 0_level_0,precision,recall,f1,support
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B,52.091255,20.980092,29.912664,653
I,57.201646,29.764454,39.15493,467
"[B, I]",55.731225,25.178571,34.686347,1120


In [84]:
print(metrics.flat_classification_report(
    y_test, y_pred_no_embedding, labels=["B", "I"], digits=5
))

             precision    recall  f1-score   support

          B    0.52091   0.20980   0.29913       653
          I    0.57202   0.29764   0.39155       467

avg / total    0.54222   0.24643   0.33766      1120



In [116]:
print("Top positive:")
print_state_features(Counter(crf_no_embedding.state_features_).most_common(5))

print("\nTop negative:")
print_state_features(Counter(crf_no_embedding.state_features_).most_common()[-5:])

Top positive:
5.815385 O        lemma:laptop
5.305817 B        lemma:price
5.160203 B        lemma:software
4.924636 B        lemma:program
4.829737 I        lemma:application

Top negative:
-1.866842 O        lemma:shut
-1.880739 O        lemma:charge
-2.151915 O        lemma:load
-2.492014 O        lemma:carry
-2.828713 O        lemma:ship
