# Config

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format="retina"
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

import os
import xmltodict
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import spacy

from lib.loader import load_data, iterate_sentence
from lib.ml import add_feature

pd.set_option("display.max_columns", 100)

DATA_DIR = "./data"
TRAINING_DATA = os.path.join(DATA_DIR, "Laptops_Train_v2.xml")
TESTING_DATA = os.path.join(DATA_DIR, "Laptops_Test_Gold.xml")

In [3]:
nlp = spacy.load('en_core_web_lg')

import regex as re
from spacy.tokenizer import Tokenizer

# this regex is taken from NLTK's WordPunctTokenizer
infix_re = re.compile(r'\w+|[^\w\s]+')

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

nlp.tokenizer = custom_tokenizer(nlp)

# Preparing data

In [4]:
train_full_info = list(iterate_sentence(TRAINING_DATA, nlp))
test_full_info = list(iterate_sentence(TESTING_DATA, nlp))
train_sents = [list(zip(s[0], s[4])) for s in train_full_info]
test_sents = [list(zip(s[0], s[4])) for s in test_full_info]

In [38]:
def word_to_features(token, sent, no_embedding=False):
    loc = token.i
    sent_token_sum = len(sent)
    
    
    contain_upper_fn = lambda t: int(len([c for c in t.text if c.isupper()])!=0)
    
    # current token feature
    features = {
        "bias": 1.0,
        "relative_loc": loc/float(sent_token_sum),
        "len": len(token.text),
        "pos": token.pos_,
        "detailed_pos": token.tag_,
        "dep": token.dep_,
        "vector_l2_norm": token.vector_norm, 
        "like_num": int(token.like_num),
        "is_quote": int(token.is_quote),
        "is_head": int(token.head.text == token.text),
        "is_alpha": int(token.is_alpha),
        "is_digit": int(token.is_alpha),
        "is_contain_upper": contain_upper_fn(token),
        "is_punct": int(token.is_punct),
        "is_end": int(loc == sent_token_sum-1),
        "is_start": int(loc == 0)
    }
    
    # previous token feature
    if loc > 0:
        prev_token = sent[loc-1][0]
        features["prev:pos"] = prev_token.pos_
        features["prev:dep"] = prev_token.dep_
        features["prev:like_num"] = prev_token.like_num
        features["prev:is_quote"] = prev_token.is_quote
        features["prev:is_head"] = int(prev_token.head.text == prev_token.text)
        features["prev:is_contain_upper"] = contain_upper_fn(prev_token)
        features["prev:is_punct"] = prev_token.is_punct
    
    # next token feature
    if loc != sent_token_sum-1:
        next_token = sent[loc+1][0]
        features["next:pos"] = next_token.pos_
        features["next:dep"] = next_token.dep_
        features["next:like_num"] = next_token.like_num
        features["next:is_quote"] = next_token.is_quote
        features["next:is_head"] = int(next_token.head.text == next_token.text)
        features["next:is_contain_upper"] = contain_upper_fn(next_token)
        features["next:is_punct"] = next_token.is_punct
    
    if no_embedding:
        features["lemma"] = token.lemma_
    else:
        for n, dim_val in enumerate(token.vector):
            features["vector-dim-{}".format(n)] = dim_val
    return features

def sent_to_features(sent, no_embedding=False):
    return [word_to_features(token, sent, no_embedding) for token, label in sent]

def sent_to_labels(sent):
    return [label for token, label in sent]

def sent_to_tokens(sent):
    return [token for token, label in sent]

In [6]:
X_train = [sent_to_features(s) for s in train_sents]
y_train = [sent_to_labels(s) for s in train_sents]

X_test = [sent_to_features(s) for s in test_sents]
y_test = [sent_to_labels(s) for s in test_sents]

# Conditional Ransom Field

In [7]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

*** Number of features ***

In [8]:
all_features = set([state for (state, attr) in crf.state_features_])
len(all_features)

441

### Top features

In [9]:
from collections import Counter

def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

In [10]:
from collections import Counter

def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(100))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[100:])

Top positive:
3.668192 O        detailed_pos:,
3.546590 O        dep:aux
3.155423 O        is_punct
3.123099 O        detailed_pos:JJR
2.803767 O        pos:ADV
2.698514 I        detailed_pos:RP
2.596708 O        pos:PRON
2.596708 O        detailed_pos:PRP
2.442049 O        dep:advmod
2.377897 O        detailed_pos:WDT
2.198518 O        next:dep:csubj
2.149359 O        prev:dep:dative
2.142031 O        detailed_pos:JJS
2.126637 O        dep:acomp
2.085285 B        prev:dep:oprd
2.061820 I        prev:pos:PRON
2.029132 O        prev:dep:relcl
1.854688 I        prev:dep:compound
1.851937 O        detailed_pos:$
1.850545 O        detailed_pos:TO
1.805067 B        detailed_pos:AFX
1.780178 O        detailed_pos:-RRB-
1.727651 O        pos:SPACE
1.727651 O        detailed_pos:_SP
1.727651 O        dep:
1.712839 O        dep:cc
1.659597 I        prev:pos:ADV
1.648031 O        detailed_pos::
1.621465 O        pos:X
1.605263 I        next:dep:expl
1.556973 O        vector-dim-31
1.469749 O    

## Evaluation

In [11]:
y_pred = crf.predict(X_test)

In [12]:
def evaluation(y_test, y_pred, target_label):
    def flatten_y(y):
        y_output = set()
        for n_sentence, y_sentence in enumerate(y):
            for n_token, y_token in enumerate(y_sentence):
                if y_token in target_label:
                    y_output.add("{}-{}".format(n_sentence, n_token))
        return y_output
    
    y_test_set = flatten_y(y_test)
    y_pred_set = flatten_y(y_pred)
    y_intersect = y_test_set.intersection(y_pred_set)
    
    precision = len(y_intersect)/ len(y_pred_set) * 100
    recall = len(y_intersect)/ len(y_test_set) * 100
    f1 = 2*precision*recall / (precision+recall)
    
    return precision, recall, f1, len(y_test_set)

def get_result(y_test, y_pred, target_labels=["B", "I"]):
    result_raw = {
        "label": [],
        "precision": [],
        "recall": [],
        "f1": [],
        "support": []
    }
    for label in target_labels+[target_labels]:
        if type(label) is str:
            precision, recall, f1, support = evaluation(y_test, y_pred, [label])
        else:
            precision, recall, f1, support = evaluation(y_test, y_pred, label)
        result_raw["label"].append(label)
        result_raw["precision"].append(precision)
        result_raw["recall"].append(recall)
        result_raw["f1"].append(f1)
        result_raw["support"].append(support)
    return pd.DataFrame(result_raw)[["label", "precision", "recall", "f1", "support"]].set_index("label")
                

In [13]:
def extract_term(tokens, labels, verbose=False):
    if len(tokens) != len(labels):
        raise ValueError("Size of label and token mismatch! {} Vs {}".format(len(tokens), len(labels)))
    aspect_terms = []
    curr_term = ""
    if verbose:
        print("\t", tokens)
        print("\t", labels)
    for n in range(len(tokens)):
        curr_label = labels[n]
        curr_tokens = tokens[n]
        curr_text = curr_tokens.text
            
        if curr_label == "B" or curr_label == "I":
            curr_term += " " + curr_text
        else:
            if len(curr_term) > 0:
                aspect_terms.append(curr_term.strip())
                curr_term = ""
    return aspect_terms

In [14]:
get_result(y_test=y_test, y_pred=y_pred, target_labels=["B", "I"])

Unnamed: 0_level_0,precision,recall,f1,support
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B,83.925234,68.759571,75.589226,653
I,86.337209,63.59743,73.24291,467
"[B, I]",89.761092,70.446429,78.93947,1120


### Confirm result with function from sklearn_crfsuite

In [15]:
from sklearn_crfsuite import metrics

print(metrics.flat_classification_report(
    y_test, y_pred, labels=["B", "I"], digits=5
))

             precision    recall  f1-score   support

          B    0.83925   0.68760   0.75589       653
          I    0.86337   0.63597   0.73243       467

avg / total    0.84931   0.66607   0.74611      1120



In [16]:
def to_bin_label(labels):
    for y_sent in labels:
        yield ["O" if y == "O" else "Target" for y in y_sent]
y_test_bin = list(to_bin_label(y_test))
y_pred_bin = list(to_bin_label(y_pred))
print(metrics.flat_classification_report(
    y_test_bin, y_pred_bin, labels=["Target"], digits=5
))

             precision    recall  f1-score   support

     Target    0.89761   0.70446   0.78939      1120

avg / total    0.89761   0.70446   0.78939      1120



# RandomForest
 - non-sequential classifier
 - scale invariant model

### Prepare data

In [17]:
from pycrfsuite import ItemSequence

In [18]:
FEATURE_INDEX = sorted(list(all_features))

In [19]:
def sent_to_vector(sent, feature_index):
    sent_features = ItemSequence(sent).items()
    output = []
    for token_feature in sent_features:
        output.append([token_feature.get(f, 0) for f in feature_index])
    return output

In [20]:
X_train_vec = [sent_to_vector(x, FEATURE_INDEX) for x in X_train]
X_test_vec = [sent_to_vector(x, FEATURE_INDEX) for x in X_test]

In [21]:
def flatten(sent):
    return [t for tokens in sent for t in tokens]

X_train_vec_flat = flatten(X_train_vec)
y_train_flat = flatten(y_train)

X_test_vec_flat = flatten(X_test_vec)

### Train model

In [22]:
from sklearn.ensemble import RandomForestClassifier
RF_clf = RandomForestClassifier()
RF_clf.fit(X_train_vec_flat, y_train_flat)
y_pred_RF_flat = RF_clf.predict(X_test_vec_flat)

In [23]:
def reshape_y(flat_list, ref_list):
    input_list = list(flat_list)
    output = []
    idx_from = 0
    for n, y_sent in enumerate(ref_list):
        idx_to = idx_from+len(y_sent)
        output.append(input_list[idx_from:idx_to])
        idx_from += len(output[-1])
    return output

In [26]:
RF_feature_importances = list(zip(RF_clf.feature_importances_, FEATURE_INDEX))
RF_feature_importances.sort(reverse=True)

In [27]:
for feature_name, importance in RF_feature_importances:
    print(feature_name, importance)

0.0522828902376 relative_loc
0.0260680652243 vector-dim-17
0.0246013675939 prev:dep:compound
0.0224860607237 vector_l2_norm
0.015329761612 vector-dim-268
0.012997556178 pos:NOUN
0.0127143628482 prev:pos:NOUN
0.0124479433648 vector-dim-31
0.0104465659439 vector-dim-85
0.01034813366 vector-dim-107
0.00991900647074 vector-dim-110
0.00967886490771 vector-dim-49
0.00947070041968 dep:compound
0.00916217615515 vector-dim-260
0.00826463572539 vector-dim-232
0.00795195893151 len
0.00793708168858 vector-dim-119
0.00793062594472 prev:pos:DET
0.00784465821257 next:pos:NOUN
0.00724711517317 vector-dim-194
0.00683393315939 vector-dim-10
0.0066262816767 vector-dim-150
0.00652152388343 vector-dim-69
0.00641674738197 vector-dim-279
0.006290154347 vector-dim-111
0.00628730653374 vector-dim-259
0.00624020280791 vector-dim-283
0.00613520535889 vector-dim-272
0.00611763657796 vector-dim-226
0.00608457686405 vector-dim-9
0.0060505561804 detailed_pos:NN
0.00592940774409 prev:dep:amod
0.00588415998069 prev:is

### Evaluation RandomForest

In [29]:
y_pred_RF = reshape_y(y_pred_RF_flat, y_test)
get_result(y_test=y_test, y_pred=y_pred_RF, target_labels=["B", "I"])

Unnamed: 0_level_0,precision,recall,f1,support
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B,75.344828,66.921899,70.884023,653
I,74.285714,38.972163,51.123596,467
"[B, I]",87.151515,64.196429,73.933162,1120


In [30]:
print(metrics.flat_classification_report(
    y_test, y_pred_RF, labels=["B", "I"], digits=5
))

             precision    recall  f1-score   support

          B    0.75345   0.66922   0.70884       653
          I    0.74286   0.38972   0.51124       467

avg / total    0.74903   0.55268   0.62645      1120



# CRF without Word Embedding

In [39]:
X_train_no_embedding = [sent_to_features(s, no_embedding=True) for s in train_sents]

In [45]:
crf_no_embedding = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    min_freq=10
)
crf_no_embedding.fit(X_train_no_embedding, y_train)
y_pred_no_embedding = crf_no_embedding.predict(X_test)
get_result(y_test=y_test, y_pred=y_pred_no_embedding, target_labels=["B", "I"])

Unnamed: 0_level_0,precision,recall,f1,support
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B,52.091255,20.980092,29.912664,653
I,57.201646,29.764454,39.15493,467
"[B, I]",55.731225,25.178571,34.686347,1120


In [48]:
print(metrics.flat_classification_report(
    y_test, y_pred_no_embedding, labels=["B", "I"], digits=5
))

             precision    recall  f1-score   support

          B    0.52091   0.20980   0.29913       653
          I    0.57202   0.29764   0.39155       467

avg / total    0.54222   0.24643   0.33766      1120



In [46]:
print("Top positive:")
print_state_features(Counter(crf_no_embedding.state_features_).most_common(100))

print("\nTop negative:")
print_state_features(Counter(crf_no_embedding.state_features_).most_common()[100:])

Top positive:
5.815385 O        lemma:laptop
5.305817 B        lemma:price
5.160203 B        lemma:software
4.924636 B        lemma:program
4.829737 I        lemma:application
4.639519 O        lemma:computer
4.628286 O        lemma:macbook
4.621138 B        lemma:gaming
4.599910 I        lemma:warranty
4.589468 B        lemma:feature
4.559310 O        is_end
4.387350 I        lemma:software
4.369957 B        lemma:cost
4.156876 B        lemma:keyboard
4.065465 O        lemma:problem
4.027238 B        lemma:speed
4.000846 B        lemma:warranty
3.998796 I        lemma:keyboard
3.942846 B        lemma:performance
3.877771 O        lemma:be
3.862964 B        lemma:motherboard
3.775243 B        lemma:size
3.764023 O        lemma:t
3.743164 B        lemma:build
3.699970 O        lemma:netbook
3.548030 I        lemma:program
3.508238 B        lemma:application
3.482124 B        lemma:battery
3.475038 O        lemma:notebook
3.452138 O        dep:aux
3.404039 B        lemma:touchpad
3.38405

In [44]:
len(set([state for (state, attr) in crf_no_embedding.state_features_]))

1923

In [47]:
len(set([state for (state, attr) in crf_no_embedding.state_features_]))

695