# Config

In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
%matplotlib inline
%config InlineBackend.figure_format="retina"
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

import os
import xmltodict
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import spacy

from lib.loader import load_data, iterate_sentence
from lib.ml import add_feature

pd.set_option("display.max_columns", 100)

DATA_DIR = "./data"
TRAINING_DATA = os.path.join(DATA_DIR, "Laptops_Train_v2.xml")
TESTING_DATA = os.path.join(DATA_DIR, "Laptops_Test_Gold.xml")

In [3]:
nlp = spacy.load('en_core_web_lg')

import regex as re
from spacy.tokenizer import Tokenizer

# this regex is taken from NLTK's WordPunctTokenizer
infix_re = re.compile(r'\w+|[^\w\s]+')

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

nlp.tokenizer = custom_tokenizer(nlp)

# Preparing data

In [44]:
def word_to_features(token, sent):
    loc = token.i
    sent_token_sum = len(sent)
    
    # current token feature
    features = {
        "bias": 1.0,
        "relative_loc": loc/float(sent_token_sum),
        "len": len(token.text),
        "pos": token.pos_,
        "detailed_pos": token.tag_,
        "dep": token.dep_,
        "vector_l2_norm": token.vector_norm, 
        "like_num": int(token.like_num),
        "is_quote": int(token.is_quote),
        "is_head": int(token.head.text == token.text),
        "is_alpha": int(token.is_alpha),
        "is_digit": int(token.is_alpha),
        "is_contain_upper": int(len([c for c in token.text if c.isupper()])!=0),
        "is_punct": int(token.is_punct),
        "is_end": int(loc == sent_token_sum-1)
    }
    
    for n, dim_val in enumerate(token.vector):
        features["vector-dim-{}".format(n)] = dim_val
    return features

def sent_to_features(sent):
    return [word_to_features(token, sent) for token, label in sent]

def sent_to_labels(sent):
    return [label for token, label in sent]

def sent_to_tokens(sent):
    return [token for token, label in sent]

In [23]:
train_full_info = list(iterate_sentence(TRAINING_DATA, nlp))
test_full_info = list(iterate_sentence(TESTING_DATA, nlp))

In [24]:
train_sents = [list(zip(s[0], s[4])) for s in train_full_info]
test_sents = [list(zip(s[0], s[4])) for s in test_full_info]

In [45]:
X_train = [sent_to_features(s) for s in train_sents]
y_train = [sent_to_labels(s) for s in train_sents]

X_test = [sent_to_features(s) for s in test_sents]
y_test = [sent_to_labels(s) for s in test_sents]

# Conditional Ransom Field

In [46]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

*** Number of features ***

In [155]:
all_features = set([state for (state, attr) in crf.state_features_])
len(all_features)

310

In [156]:
all_features

{'bias',
 'dep:',
 'dep:ROOT',
 'dep:acl',
 'dep:acomp',
 'dep:advcl',
 'dep:advmod',
 'dep:amod',
 'dep:appos',
 'dep:attr',
 'dep:aux',
 'dep:auxpass',
 'dep:case',
 'dep:cc',
 'dep:ccomp',
 'dep:compound',
 'dep:conj',
 'dep:csubj',
 'dep:dative',
 'dep:dep',
 'dep:det',
 'dep:dobj',
 'dep:intj',
 'dep:mark',
 'dep:meta',
 'dep:neg',
 'dep:nmod',
 'dep:npadvmod',
 'dep:nsubj',
 'dep:nsubjpass',
 'dep:nummod',
 'dep:oprd',
 'dep:parataxis',
 'dep:pcomp',
 'dep:pobj',
 'dep:poss',
 'dep:preconj',
 'dep:predet',
 'dep:prep',
 'dep:prt',
 'dep:punct',
 'dep:quantmod',
 'dep:relcl',
 'dep:xcomp',
 'detailed_pos:$',
 "detailed_pos:''",
 'detailed_pos:,',
 'detailed_pos:-LRB-',
 'detailed_pos:-RRB-',
 'detailed_pos:.',
 'detailed_pos::',
 'detailed_pos:AFX',
 'detailed_pos:CC',
 'detailed_pos:CD',
 'detailed_pos:DT',
 'detailed_pos:FW',
 'detailed_pos:HYPH',
 'detailed_pos:IN',
 'detailed_pos:JJ',
 'detailed_pos:JJR',
 'detailed_pos:JJS',
 'detailed_pos:LS',
 'detailed_pos:MD',
 'detailed_

### Top features

In [49]:
from collections import Counter

def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(100))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[100:])

Top positive:
3.959809 O        dep:aux
3.455222 O        detailed_pos:,
3.300417 O        detailed_pos:JJR
3.276322 O        detailed_pos:TO
3.267580 O        is_end
3.000836 O        detailed_pos::
2.824390 O        is_punct
2.749453 O        detailed_pos:WDT
2.683999 O        pos:PRON
2.683999 O        detailed_pos:PRP
2.534575 O        dep:dative
2.507640 O        dep:csubj
2.260667 O        dep:poss
2.195689 O        pos:ADV
2.113279 O        dep:advmod
2.059393 O        detailed_pos:JJS
2.036945 B        detailed_pos:AFX
1.884319 O        dep:auxpass
1.877257 O        dep:intj
1.787139 O        dep:cc
1.785532 I        detailed_pos:HYPH
1.697047 O        detailed_pos:-RRB-
1.649155 O        dep:acomp
1.633747 I        vector-dim-112
1.530817 O        pos:CCONJ
1.530817 O        detailed_pos:CC
1.510473 O        detailed_pos:WP
1.507405 O        vector-dim-31
1.475837 B        vector-dim-158
1.409466 I        vector-dim-158
1.360726 O        detailed_pos:MD
1.327163 O        detai

## Evaluation

In [129]:
def evaluation(y_test, y_pred, target_label):
    def flatten_y(y):
        y_output = set()
        for n_sentence, y_sentence in enumerate(y):
            for n_token, y_token in enumerate(y_sentence):
                if y_token in target_label:
                    y_output.add("{}-{}".format(n_sentence, n_token))
        return y_output
    
    y_test_set = flatten_y(y_test)
    y_pred_set = flatten_y(y_pred)
    y_intersect = y_test_set.intersection(y_pred_set)
    
    precision = len(y_intersect)/ len(y_pred_set) * 100
    recall = len(y_intersect)/ len(y_test_set) * 100
    f1 = 2*precision*recall / (precision+recall)
    
    return precision, recall, f1, len(y_test_set)

def get_result(y_test, y_pred, target_labels=["B", "I"]):
    result_raw = {
        "label": [],
        "precision": [],
        "recall": [],
        "f1": [],
        "support": []
    }
    for label in target_labels+[target_labels]:
        if type(label) is str:
            precision, recall, f1, support = evaluation(y_test, y_pred, [label])
        else:
            precision, recall, f1, support = evaluation(y_test, y_pred, label)
        result_raw["label"].append(label)
        result_raw["precision"].append(precision)
        result_raw["recall"].append(recall)
        result_raw["f1"].append(f1)
        result_raw["support"].append(support)
    return pd.DataFrame(result_raw)[["label", "precision", "recall", "f1", "support"]].set_index("label")
                

In [130]:
def extract_term(tokens, labels, verbose=False):
    if len(tokens) != len(labels):
        raise ValueError("Size of label and token mismatch! {} Vs {}".format(len(tokens), len(labels)))
    aspect_terms = []
    curr_term = ""
    if verbose:
        print("\t", tokens)
        print("\t", labels)
    for n in range(len(tokens)):
        curr_label = labels[n]
        curr_tokens = tokens[n]
        curr_text = curr_tokens.text
            
        if curr_label == "B" or curr_label == "I":
            curr_term += " " + curr_text
        else:
            if len(curr_term) > 0:
                aspect_terms.append(curr_term.strip())
                curr_term = ""
    return aspect_terms

In [234]:
get_result(y_test=y_test, y_pred=y_pred, target_labels=["B", "I"])

Unnamed: 0_level_0,precision,recall,f1,support
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B,81.48855,65.390505,72.557349,653
I,85.060976,59.743041,70.188679,467
"[B, I]",90.140845,68.571429,77.890467,1120


### Confirm result with function from sklearn_crfsuite

In [132]:
from sklearn_crfsuite import metrics

print(metrics.flat_classification_report(
    y_test, y_pred, labels=["B", "I"], digits=5
))

             precision    recall  f1-score   support

          B    0.81489   0.65391   0.72557       653
          I    0.85061   0.59743   0.70189       467

avg / total    0.82978   0.63036   0.71570      1120



In [136]:
def to_bin_label(labels):
    for y_sent in labels:
        yield ["O" if y == "O" else "Target" for y in y_sent]
y_test_bin = list(to_bin_label(y_test))
y_pred_bin = list(to_bin_label(y_pred))
print(metrics.flat_classification_report(
    y_test_bin, y_pred_bin, labels=["Target"], digits=5
))

             precision    recall  f1-score   support

     Target    0.90141   0.68571   0.77890      1120

avg / total    0.90141   0.68571   0.77890      1120



# RandomForest
 - non-sequential classifier
 - scale invariant model

### Prepare data

In [139]:
from pycrfsuite import ItemSequence

In [162]:
FEATURE_INDEX = sorted(list(all_features))

In [166]:
def sent_to_vector(sent, feature_index):
    sent_features = ItemSequence(sent).items()
    output = []
    for token_feature in sent_features:
        output.append([token_feature.get(f, 0) for f in feature_index])
    return output

In [167]:
X_train_vec = [sent_to_vector(x, FEATURE_INDEX) for x in X_train]
X_test_vec = [sent_to_vector(x, FEATURE_INDEX) for x in X_test]

In [176]:
def flatten(sent):
    return [t for tokens in sent for t in tokens]

X_train_vec_flat = flatten(X_train_vec)
y_train_flat = flatten(y_train)

X_test_vec_flat = flatten(X_test_vec)

### Train model

In [236]:
from sklearn.ensemble import RandomForestClassifier
RF_clf = RandomForestClassifier()
RF_clf.fit(X_train_vec_flat, y_train_flat)
y_pred_RF_flat = RF_clf.predict(X_test_vec_flat)

In [237]:
def reshape_y(flat_list, ref_list):
    input_list = list(flat_list)
    output = []
    idx_from = 0
    for n, y_sent in enumerate(ref_list):
        idx_to = idx_from+len(y_sent)
        output.append(input_list[idx_from:idx_to])
        idx_from += len(output[-1])
    return output

In [238]:
y_pred_RF = reshape_y(y_pred_RF_flat, y_test)

In [239]:
get_result(y_test=y_test, y_pred=y_pred_RF, target_labels=["B", "I"])

Unnamed: 0_level_0,precision,recall,f1,support
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B,70.818505,60.949464,65.514403,653
I,54.471545,28.69379,37.587658,467
"[B, I]",84.405941,60.892857,70.746888,1120


In [270]:
RF_feature_importances = list(zip(RF_clf.feature_importances_, FEATURE_INDEX))
RF_feature_importances.sort(reverse=True)

In [271]:
for feature_name, importance in RF_feature_importances:
    print(feature_name, importance)

0.178524165811 relative_loc
0.023724291537 vector_l2_norm
0.0164847582413 dep:compound
0.0156490337518 vector-dim-161
0.0155782402457 detailed_pos:NN
0.0152483005732 vector-dim-17
0.0123541990703 pos:NOUN
0.011960476462 dep:pobj
0.0113115992217 vector-dim-10
0.0112662122995 dep:dobj
0.0106356740226 dep:nsubj
0.0106315703843 vector-dim-87
0.00983142424017 vector-dim-9
0.00931832181505 vector-dim-135
0.00929265780586 vector-dim-279
0.00921060495017 vector-dim-268
0.00874426870439 vector-dim-85
0.00799131567236 vector-dim-111
0.00782130441651 vector-dim-31
0.00755132922241 vector-dim-69
0.00684740597434 vector-dim-110
0.0068223288995 vector-dim-229
0.00680280215403 vector-dim-282
0.00662562970732 vector-dim-166
0.00659636243889 dep:conj
0.00656544125568 vector-dim-235
0.00615986475096 vector-dim-29
0.00607989314314 vector-dim-8
0.00606419409652 vector-dim-259
0.00601883829087 vector-dim-49
0.0058752908137 vector-dim-128
0.00581713613827 vector-dim-287
0.00576011294294 vector-dim-242
0.005