In [None]:
import pandas as pd
import numpy as np
import json
import pickle
import argparse
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from os.path import join, split, splitext

In [None]:
# Dataset tag; interpolated into the train/validation/test CSV names, the
# document-vector pickle name and the saved CRF file name.
dataset = "medium"
# Model tag; interpolated into the document-vector pickle name and the saved CRF file name.
model = "medium"

In [None]:
print("loading dataset....")


def _with_bio_prefix(row):
    """Return the row's document type prefixed with "B-" when `pages` is 1, else "I-"."""
    prefix = "B-" if row["pages"] == 1 else "I-"
    return prefix + row["document_type"]


# Only these three columns are read from each split file.
_split_columns = ["document_type", "pages", "process_id"]
train = pd.read_csv("../train_{}.csv".format(dataset), usecols=_split_columns)
valid = pd.read_csv("../validation_{}.csv".format(dataset), usecols=_split_columns)
test = pd.read_csv("../test_parts_{}.csv".format(dataset), usecols=_split_columns)

# Rewrite the labels in B-/I- form.
# NOTE(review): presumably `pages` is the page number inside a document, so
# pages == 1 marks the first page of a document -- confirm against the CSVs.
for _split in (train, valid, test):
    _split["document_type"] = _split.apply(_with_bio_prefix, axis=1)

# NOTE(review): pickle.load can execute arbitrary code; only load vector files
# produced by a trusted pipeline.
with open("../document_vectors_{}_{}.pkl".format(model, dataset), "rb") as vectors_file:
    vectors = pickle.load(vectors_file)

# Flat (one entry per CSV row) features and labels; they are grouped per process later.
X_train = vectors["train_vectors"]
X_valid = vectors["valid_vectors"]
X_test = vectors["test_vectors"]
y_train, y_valid, y_test = (
    frame["document_type"].tolist() for frame in (train, valid, test)
)
print("....done.")

In [None]:
def data_to_process(data, vectors):
    """Group per-row vectors and labels into one sequence per process.

    Args:
        data: DataFrame with "process_id" and "document_type" columns. Any
            index is accepted; rows are addressed by position.
        vectors: One feature vector per row of `data`, in the same row order.
            Either an object exposing `.tolist()` (e.g. a NumPy array) or a
            plain sequence.

    Returns:
        Tuple `(xs, ys)` of parallel lists with one entry per distinct
        process_id, in groupby's default sorted key order. `xs[k]` holds the
        vectors of the k-th process and `ys[k]` the matching document_type
        labels, both in original row order.

    Raises:
        ValueError: if `vectors` and `data` do not have the same number of rows.
    """
    rows = vectors.tolist() if hasattr(vectors, "tolist") else list(vectors)
    if len(rows) != len(data):
        raise ValueError(
            "vectors has {} rows but data has {}".format(len(rows), len(data))
        )
    tags = data["document_type"].tolist()

    # Group *positional* row numbers instead of index labels: the previous
    # .groups + .iloc combination only worked for a default RangeIndex. Working
    # on local lists also leaves the caller's DataFrame unmodified.
    positions = pd.Series(np.arange(len(data)))
    xs, ys = [], []
    for _, members in positions.groupby(np.asarray(data["process_id"])):
        picked = members.tolist()
        xs.append([rows[p] for p in picked])
        ys.append([tags[p] for p in picked])
    return xs, ys

In [None]:
print("Converting data to sentences...")
# Read the flat vectors from `vectors` rather than from X_*: this cell rebinds
# X_* to the grouped sequences, so sourcing the input from X_* fed already
# grouped data back in (and failed) whenever the cell was run a second time.
X_train, y_train = data_to_process(train, vectors["train_vectors"])
X_valid, y_valid = data_to_process(valid, vectors["valid_vectors"])
X_test, y_test = data_to_process(test, vectors["test_vectors"])
print("...done!")

In [None]:
# Sanity check: number of grouped training sequences in the features and in the labels.
len(X_train), len(y_train)

In [None]:
def data2feat(data):
    """Convert sequences of vectors into sequences of feature dicts.

    Every vector (token) becomes a dict that maps the position of each
    component, rendered as a string ("0", "1", ...), to the component's value.
    The nesting of `data` (sequence -> token) is preserved in the result.
    """
    return [
        [
            dict((str(position), value) for position, value in enumerate(token))
            for token in sentence
        ]
        for sentence in data
    ]

In [None]:
# Replace each split's vector sequences with feature-dict sequences.
# NOTE: X_* are overwritten in place, so run this cell exactly once after the
# grouping cell; running it again would feed dicts (not vectors) into data2feat.
X_train = data2feat(X_train)
X_valid = data2feat(X_valid)
X_test = data2feat(X_test)

In [None]:
# Baseline model: L-BFGS training with fixed c1 = c2 = 0.1 and at most 100
# iterations, fitted on the training sequences.
baseline_settings = {
    "verbose": True,
    "algorithm": 'lbfgs',
    "c1": 0.1,
    "c2": 0.1,
    "max_iterations": 100,
    "all_possible_transitions": True,
    "all_possible_states": True,
}
crf = sklearn_crfsuite.CRF(**baseline_settings)
crf.fit(X_train, y_train)

In [None]:
# Label set reported by the fitted baseline CRF; reused below for the
# classification reports and for the F1 scorer of the hyperparameter search.
labels = crf.classes_

In [None]:
# Sort by the label text after its first character, then by that first
# character, so the B- and I- rows of one document type end up adjacent.
sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))

y_pred = crf.predict(X_test)
token_report = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
)
print(token_report)
token_accuracy = metrics.flat_accuracy_score(y_test, y_pred)
print(token_accuracy)

In [None]:
# `import scipy` alone does not guarantee that the `stats` submodule is loaded.
import scipy.stats
from sklearn.metrics import make_scorer

# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; their contents live in sklearn.model_selection.
try:
    from sklearn.model_selection import RandomizedSearchCV, cross_val_score
except ImportError:  # very old scikit-learn that only ships the legacy modules
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import RandomizedSearchCV


# define fixed parameters and parameters to search
# (this rebinds `crf` to an unfitted estimator; `labels` was captured earlier)
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True,
    all_possible_states=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search: 50 sampled (c1, c2) pairs, 3-fold CV on the validation sequences.
# random_state pins the sampled candidates so a re-run reproduces the search.
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=0)
rs.fit(X_valid, y_valid)

In [None]:
# Summary of the search. The best estimator is only inspected here; the final
# model is refit from rs.best_params_ in a later cell.
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
best_model_size = rs.best_estimator_.size_
print('model size: {:0.2f}M'.format(best_model_size / 1000000))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')


_x = [s.parameters['c1'] for s in rs.grid_scores_]
_y = [s.parameters['c2'] for s in rs.grid_scores_]
_c = [s.mean_validation_score for s in rs.grid_scores_]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

In [None]:
# Final model: same configuration as the baseline, but with the c1/c2 found by
# the randomized search and up to 1000 iterations, fitted on the training sequences.
final_settings = {
    "verbose": True,
    "algorithm": 'lbfgs',
    "c1": rs.best_params_['c1'],
    "c2": rs.best_params_['c2'],
    "max_iterations": 1000,
    "all_possible_transitions": True,
    "all_possible_states": True,
}
crf = sklearn_crfsuite.CRF(**final_settings)
crf.fit(X_train, y_train)

In [None]:
# Keep the B- and I- rows of each document type next to each other: sort on
# the label text after its first character, then on that first character.
sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))

# Token-level report and accuracy of the tuned model on the test sequences.
y_pred = crf.predict(X_test)
final_report = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
)
print(final_report)
final_accuracy = metrics.flat_accuracy_score(y_test, y_pred)
print(final_accuracy)

In [None]:
# Drop the first two characters (the "B-"/"I-" prefix) of every tag so that
# predictions and references can be compared on the document type alone.
y_pred_class = [[tag[2:] for tag in sequence] for sequence in y_pred]
y_test_class = [[tag[2:] for tag in sequence] for sequence in y_test]

In [None]:
# Spot check: first predicted vs. first reference document type, prefix removed.
y_pred_class[0][0], y_test_class[0][0]

In [None]:
# Prefix-free document types to report, listed explicitly.
# NOTE: this rebinds `sorted_labels` (previously the B-/I- label list); the
# state-feature printer further down reads this name.
sorted_labels = [
    'acordao_de_2_instancia',
    'agravo_em_recurso_extraordinario',
    'despacho_de_admissibilidade',
    'outros',
    'peticao_do_RE',
    'sentenca',
]

class_report = metrics.flat_classification_report(
    y_test_class, y_pred_class, labels=sorted_labels, digits=3
)
print(class_report)
class_accuracy = metrics.flat_accuracy_score(y_test_class, y_pred_class)
print(class_accuracy)

In [None]:
import os
import pickle

# Persist the fitted CRF. The target directory is created first: opening the
# file for writing fails when "models/" does not exist, which would otherwise
# lose the trained model at the very end of the run.
model_path = "models/crf_{}_{}".format(model, dataset)
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as model_file:
    pickle.dump(crf, model_file)

In [None]:
from collections import Counter

def print_transitions(trans_features):
    """Print one "from -> to weight" line per transition.

    Args:
        trans_features: iterable of ((label_from, label_to), weight) pairs.
    """
    for label_pair, weight in trans_features:
        label_from, label_to = label_pair
        line = "%-6s -> %-7s %0.6f" % (label_from, label_to, weight)
        print(line)

# Rank the learned transition weights: the 20 largest first, then the 20 smallest.
print("Top likely transitions:")
transition_weights = Counter(crf.transition_features_)
print_transitions(transition_weights.most_common(20))

print("\nTop unlikely transitions:")
print_transitions(transition_weights.most_common()[-20:])

In [None]:
def print_state_features(state_features, feature_names=None):
    """Print one "weight label feature" line per state feature.

    Args:
        state_features: iterable of ((attr, label), weight) pairs, where `attr`
            is the feature name (a stringified vector position, given how
            data2feat builds the feature dicts).
        feature_names: optional sequence used to translate a numeric `attr`
            into a readable name. Defaults to the module-level `sorted_labels`.
            NOTE(review): that default presumes feature i corresponds to
            sorted_labels[i] (e.g. the vectors are per-class scores) -- confirm.

    A feature whose name is not numeric, or whose index falls outside
    `feature_names`, is printed under its raw name instead of raising.
    """
    if feature_names is None:
        feature_names = sorted_labels
    for (attr, label), weight in state_features:
        try:
            shown = feature_names[int(attr)]
        except (ValueError, IndexError):
            # No readable name is available for this feature; show it as-is.
            shown = attr
        print("%0.6f %-8s %s" % (weight, label, shown))

# Rank the learned state-feature weights: the 30 largest first, then the 30 smallest.
print("Top positive:")
state_weights = Counter(crf.state_features_)
print_state_features(state_weights.most_common(30))

print("\nTop negative:")
print_state_features(state_weights.most_common()[-30:])