In [1]:
import json
import re
import numpy as np

In [2]:
def process_string(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    return [y.strip() for y in string]

def to_title(string):
    if string.isupper():
        string = string.title()
    return string

In [3]:
with open('pos-data-v3.json','r') as fopen:
    dataset = json.load(fopen)

In [4]:
texts, labels = [], []
for i in dataset:
    try:
        texts.append(process_string(i[0])[0].lower())
        labels.append(i[-1])
    except Exception as e:
        pass

In [5]:
seq_len = 50
def iter_seq(x):
    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

In [6]:
from nltk.tag.util import untag

def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'prev_word-prefix-1': '' if index == 0 else sentence[index - 1][0],
        'prev_word-prefix-2': '' if index == 0 else sentence[index - 1][:2],
        'prev_word-prefix-3': '' if index == 0 else sentence[index - 1][:3],
        'prev_word-suffix-1': '' if index == 0 else sentence[index - 1][-1],
        'prev_word-suffix-2': '' if index == 0 else sentence[index - 1][-2:],
        'prev_word-suffix-3': '' if index == 0 else sentence[index - 1][-3:],
        'next_word-prefix-1': '' if index == len(sentence) - 1 else sentence[index + 1][0],
        'next_word-prefix-2': '' if index == len(sentence) - 1 else sentence[index + 1][:2],
        'next_word-prefix-3': '' if index == len(sentence) - 1 else sentence[index + 1][:3],
        'next_word-suffix-1': '' if index == len(sentence) - 1 else sentence[index + 1][-1],
        'next_word-suffix-2': '' if index == len(sentence) - 1 else sentence[index + 1][-2:],
        'next_word-suffix-3': '' if index == len(sentence) - 1 else sentence[index + 1][-3:],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
    }

def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        X.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
 
    return X, y

In [7]:
combined = list(map(lambda X: (X[0],X[1]), list(zip(texts, labels))))

In [8]:
combined_seq = to_train_seq(combined)[0]
combined_seq.shape

(103367, 50, 2)

In [9]:
X, Y = transform_to_dataset(combined_seq)

from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.2)

In [10]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [11]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(train_X, train_Y)

CPU times: user 32min 22s, sys: 45.6 s, total: 33min 8s
Wall time: 33min 3s


In [12]:
labels = list(crf.classes_)
labels.remove('X')
labels

['PRON',
 'DET',
 'NOUN',
 'VERB',
 'PROPN',
 'ADP',
 'ADV',
 'CCONJ',
 'NUM',
 'ADJ',
 'PART',
 'AUX',
 'SCONJ',
 'SYM']

In [13]:
y_pred = crf.predict(test_X)
metrics.flat_f1_score(test_Y, y_pred,
                      average='weighted', labels = labels)

0.9917356367386992

In [14]:
print(metrics.flat_classification_report(
    test_Y, y_pred, labels=labels, digits=3
))

              precision    recall  f1-score   support

        PRON      0.998     0.997     0.998     47911
         DET      0.990     0.993     0.991     39932
        NOUN      0.988     0.988     0.988    270045
        VERB      0.997     0.997     0.997    122015
       PROPN      0.989     0.988     0.988    225893
         ADP      0.997     0.998     0.997    120358
         ADV      0.992     0.991     0.991     47753
       CCONJ      0.997     0.998     0.997     36696
         NUM      0.993     0.995     0.994     43748
         ADJ      0.985     0.988     0.986     45244
        PART      0.992     0.995     0.993      5975
         AUX      1.000     1.000     1.000     10505
       SCONJ      0.994     0.987     0.990     14798
         SYM      0.998     0.997     0.998      2483

   micro avg      0.992     0.992     0.992   1033356
   macro avg      0.994     0.994     0.994   1033356
weighted avg      0.992     0.992     0.992   1033356



In [15]:
import pickle

with open('crf-pos.pkl','wb') as fopen:
    pickle.dump(crf,fopen)