In [1]:
import json
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB



In [2]:
def process_string(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    return [y.strip() for y in string]

In [3]:
with open('pos-data-v3.json','r') as fopen:
    dataset = json.load(fopen)

In [4]:
texts, labels = [], []
for i in dataset:
    try:
        texts.append(process_string(i[0])[0].lower())
        labels.append(i[-1])
    except Exception as e:
        print(e, i)

list index out of range ['%', '%', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ['*', '*', 'SYM']
list index out of range ['뭘봐', '뭘봐', 'PROPN']
list index out of range ['%', '%', 'SYM']
list index out of range ['ひ', 'ひ', 'PROPN']
list index out of range ['ヒ', 'ヒ', 'PROPN']
list index out of range ['形聲', '形聲', 'NOUN']
list index out of range ['°', '°', 'SYM']
list index out of range ['汉', '汉', 'PROPN']
list index out of range ['东', '东', 'PROPN']
list index out of range ['王', '王', 'PROPN']
list index out of range ['（', '（', 'PROPN']
list index out of range ['伊', '伊', 'PROPN']
list index out of range ['）', '）', 'PROPN']
list index out of range ['ȝ', 'ȝ', 'PROPN']
list index out of range ['%', '%', 'SYM']
list index out of range ['°', '°', 'SYM']
list index out of range ['%', '%', 'SYM']
list index out of range ["'", '_', 'PROPN']
list index out of range ['碁', '碁', 'NOUN']
list index out of range ['囲碁', '囲碁', 'NOUN']
list index out of range ['*', '*', 'SYM']
lis

In [5]:
{no:i for no, i in enumerate(np.unique(labels))}

{0: 'ADJ',
 1: 'ADP',
 2: 'ADV',
 3: 'AUX',
 4: 'CCONJ',
 5: 'DET',
 6: 'NOUN',
 7: 'NUM',
 8: 'PART',
 9: 'PRON',
 10: 'PROPN',
 11: 'SCONJ',
 12: 'SYM',
 13: 'VERB',
 14: 'X'}

In [6]:
target = LabelEncoder().fit_transform(labels)
bow_chars = CountVectorizer(ngram_range=(2, 4), analyzer='char').fit(texts)
vectors = bow_chars.transform(texts)
vectors.shape

(103417, 28374)

In [7]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.1)
del vectors

In [8]:
import xgboost as xgb

In [9]:
train_d = xgb.DMatrix(train_X, train_Y)
test_d = xgb.DMatrix(test_X, test_Y)
params_xgd = {
    'min_child_weight': 10.0,
    'max_depth': 14,
    'objective': 'multi:softprob',
    'max_delta_step': 1.8,
    'num_class': len(np.unique(target)),
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 0.65,
    'silent':True,
    'eval_metric': 'mlogloss'
}
model = xgb.train(params_xgd, train_d, 100000, evals=[(test_d, 'validation')], 
                  early_stopping_rounds=100, verbose_eval=50)

[0]	validation-mlogloss:2.41931
Will train until validation-mlogloss hasn't improved in 100 rounds.
[50]	validation-mlogloss:0.766313
[100]	validation-mlogloss:0.643448
[150]	validation-mlogloss:0.599167
[200]	validation-mlogloss:0.575259
[250]	validation-mlogloss:0.55921
[300]	validation-mlogloss:0.548473
[350]	validation-mlogloss:0.540771
[400]	validation-mlogloss:0.534592
[450]	validation-mlogloss:0.529116
[500]	validation-mlogloss:0.524457
[550]	validation-mlogloss:0.520296
[600]	validation-mlogloss:0.516776
[650]	validation-mlogloss:0.514046
[700]	validation-mlogloss:0.511372
[750]	validation-mlogloss:0.509108
[800]	validation-mlogloss:0.506961
[850]	validation-mlogloss:0.504766
[900]	validation-mlogloss:0.502944
[950]	validation-mlogloss:0.50125
[1000]	validation-mlogloss:0.499914
[1050]	validation-mlogloss:0.498169
[1100]	validation-mlogloss:0.497037
[1150]	validation-mlogloss:0.495854
[1200]	validation-mlogloss:0.494597
[1250]	validation-mlogloss:0.493381
[1300]	validation-mlog

In [11]:
from sklearn import metrics
predicted = np.argmax(model.predict(xgb.DMatrix(test_X),ntree_limit=model.best_ntree_limit),axis=1)
print(metrics.classification_report(test_Y, predicted, target_names = np.unique(labels)))

             precision    recall  f1-score   support

        ADJ       0.78      0.66      0.71       462
        ADP       0.94      0.95      0.95      1216
        ADV       0.84      0.85      0.84       460
        AUX       0.99      1.00      0.99        91
      CCONJ       0.99      0.93      0.96       343
        DET       0.94      0.94      0.94       401
       NOUN       0.80      0.82      0.81      2668
        NUM       0.77      0.93      0.84       463
       PART       0.88      0.88      0.88        58
       PRON       0.97      0.95      0.96       492
      PROPN       0.79      0.77      0.78      2330
      SCONJ       0.75      0.75      0.75       142
        SYM       0.73      0.27      0.39        30
       VERB       0.92      0.91      0.92      1182
          X       0.00      0.00      0.00         4

avg / total       0.85      0.85      0.85     10342



  'precision', 'predicted', average, warn_for)


In [12]:
import pickle
with open('xgb-pos.pkl','wb') as fopen:
    pickle.dump(model,fopen)
with open('xgb-bow-pos.pkl','wb') as fopen:
    pickle.dump(bow_chars,fopen)