In [1]:
%%html
<style>
/* Float every rendered <table> element to the left of its container. */
table {float:left}
</style>

## Tf-Idf & Multinomial NB using `soynlp` & `soyspacing`

In [2]:
from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from words_preprocessing import *
from file_io import *

## 1. `soynlp`로 토크나이징한 데이터를 이용한 학습
### 기존 모델(`twitter`, 교정X)의 성능

In [None]:
# [[16247   984   280   768   177]
#  [ 1238  7903   553   230   171]
#  [  579   966  3319   145    49]
#  [ 1334   502   116  3178    91]
#  [  474   425   100   138  1486]]

In [None]:
#               precision    recall  f1-score   support

#            0       0.82      0.88      0.85     18456
#            1       0.73      0.78      0.76     10095
#            2       0.76      0.66      0.70      5058
#            3       0.71      0.61      0.66      5221
#            4       0.75      0.57      0.65      2623

#  avg / total       0.77      0.78      0.77     41453

In [3]:
# Read the tokenized train and test corpora from their pickle files, then
# display how many rows each one holds (train first, test second).
# NOTE(review): load_pickle is not defined in this notebook; presumably it
# comes from the `file_io` wildcard import -- confirm. Unpickling can execute
# arbitrary code, so only load files from a trusted source.
train, test = map(
    load_pickle,
    ('../train_space_tokenized.pkl', '../test_space_tokenized.pkl'),
)
tuple(map(len, (train, test)))

(149230, 16580)

In [4]:
# Turn every training row into a whitespace-joined document (from row[0]) and
# its label (row[1]); both lists keep the row order of `train`.
x0 = [' '.join(sample[0]) for sample in train]
y0 = [sample[1] for sample in train]

# Hold out part of the labelled data for validation. The fixed random_state
# keeps the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x0, y0, random_state=1234
)

# Show the four partition sizes as a quick sanity check.
tuple(map(len, (X_train, X_test, y_train, y_test)))

(111922, 37308, 111922, 37308)

In [5]:
# TF-IDF features over 1- to 3-grams (min_df=10 prunes rare terms) feeding a
# Multinomial Naive Bayes classifier with a small smoothing constant.
clf = Pipeline(steps=[
    ('vect', TfidfVectorizer(ngram_range=(1, 3), min_df=10)),
    ('clf', MultinomialNB(alpha=0.001)),
])

# Fit on the training partition; `model` is the name the prediction cells use.
model = clf.fit(X_train, y_train)
model

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
 ...e,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))])

In [6]:
%%time
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Cross validation score: {}".format(scores))

Cross validation score: [ 0.73827392  0.74272951  0.73825054  0.74236061  0.74583389]
CPU times: user 45 s, sys: 1.87 s, total: 46.9 s
Wall time: 47 s


In [7]:
# Predict labels for the held-out validation partition with the fitted pipeline.
y_pred = model.predict(X_test)

In [8]:
# Validation confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))

[[14885   862   186   441    74]
 [ 1583  6991   368   153    95]
 [  759  1107  2571   106    24]
 [ 1709   504   111  2301    51]
 [  699   519    93   113  1003]]


In [9]:
# Per-class precision / recall / F1 and support on the validation partition.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.76      0.90      0.83     16448
          1       0.70      0.76      0.73      9190
          2       0.77      0.56      0.65      4567
          3       0.74      0.49      0.59      4676
          4       0.80      0.41      0.55      2427

avg / total       0.75      0.74      0.73     37308



In [11]:
# Put the separate test set into the same document/label form that was used
# for training, then score it with the already fitted pipeline.
test_data = [' '.join(sample[0]) for sample in test]
test_label = [sample[1] for sample in test]

pred = model.predict(test_data)

In [12]:
# Test-set confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(test_label, pred))

[[6678  375   86  190   32]
 [ 669 3058  185   92   46]
 [ 321  517 1134   52   15]
 [ 755  225   45 1024   21]
 [ 325  216   37   47  435]]


In [13]:
# Per-class precision / recall / F1 and support on the test set.
print(classification_report(test_label, pred))

             precision    recall  f1-score   support

          0       0.76      0.91      0.83      7361
          1       0.70      0.76      0.72      4050
          2       0.76      0.56      0.64      2039
          3       0.73      0.49      0.59      2070
          4       0.79      0.41      0.54      1060

avg / total       0.74      0.74      0.73     16580



### 결과

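- 기존 모델보다 전반적으로 성능이 낮음 (검증 데이터 기준 avg f1-score 0.77 → 0.73)
- 클래스 2·3·4의 precision은 소폭 상승했으나(클래스 0·1은 하락), 같은 클래스들의 recall이 크게 악화됨 (예: 클래스 4 recall 0.57 → 0.41)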

## 2. `soyspacing`으로 띄어쓰기 교정한 데이터를 이용한 학습

In [10]:
# Read the spacing-corrected train and test corpora from their pickle files.
# NOTE(review): load_pickle is not defined in this notebook; presumably it
# comes from the `file_io` wildcard import -- confirm. Unpickling can execute
# arbitrary code, so only load files from a trusted source.
train_spacing, test_spacing = map(
    load_pickle,
    ('../train_space_corrected.pkl', '../test_space_corrected.pkl'),
)

In [15]:
%%time
train_docs = [(tokenize(row[0]), row[1]) for row in train_spacing]
test_docs = [(tokenize(row[0]), row[1]) for row in test_spacing]

CPU times: user 2min 47s, sys: 644 ms, total: 2min 48s
Wall time: 2min 45s


In [16]:
# Turn every tokenized row into a whitespace-joined document (from row[0]) and
# its label (row[1]); both lists keep the row order of `train_docs`.
x0 = [' '.join(sample[0]) for sample in train_docs]
y0 = [sample[1] for sample in train_docs]

# Hold out part of the labelled data for validation. The fixed random_state
# keeps the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x0, y0, random_state=1234
)

# Show the four partition sizes as a quick sanity check.
tuple(map(len, (X_train, X_test, y_train, y_test)))

(111922, 37308, 111922, 37308)

In [17]:
# TF-IDF features over 1- to 3-grams (min_df=10 prunes rare terms) feeding a
# Multinomial Naive Bayes classifier with a small smoothing constant.
clf = Pipeline(steps=[
    ('vect', TfidfVectorizer(ngram_range=(1, 3), min_df=10)),
    ('clf', MultinomialNB(alpha=0.001)),
])

# Fit on the training partition; `model` is the name the prediction cells use.
model = clf.fit(X_train, y_train)
model

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
 ...e,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))])

In [18]:
%%time
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Cross validation score: {}".format(scores))

Cross validation score: [ 0.77387653  0.77109672  0.77202466  0.77533059  0.77532056]
CPU times: user 47.9 s, sys: 1.33 s, total: 49.2 s
Wall time: 49.2 s


In [19]:
# Predict labels for the held-out validation partition with the fitted pipeline.
y_pred = model.predict(X_test)

In [20]:
# Validation confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))

[[14531   849   253   648   167]
 [ 1239  7167   428   209   147]
 [  511   926  2968   119    43]
 [ 1230   444    96  2847    59]
 [  446   417    97   118  1349]]


In [21]:
# Per-class precision / recall / F1 and support on the validation partition.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.81      0.88      0.84     16448
          1       0.73      0.78      0.75      9190
          2       0.77      0.65      0.71      4567
          3       0.72      0.61      0.66      4676
          4       0.76      0.56      0.64      2427

avg / total       0.77      0.77      0.77     37308



### 결과

- 큰 차이는 없으나 기존 모델보다 향상되지는 않음
- 띄어쓰기 교정의 효과를 제대로 보려면 교정 모델을 학습할 때 더 많은 데이터와 더 정교한 처리가 필요할 것으로 보임

In [22]:
# Put the tokenized test set into the same document/label form that was used
# for training, then score it with the already fitted pipeline.
test_data = [' '.join(sample[0]) for sample in test_docs]
test_label = [sample[1] for sample in test_docs]

pred = model.predict(test_data)

In [23]:
# Test-set confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(test_label, pred))

[[6470  394  127  298   72]
 [ 538 3148  186  107   71]
 [ 240  418 1298   50   33]
 [ 547  202   61 1226   34]
 [ 183  168   32   51  626]]


In [24]:
# Per-class precision / recall / F1 and support on the test set.
print(classification_report(test_label, pred))

             precision    recall  f1-score   support

          0       0.81      0.88      0.84      7361
          1       0.73      0.78      0.75      4050
          2       0.76      0.64      0.69      2039
          3       0.71      0.59      0.64      2070
          4       0.75      0.59      0.66      1060

avg / total       0.77      0.77      0.77     16580

