In [1]:
%%html
<style>
/* Float every rendered table to the left. */
table {
    float: left;
}
</style>

## Tf-Idf & Multinomial NB using `soynlp` & `soyspacing`

In [2]:
from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from words_preprocessing import *
from file_io import *

## 1. `soynlp`로 토크나이징한 데이터를 이용한 학습
### 기존 모델(`twitter`, 교정X)의 성능

In [3]:
# Load the two pickled corpora from the parent directory.
# load_pickle is not defined in this notebook — presumably it comes from the
# file_io star import. NOTE(review): if it wraps pickle.load, only use it on
# files you trust.
train = load_pickle('../train_space_corrected.pickle')
train_tokenized = load_pickle('../train_space_tokenized.pickle')

# Cell result: the length of each corpus, in load order.
tuple(len(corpus) for corpus in (train, train_tokenized))

(165810, 165810)

In [4]:
# Each row of train_tokenized is read as row[0] (a sequence of string tokens)
# and row[1] (the label). Tokens are joined with single spaces so every
# document is one string.
x0 = [' '.join(row[0]) for row in train_tokenized]
y0 = [row[1] for row in train_tokenized]

# random_state pins the shuffle so the split is reproducible. No test_size is
# passed, so scikit-learn's default hold-out fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    x0,
    y0,
    random_state=1234,
)

# Cell result: sizes of the four splits.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(124357, 41453, 124357, 41453)

In [5]:
# Two-step pipeline: TF-IDF features over word 1- to 3-grams (min_df=10 drops
# n-grams found in fewer than 10 documents), then multinomial naive Bayes with
# alpha=0.001 smoothing.
# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of two
# or more word characters, so one-character tokens in the pre-tokenized text are
# discarded — confirm this is intended.
clf = Pipeline(steps=[
    ('vect', TfidfVectorizer(ngram_range=(1, 3), min_df=10)),
    ('clf', MultinomialNB(alpha=0.001)),
])

# fit() returns the pipeline it was called on, so `model` is the fitted `clf`.
model = clf.fit(X_train, y_train)
model

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
 ...e,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))])

In [6]:
%%time
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Cross validation score: {}".format(scores))

Cross validation score: [ 0.74575862  0.74095368  0.7421897   0.74326498  0.74346602]
CPU times: user 50.9 s, sys: 1.92 s, total: 52.8 s
Wall time: 52.8 s


In [7]:
# Predict a label for every held-out document; y_pred feeds the evaluation cells below.
y_pred = model.predict(X_test)

In [8]:
# scikit-learn convention: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))

[[16729   929   220   491    87]
 [ 1648  7713   451   179   104]
 [  866  1212  2824   117    39]
 [ 1807   632   117  2601    64]
 [  794   551    82   114  1082]]


In [9]:
# Per-class precision, recall, F1 and support for the held-out predictions.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.77      0.91      0.83     18456
          1       0.70      0.76      0.73     10095
          2       0.76      0.56      0.65      5058
          3       0.74      0.50      0.60      5221
          4       0.79      0.41      0.54      2623

avg / total       0.75      0.75      0.74     41453



### 결과

- 기존 모델(`twitter`, 교정X)보다 더 안 좋은 성능
- precision은 상승했으나 recall은 많이 악화됨

## 2. `soyspacing`으로 띄어쓰기 교정한 데이터를 이용한 학습

In [10]:
%%time
# Run tokenize() on the first field of every row of `train` and carry the second
# field along unchanged (the next cell reads it as the label).
# tokenize is not defined in this notebook — presumably it comes from the
# words_preprocessing star import.
train_docs = [(tokenize(row[0]), row[1]) for row in train]

CPU times: user 3min, sys: 884 ms, total: 3min 1s
Wall time: 2min 47s


In [11]:
# Each entry of train_docs is a (tokens, label) pair. Tokens are joined with
# single spaces so every document is one string.
# Note: this rebinds x0, y0 and the four split variables used by earlier cells.
x0 = [' '.join(row[0]) for row in train_docs]
y0 = [row[1] for row in train_docs]

# random_state pins the shuffle so the split is reproducible. No test_size is
# passed, so scikit-learn's default hold-out fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    x0,
    y0,
    random_state=1234,
)

# Cell result: sizes of the four splits.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(124357, 41453, 124357, 41453)

In [12]:
# Two-step pipeline: TF-IDF features over word 1- to 3-grams (min_df=10 drops
# n-grams found in fewer than 10 documents), then multinomial naive Bayes with
# alpha=0.001 smoothing.
# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of two
# or more word characters, so one-character tokens produced by tokenize() are
# discarded — confirm this is intended.
clf = Pipeline(steps=[
    ('vect', TfidfVectorizer(ngram_range=(1, 3), min_df=10)),
    ('clf', MultinomialNB(alpha=0.001)),
])

# fit() returns the pipeline it was called on, so `model` is the fitted `clf`.
model = clf.fit(X_train, y_train)
model

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
 ...e,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))])

In [13]:
%%time
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Cross validation score: {}".format(scores))

Cross validation score: [ 0.77824234  0.76865552  0.77576294  0.77651789  0.76928026]
CPU times: user 54 s, sys: 1.86 s, total: 55.9 s
Wall time: 56 s


In [14]:
# Predict a label for every held-out document; y_pred feeds the evaluation cells below.
y_pred = model.predict(X_test)

In [15]:
# scikit-learn convention: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))

[[16259   983   277   759   178]
 [ 1230  7912   554   229   170]
 [  579   957  3333   141    48]
 [ 1339   506   118  3168    90]
 [  483   426   100   140  1474]]


In [16]:
# Per-class precision, recall, F1 and support for the held-out predictions.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.82      0.88      0.85     18456
          1       0.73      0.78      0.76     10095
          2       0.76      0.66      0.71      5058
          3       0.71      0.61      0.66      5221
          4       0.75      0.56      0.64      2623

avg / total       0.77      0.78      0.77     41453



### 결과

- 큰 차이는 없으나 기존 모델보다 향상되지는 않음
- 띄어쓰기 교정이 좀 더 제대로 이루어지기 위해서는 띄어쓰기 교정 학습 시 데이터가 더 많고, 더 정교한 처리가 필요할 것으로 보임