In [1]:
%%html
<style>
table {float:left}
</style>

## Tf-Idf & Multinomial NB (scikit-learn)

- tokenizing/POS tagging --> tf-idf vectorizing
- model: Multinomial Naive Bayes
- train/test: 148861/16539 (라벨링 이후 필터링)

### Modeling

- Without Oversampling
- With Oversampling

### Results

|                      | Accuracy | Recall | F1-Score |
|----------------------|:--------:|:------:|:--------:|
| Without Oversampling |   0.77   |  0.77  |   0.77   |
| With Oversampling    |   0.86   |  0.86  |   0.86   |

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from utils import *
from pprint import pprint
from collections import Counter

In [3]:
# Load the labeled training rows, one file per emotion.
train_joy = read_data('../train_data_labeled_joy.txt')
train_anger = read_data('../train_data_labeled_anger.txt')
train_disgust = read_data('../train_data_labeled_disgust.txt')
train_sadness = read_data('../train_data_labeled_sadness.txt')
train_fear = read_data('../train_data_labeled_fear.txt')

# Korean label names, listed in the same order as the datasets.
trains = [train_joy, train_anger, train_disgust, train_sadness, train_fear]
labels = ['기쁘다', '화나다', '역겹다', '슬프다', '무섭다']

# Number of rows per label.
num_dic = {label: len(data) for label, data in zip(labels, trains)}
label_count = Counter(num_dic)
print(label_count)
print()

# Share of the whole corpus held by each label, rounded to two decimals.
sum_ = sum(label_count.values())
for label, count in label_count.items():
    print(label, ":", round(count / sum_, 2))

Counter({'기쁘다': 73550, '화나다': 40242, '슬프다': 20701, '역겹다': 20325, '무섭다': 10582})

기쁘다 : 0.44
화나다 : 0.24
역겹다 : 0.12
슬프다 : 0.13
무섭다 : 0.06


- '기쁘다'의 데이터양은 44%인데 비해 '무섭다'는 6% --> 오버샘플링 시도

In [4]:
import random
def pop_test(data, length, rng=None):
    """Remove ``length`` randomly chosen items from ``data`` and return them.

    ``data`` is modified in place: the chosen items are deleted from it and the
    items that stay keep their original relative order.

    Args:
        data: Mutable list to draw from.
        length: Number of items to remove. Zero or a negative number removes
            nothing and returns an empty list.
        rng: Optional ``random.Random`` instance to draw from, for a
            reproducible split. Defaults to the module-level ``random`` state.

    Returns:
        list: The removed items, in the order they were drawn.

    Raises:
        ValueError: If ``length`` exceeds ``len(data)``. The check runs before
            anything is removed, so ``data`` is left untouched.
    """
    if length <= 0:
        return []
    if length > len(data):
        raise ValueError(
            "cannot pop {} items from a list of {}".format(length, len(data)))

    source = random if rng is None else rng
    # Draw every index up front and rebuild the list once, instead of paying
    # for a list.pop() shift on each individual draw.
    chosen = source.sample(range(len(data)), length)
    result = [data[i] for i in chosen]
    dropped = set(chosen)
    data[:] = [item for i, item in enumerate(data) if i not in dropped]
    return result

In [5]:
# Hold out 10% of every emotion as a test set. pop_test removes the drawn rows
# from the train_* lists, so those lists shrink to the remaining 90%.
# NOTE: re-running this cell without reloading the data removes another 10%.
#
# Seed the stdlib RNG first so the same rows are held out on every fresh run
# (1234 matches the random_state used for the sklearn splits in this notebook).
random.seed(1234)
test_joy = pop_test(train_joy, len(train_joy)//10)
test_anger = pop_test(train_anger, len(train_anger)//10)
test_disgust = pop_test(train_disgust, len(train_disgust)//10)
test_sadness = pop_test(train_sadness, len(train_sadness)//10)
test_fear = pop_test(train_fear, len(train_fear)//10)

In [6]:
# Pair every remaining training row with its integer class id:
# 0 = joy, 1 = anger, 2 = disgust, 3 = sadness, 4 = fear.
train_joy_labeled = list(zip(train_joy, [0] * len(train_joy)))
train_anger_labeled = list(zip(train_anger, [1] * len(train_anger)))
train_disgust_labeled = list(zip(train_disgust, [2] * len(train_disgust)))
train_sadness_labeled = list(zip(train_sadness, [3] * len(train_sadness)))
train_fear_labeled = list(zip(train_fear, [4] * len(train_fear)))

In [7]:
# Pair every held-out test row with the same class ids used for training:
# 0 = joy, 1 = anger, 2 = disgust, 3 = sadness, 4 = fear.
test_joy_labeled = list(zip(test_joy, [0] * len(test_joy)))
test_anger_labeled = list(zip(test_anger, [1] * len(test_anger)))
test_disgust_labeled = list(zip(test_disgust, [2] * len(test_disgust)))
test_sadness_labeled = list(zip(test_sadness, [3] * len(test_sadness)))
test_fear_labeled = list(zip(test_fear, [4] * len(test_fear)))

In [8]:
# Group the per-emotion labeled lists, ordered by class id (0..4).
train_lst = [
    train_joy_labeled,
    train_anger_labeled,
    train_disgust_labeled,
    train_sadness_labeled,
    train_fear_labeled,
]
test_lst = [
    test_joy_labeled,
    test_anger_labeled,
    test_disgust_labeled,
    test_sadness_labeled,
    test_fear_labeled,
]

In [9]:
# Flatten the per-emotion lists into one training list and one test list,
# keeping the emotion order of train_lst / test_lst.
train = [pair for group in train_lst for pair in group]
test = [pair for group in test_lst for pair in group]

print(len(train), len(test))

148861 16539


In [10]:
# Save the labeled (row, class id) lists with save_pickle
# (helper presumably provided by `from utils import *` — confirm).
for pickle_path, rows in (('../train_labeled_0503.pickle', train),
                          ('../test_labeled_0503.pickle', test)):
    save_pickle(pickle_path, rows)

In [11]:
%%time
# Tokenize the first field of every row and keep its class id alongside,
# producing (tokens, class id) pairs.
train_docs = [(tokenize(fields[0]), label) for fields, label in train]
test_docs = [(tokenize(fields[0]), label) for fields, label in test]

CPU times: user 3min 8s, sys: 1.13 s, total: 3min 10s
Wall time: 2min 58s


In [12]:
# Save the tokenized documents so the slow tokenizing cell need not be re-run
# (save_pickle is presumably provided by `from utils import *` — confirm).
for pickle_path, docs in (('../train_docs_0503.pickle', train_docs),
                          ('../test_docs_0503.pickle', test_docs)):
    save_pickle(pickle_path, docs)

In [13]:
# Split the (tokens, class id) pairs into parallel lists. Tokens are joined
# with single spaces so each document is one string for the vectorizer.
x0 = [' '.join(tokens) for tokens, _label in train_docs]
y0 = [label for _tokens, label in train_docs]

# Same layout for the held-out test documents.
test_data = [' '.join(tokens) for tokens, _label in test_docs]
test_label = [label for _tokens, label in test_docs]

In [14]:
# Carve a validation split out of the training documents with a fixed seed.
# No test_size is given, so train_test_split's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(x0, y0, random_state=1234)

# Sizes of the four pieces, in the order they were returned.
tuple(map(len, (X_train, X_test, y_train, y_test)))

(111645, 37216, 111645, 37216)

In [15]:
# TF-IDF over 1- to 3-grams with min_df=10, feeding a Multinomial Naive Bayes
# classifier with a small smoothing constant (alpha=0.001).
tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=10)
naive_bayes = MultinomialNB(alpha=0.001)
clf = Pipeline([('vect', tfidf), ('clf', naive_bayes)])

# Fit on the training split; the fitted pipeline is shown as the cell output.
model = clf.fit(X_train, y_train)
model

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
 ...e,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))])

In [16]:
%%time
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the pipeline on the training split.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
message = "Cross validation score: {}".format(scores)
print(message)

Cross validation score: [ 0.77251355  0.77277206  0.77622033  0.76920321  0.77453308]
CPU times: user 48.2 s, sys: 1.58 s, total: 49.8 s
Wall time: 49.8 s


In [17]:
# Predict class ids for the validation split produced by train_test_split.
y_pred = model.predict(X_test)

In [18]:
# Confusion matrix on the validation split: one row per true class id (0..4),
# one column per predicted class id.
print(confusion_matrix(y_test, y_pred))

[[14577   861   248   713   158]
 [ 1119  7066   433   226   149]
 [  533   909  3015   158    52]
 [ 1176   450    98  2837    77]
 [  437   428    84    99  1313]]


In [19]:
# Per-class precision / recall / F1 on the validation split.
# Class ids: 0 = joy, 1 = anger, 2 = disgust, 3 = sadness, 4 = fear.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.82      0.88      0.85     16557
          1       0.73      0.79      0.76      8993
          2       0.78      0.65      0.71      4667
          3       0.70      0.61      0.65      4638
          4       0.75      0.56      0.64      2361

avg / total       0.77      0.77      0.77     37216



**05/03**
- '기쁘다' 필터링 이후, '슬프다' precision 약간 감소 (0.72 --> 0.70)
    - 긍/부정 동시 등장 제거
    - 반어법 제거
- '무섭다' precision 0.01 감소, recall 0.02 상승

### 학습에 사용되지 않은 데이터로 Prediction
- 단 데이터 자체는 train과 같이 가공되었음

In [20]:
# Predict on the 10% hold-out documents that were removed before training.
pred = model.predict(test_data)

In [21]:
# Confusion matrix on the hold-out set (rows: true class id, columns: predicted).
print(confusion_matrix(test_label, pred))

[[6497  379  105  294   80]
 [ 500 3151  208  103   62]
 [ 229  398 1312   71   22]
 [ 519  191   44 1282   34]
 [ 167  208   31   60  592]]


In [22]:
# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(test_label, pred))

             precision    recall  f1-score   support

          0       0.82      0.88      0.85      7355
          1       0.73      0.78      0.75      4024
          2       0.77      0.65      0.70      2032
          3       0.71      0.62      0.66      2070
          4       0.75      0.56      0.64      1058

avg / total       0.77      0.78      0.77     16539



### 오버샘플링 이후 학습

In [23]:
from imblearn.over_sampling import *

In [24]:
# Randomly oversample the minority classes of the full training documents.
#
# NOTE(review): oversampling runs BEFORE the train/validation split in the next
# cells, so copies of the same minority document can land on both sides of that
# split and inflate the validation scores. The hold-out set (test_data) is not
# affected; prefer it when judging this model.
#
# NOTE(review): `np` is not imported explicitly in this notebook's import cell;
# presumably it arrives via `from utils import *` — confirm.

# The sampler is given a 2-D array, so each document string becomes a one-column row.
x = np.array(x0).reshape(-1, 1)

sampler = RandomOverSampler(random_state=1234)
# imbalanced-learn renamed `fit_sample` to `fit_resample` and later removed the
# old name; use whichever this installation provides.
resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
X, y = resample(x, y0)

# Unwrap the one-column rows back into a flat list of document strings.
X = [row[0] for row in X.tolist()]

In [25]:
# Number of documents and labels after oversampling (the two must match).
len(X), len(y)

(330975, 330975)

In [26]:
# Rows per class id after oversampling.
Counter(y)

Counter({0: 66195, 1: 66195, 2: 66195, 3: 66195, 4: 66195})

In [27]:
# Split the oversampled documents into training and validation parts with the
# same seed as the first experiment.
# NOTE(review): X and y were oversampled before this split, so duplicated
# documents may appear in both X_train2 and X_test2.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=1234)

# Sizes of the four pieces, in the order they were returned.
tuple(map(len, (X_train2, X_test2, y_train2, y_test2)))

(248231, 82744, 248231, 82744)

In [28]:
# Same TF-IDF (1- to 3-grams, min_df=10) + Multinomial NB (alpha=0.001)
# pipeline as the first experiment, built fresh for the oversampled data.
tfidf2 = TfidfVectorizer(ngram_range=(1, 3), min_df=10)
naive_bayes2 = MultinomialNB(alpha=0.001)
clf2 = Pipeline([('vect', tfidf2), ('clf', naive_bayes2)])

# Fit on the oversampled training split; the fitted pipeline is the cell output.
model2 = clf2.fit(X_train2, y_train2)
model2

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
 ...e,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))])

In [29]:
%%time
# 5-fold cross-validation of the pipeline on the oversampled training split.
scores2 = cross_val_score(estimator=clf2, X=X_train2, y=y_train2, cv=5)
message2 = "Cross validation score: {}".format(scores2)
print(message2)

Cross validation score: [ 0.84438447  0.84389792  0.84254839  0.84556661  0.84448966]
CPU times: user 1min 43s, sys: 3.23 s, total: 1min 46s
Wall time: 1min 46s


In [30]:
# Predict class ids for the validation part of the oversampled data.
y_pred2 = model2.predict(X_test2)

In [31]:
# Confusion matrix on the oversampled validation split
# (rows: true class id, columns: predicted class id).
print(confusion_matrix(y_test2, y_pred2))

[[12677  1062   620  1598   470]
 [ 1137 13315  1051   530   463]
 [  486   895 14665   300   185]
 [  958   681   256 14496   238]
 [  289   313   133   195 15731]]


In [32]:
# Per-class precision / recall / F1 on the oversampled validation split.
# NOTE(review): this split was made after oversampling, so these scores may be
# optimistic; compare with the hold-out report further down.
print(classification_report(y_test2, y_pred2))

             precision    recall  f1-score   support

          0       0.82      0.77      0.79     16427
          1       0.82      0.81      0.81     16496
          2       0.88      0.89      0.88     16531
          3       0.85      0.87      0.86     16629
          4       0.92      0.94      0.93     16661

avg / total       0.86      0.86      0.86     82744



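- 오버샘플링 이후 recall 상승
- precision/recall 크게 상승 (특히 역겹다/슬프다/무섭다)
- 단, 위 수치는 오버샘플링을 먼저 한 뒤 train/test로 분할한 결과이므로, 복제된 동일 문장이 train과 test 양쪽에 포함되어 과대평가되었을 수 있음 --> 아래의 학습에 사용되지 않은 데이터(test_data) 기준으로는 avg f1이 0.86이 아닌 0.73 (오버샘플링 전 0.77)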

In [36]:
# Predict on the 10% hold-out documents with the model trained on oversampled data.
pred2 = model2.predict(test_data)

In [37]:
# Confusion matrix on the hold-out set for the oversampled model
# (rows: true class id, columns: predicted class id).
print(confusion_matrix(test_label, pred2))

[[5650  496  304  662  243]
 [ 381 2916  390  186  151]
 [ 142  322 1411  100   57]
 [ 313  162   74 1436   85]
 [ 107  158   57   66  670]]


In [38]:
# Per-class precision / recall / F1 on the hold-out set for the oversampled model.
print(classification_report(test_label, pred2))

             precision    recall  f1-score   support

          0       0.86      0.77      0.81      7355
          1       0.72      0.72      0.72      4024
          2       0.63      0.69      0.66      2032
          3       0.59      0.69      0.64      2070
          4       0.56      0.63      0.59      1058

avg / total       0.74      0.73      0.73     16539



In [39]:
# Save both fitted pipelines: the plain model and the one trained on
# oversampled data (the `_ov` file).
for pickle_path, fitted in (('../model/mnb_0503.pickle', model),
                            ('../model/mnb_0503_ov.pickle', model2)):
    save_pickle(pickle_path, fitted)