# 질문 의도 파악 - 선형 모델

In [1]:
import numpy as np
import pandas as pd
import pickle
import re
import time
from konlpy.tag import Mecab

### 1. 데이터 탐색

In [2]:
def _load_pickle(path):
    """Open the file at ``path`` in binary mode and return the unpickled object."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Feature matrices (file names indicate TF-IDF features) for the train / test split.
X_train = _load_pickle("./questions/questions_pickle/X3_train_tfidf.pickle")
X_test = _load_pickle("./questions/questions_pickle/X3_test_tfidf.pickle")

# Vocabulary objects stored alongside the features (forward and reversed).
tfidf_vocab = _load_pickle("./questions/questions_pickle/tfidf_vocab_3.pickle")
tfidf_reversed_vocab = _load_pickle("./questions/questions_pickle/tfidf_reversed_vocab_3.pickle")

# Label matrices for the train / test split.
y_train = _load_pickle("./questions/questions_pickle/y3_train.pickle")
y_test = _load_pickle("./questions/questions_pickle/y3_test.pickle")

# Label binarizer and TF-IDF vectorizer objects reused later in the notebook.
mlb = _load_pickle("./questions/questions_pickle/mlb_3.pickle")
tfidf_vectorizer = _load_pickle("./questions/questions_pickle/tfidf_vectorizer_3.pickle")

In [3]:
# Show the dimensions of the train and test feature matrices.
X_train.shape, X_test.shape

((2707430, 117098), (300826, 117098))

In [4]:
# Show the dimensions of the train and test label matrices.
y_train.shape, y_test.shape

((2707430, 15), (300826, 15))

In [5]:
# Show the tag names held by the unpickled label binarizer.
mlb.classes_

array(['asian', 'atmosphere', 'chicken', 'chinese', 'etc', 'japanese',
       'korean', 'prefer', 'price', 'questionLOC', 'questionREV',
       'recommendation', 'sanitation', 'taste', 'western'], dtype=object)

In [6]:
# Show the configuration of the unpickled TF-IDF vectorizer.
tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=5, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(\\S+)', tokenizer=None,
                use_idf=True, vocabulary=None)

### 2. MultiLabel classifier

Classifier를 훈련시키는 함수 *train_classifier* 를 정의한다. 여기서는 *sklearn*의 One-vs-Rest approach 기법을 사용한다.<br>
[OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) 

이 방법은 tag마다 하나씩, 총 *k* (= number of tags) 개의 이진 classifier를 훈련시켜 각 tag의 해당 여부를 독립적으로 예측한다. 따라서 한 문장에 여러 tag가 동시에 붙을 수 있다.

기본적인 분류 방법으로는 [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) 을 사용한다. 

가장 간단한 방법이지만, 텍스트 분류 작업에서 좋은 성능을 낸다. 분류할 tag가 많을수록 시간이 많이 걸린다.

In [7]:
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [8]:
def train_classifier(X_train, y_train, C=10, max_iter=1000, penalty='l2'):
    """
    Fit a one-vs-rest logistic-regression classifier on the training data.

    The LogisticRegression estimator is wrapped in OneVsRestClassifier and
    fitted on ``X_train`` / ``y_train``.

    Args:
        X_train: training features.
        y_train: training labels.
        C: value forwarded to ``LogisticRegression(C=...)``. Defaults to 10,
            the value previously hard-coded here.
        max_iter: value forwarded to ``LogisticRegression(max_iter=...)``.
            Defaults to 1000, the value previously hard-coded here.
        penalty: value forwarded to ``LogisticRegression(penalty=...)``.
            Defaults to 'l2' (L2 regularization), as before.

    Returns:
        The fitted OneVsRestClassifier.
    """
    base_estimator = LogisticRegression(penalty=penalty, C=C, max_iter=max_iter)
    return OneVsRestClassifier(base_estimator).fit(X_train, y_train)

In [9]:
%%time
# Fit the one-vs-rest logistic regression on the training data loaded above.
classifier_tfidf = train_classifier(X_train, y_train)



CPU times: user 16min 5s, sys: 17.9 s, total: 16min 23s
Wall time: 4min 46s


In [11]:
# Show the configuration of the fitted classifier.
classifier_tfidf

OneVsRestClassifier(estimator=LogisticRegression(C=10, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [12]:
# Persist the fitted classifier to disk with pickle.
import pickle

with open('classifier_3.pickle', 'wb') as out_file:
    pickle.dump(classifier_tfidf, out_file, protocol=pickle.HIGHEST_PROTOCOL)

이제 test 데이터의 태그를 예측할 수 있다: labels, scores

In [13]:
# Raw per-tag decision scores and hard label predictions for the test set.
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
y_test_predicted_labels = classifier_tfidf.predict(X_test)

In [14]:
# decision_function(X_test) yields one score per (sample, tag) pair —
# presumably the signed distance to each tag's decision boundary; verify against the sklearn docs.
y_test_predicted_scores[:5]

array([[-11.51632184,  -5.72335057, -10.41849055, -10.12490374,
        -10.23538811,  -9.86654374,   7.95745669,  -5.79950487,
         -8.48318906, -12.94223794, -15.27791634,  13.96796603,
         -9.1088145 ,  -4.82551007,  -9.8319475 ],
       [ -8.4446771 ,   7.55543413, -10.30753162,  -9.24274293,
         -9.37429289,  -9.56670793,  -2.35246915,  -9.17311428,
        -10.09039386, -12.48855505, -14.86983245,  12.68289241,
        -10.72467441,  -8.92971706,  -8.49515868],
       [ -8.46001039,  -7.48120195, -10.79175642,  -9.83629896,
         -9.6278331 ,  -9.92713223,  -4.56343097,  -9.1604818 ,
          4.67823663,  -7.7524683 ,  -8.68709698,   7.2785256 ,
        -10.3823445 ,  -6.56162168,  -8.69648641],
       [ -6.95496032,  10.8867102 ,  -9.97197265,  -9.1532826 ,
          5.28878375,  -7.51167986,  -7.53973788, -11.6805577 ,
        -10.38519662,  -9.39107002, -10.74567851,   9.74876102,
        -10.53339617,  -9.08288785,  -8.26387002],
       [ -9.83396312,  -9.29

In [17]:
# Binary indicator (vectorized) representation of the tags predicted from the TF-IDF features
y_test_predicted_labels[:5]

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])

In [18]:
# Convert the binary indicator rows back into tuples of tag names.
# The binarizer was unpickled as `mlb`; the previously used name `lb` is not
# defined in the visible cells and raised NameError.
y_test_pred_inversed = mlb.inverse_transform(y_test_predicted_labels)  # predicted tags
y_test_inversed = mlb.inverse_transform(y_test)  # ground-truth tags
y_test_pred_inversed, y_test_inversed

([('korean', 'recommendation'),
  ('atmosphere', 'recommendation'),
  ('price', 'recommendation'),
  ('atmosphere', 'etc', 'recommendation'),
  ('questionLOC',),
  ('questionREV',),
  ('questionREV',),
  ('korean', 'recommendation'),
  ('questionREV',),
  ('questionLOC',),
  ('questionREV',),
  ('questionLOC',),
  ('questionREV',),
  ('questionLOC',),
  ('questionREV',),
  ('questionLOC',),
  ('questionLOC',),
  ('questionREV',),
  ('questionLOC',),
  ('questionREV',),
  ('korean', 'recommendation'),
  ('questionREV',),
  ('questionLOC',),
  ('korean', 'prefer', 'recommendation'),
  ('recommendation', 'taste'),
  ('questionLOC',),
  ('questionREV',),
  ('questionREV',),
  ('questionLOC',),
  ('questionREV',),
  ('questionLOC',),
  ('questionREV',),
  ('questionLOC',),
  ('questionREV',),
  ('questionLOC',),
  ('questionLOC',),
  ('atmosphere', 'japanese', 'recommendation'),
  ('questionLOC',),
  ('korean', 'recommendation', 'taste'),
  ('atmosphere', 'recommendation', 'western'),
  ('q

### 3. Evaluation

모델 평가를 위해 사용할 classification metrics:
 - [Accuracy](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
 - [F1-score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)
 - [Area under ROC-curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)
 - [Area under precision-recall curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score) 
 
위 문서에서 micro/macro/weighted averaging 이 무엇인지 잘 살펴보자.

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

*print_evaluation_scores* 를 정의하여 정확도를 평가해본다.
 - *accuracy*
 - *F1-score macro/micro/weighted*
 - *Average precision macro/micro/weighted* (`average_precision_score`; 출력에는 'Precision'으로 표시된다)

In [20]:
def print_evaluation_scores(y_test, predicted):
    """
    Print accuracy, F1 and average-precision figures for a set of predictions.

    Args:
        y_test: ground-truth labels.
        predicted: predictions compared against ``y_test``.

    Output order: accuracy_score, then f1_score and average_precision_score,
    each with 'macro', 'micro' and 'weighted' averaging. Nothing is returned.

    NOTE(review): the lines labelled 'Precision ...' are computed with
    average_precision_score, not precision_score — confirm this is intended.
    """
    averaging_modes = ('macro', 'micro', 'weighted')

    print('accuracy_score :', accuracy_score(y_test, predicted))

    # All three F1 values are computed before any of them is printed.
    f1_by_mode = {
        mode: f1_score(y_test, predicted, average=mode)
        for mode in averaging_modes
    }
    for mode in averaging_modes:
        print('F1-score {} :'.format(mode), f1_by_mode[mode])

    for mode in averaging_modes:
        print('Precision {} :'.format(mode),
              average_precision_score(y_test, predicted, average=mode))

In [21]:
# Report the evaluation metrics for the test-set label predictions.
print('Tfidf')
print_evaluation_scores(y_test, y_test_predicted_labels)

Tfidf
accuracy_score : 0.9961206810581532
F1-score macro : 0.9961604780983542
F1-score micro : 0.9987012625869901
F1-score weighted : 0.9986890121134869
Precision macro : 0.9927970757942873
Precision micro : 0.9976448588309706
Precision weighted : 0.9977954461537799


accuracy(문장의 모든 태그가 정확히 일치한 비율)가 약 99.6%이다.

##### LinearSVC 모델
- C값 변화 : 0.1 / 1 / 10 / 100 

In [22]:
from sklearn.svm import LinearSVC

# One-vs-rest LinearSVC; C is varied from cell to cell (here C = 0.1).
classifier_tfidf = OneVsRestClassifier(LinearSVC(C = 0.1)).fit(X_train, y_train)

y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
# Map indicator rows back to tag names with the unpickled binarizer `mlb`
# (`lb` is not defined in the visible cells and raised NameError).
y_test_pred_inversed = mlb.inverse_transform(y_test_predicted_labels)
y_test_inversed = mlb.inverse_transform(y_test)

print_evaluation_scores(y_test, y_test_predicted_labels)

accuracy_score : 0.9955555703296922
F1-score macro : 0.9958888422691324
F1-score micro : 0.9984850653811178
F1-score weighted : 0.9984724200837146
Precision macro : 0.9922722585003177
Precision micro : 0.9972300327553372
Precision weighted : 0.9974015314786069


In [23]:
from sklearn.svm import LinearSVC

# One-vs-rest LinearSVC; C is varied from cell to cell (here C = 1).
classifier_tfidf = OneVsRestClassifier(LinearSVC(C = 1)).fit(X_train, y_train)

y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
# Map indicator rows back to tag names with the unpickled binarizer `mlb`
# (`lb` is not defined in the visible cells and raised NameError).
y_test_pred_inversed = mlb.inverse_transform(y_test_predicted_labels)
y_test_inversed = mlb.inverse_transform(y_test)

print_evaluation_scores(y_test, y_test_predicted_labels)

accuracy_score : 0.9962137581193115
F1-score macro : 0.9963016740723758
F1-score micro : 0.9987434934589565
F1-score weighted : 0.9987311871148653
Precision macro : 0.9930741972663554
Precision micro : 0.9977243315440898
Precision weighted : 0.99787188851028


In [32]:
from sklearn.svm import LinearSVC

# One-vs-rest LinearSVC; C is varied from cell to cell (here C = 10).
classifier_tfidf = OneVsRestClassifier(LinearSVC(C = 10)).fit(X_train, y_train)

y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
# Map indicator rows back to tag names with the unpickled binarizer `mlb`
# (`lb` is not defined in the visible cells and raised NameError).
y_test_pred_inversed = mlb.inverse_transform(y_test_predicted_labels)
y_test_inversed = mlb.inverse_transform(y_test)

print_evaluation_scores(y_test, y_test_predicted_labels)

accuracy_score : 0.9962868900959359
F1-score macro : 0.9963400164462496
F1-score micro : 0.9987454835347289
F1-score weighted : 0.9987331570526927
Precision macro : 0.9931567006792663
Precision micro : 0.9977341746535304
Precision weighted : 0.9978854720987266


### 4. 직접 만든 질문 테스트

4-1) 질문, 답 데이터 만들기

In [27]:
# Hand-written questions used to sanity-check the classifier.
X_chatbot = [
    '맛있는 파스타집 추천해주세요',
    '종로에 분위기 좋은 순대국밥집 있나요?',
    '청결한 한식집 있어요?',
    '가성비 좋은 짬뽕 집 알려주세요',
    '줄 서서 기다리는 마라탕 집 있을까요',
    '보쌈집 위치 알려주십시오',
    '언니네횟집 주소좀요',
    '언니네횟집 후기 좀 알려주세요',
    '언니네횟집 사람들 반응점',
]

# Expected tags, one list per question in X_chatbot (same order).
# The opening bracket of this list was missing, which made the cell a SyntaxError.
y_chatbot = [
    ['recommendation', 'taste', 'western'],
    ['recommendation', 'atmosphere', 'korean'],
    ['recommendation', 'sanitation', 'korean'],
    ['recommendation', 'price', 'chinese'],
    ['recommendation', 'prefer', 'chinese'],
    ['questionLOC'],
    ['questionLOC'],
    ['questionREV'],
    ['questionREV'],
]

4-2) 필요 함수 정의

In [28]:
# Matches every run of characters that is NOT a space, a Hangul jamo/syllable
# or an ASCII letter; text_prepare deletes such runs.
QUESTION_RE = re.compile('[^ ㄱ-ㅣ가-힣a-zA-Z]+')

# Morphemes removed from the tokenized question.
# A comma was missing between '요' and '라는', so the two literals were
# concatenated into the single entry '요라는' and neither was filtered.
# NOTE(review): keep this list in sync with the preprocessing that produced
# the training data.
STOPWORDS = {
    '은', '는', '이', '가', '하', '아', '것', '들', '의', '그', '수', '한', '나', '같', '그렇',
    '문제', '그리고', '크', '중', '나오', '지금', '생각하', '집', '어떤', '명', '생각', '이런',
    '인', '지', '을', '를', '에', '스러운', '스러워', '주', '할', '만', '게', '도', '져', '된',
    '로', '고', '던', '로운', '면서',
    '사실', '이렇', '점', '싶', '말', '좀', '식당', '가게', '음식점',
    '는지', '나요', '해요', '해', '는가요', '삼', '게요', '예', '는가', '습니까', '죠', '려고요',
    '는지요', '서요', '였어요', '겠',
    '인가요', '요', '라는', '데', '해서', '세요', '어요', '을까요', '건가요', '겠죠', '실래요',
    '네요', '으세요', '지요', '인데요',
    '드려요', '려구요', '합니다',
}

# Mecab tagger shared by all text_prepare calls; created on first use so the
# tagger is not rebuilt for every sentence.
_MECAB = None


def _get_mecab():
    """Return the shared Mecab tagger, instantiating it on the first call."""
    global _MECAB
    if _MECAB is None:
        _MECAB = Mecab()
    return _MECAB


def text_prepare(text):
    """
    Normalize a question string for TF-IDF vectorization.

    Steps:
      1. Delete every character that is not a space, Hangul or an ASCII
         letter (see QUESTION_RE).
      2. Split the remaining text into morphemes with Mecab.
      3. Drop morphemes listed in STOPWORDS.

    Args:
        text: the raw question string.

    Returns:
        The remaining morphemes joined by single spaces.
    """
    text = QUESTION_RE.sub('', text)
    tokens = _get_mecab().morphs(text)
    return ' '.join(token for token in tokens if token not in STOPWORDS)

In [30]:
%%time
# Preprocess the SENTENCE data
X_chatbot = list(map(text_prepare, X_chatbot))

# TF-IDF vectorization with the unpickled vectorizer
X_chatbot_tfidf = tfidf_vectorizer.transform(X_chatbot)

# Encode the expected tags as indicator rows
y_chatbot = mlb.transform(y_chatbot)

# Predict tags: hard labels and raw decision scores
y_chatbot_predicted_labels_tfidf = classifier_tfidf.predict(X_chatbot_tfidf)
y_chatbot_predicted_scores_tfidf = classifier_tfidf.decision_function(X_chatbot_tfidf)

# Turn both expected and predicted indicator rows back into tag-name tuples
y_chatbot_inversed = mlb.inverse_transform(y_chatbot)
y_chatbot_pred_inversed = mlb.inverse_transform(y_chatbot_predicted_labels_tfidf)

CPU times: user 10.1 ms, sys: 24.5 ms, total: 34.7 ms
Wall time: 97.1 ms


In [31]:
# Print each preprocessed sentence next to its expected and predicted tags.
# The three sequences are iterated in lockstep instead of by index.
for sentence, expected_tags, predicted_tags in zip(
        X_chatbot, y_chatbot_inversed, y_chatbot_pred_inversed):
    print('SENTENCE:\t{}\n정답:\t{}\n예측:\t{}\n\n'.format(
        sentence,
        expected_tags,
        predicted_tags
    ))

SENTENCE:	맛있 파스타 추천
정답:	('recommendation', 'taste', 'western')
예측:	('recommendation', 'taste', 'western')


SENTENCE:	종로 분위기 좋 순대 국밥 있
정답:	('atmosphere', 'korean', 'recommendation')
예측:	('atmosphere', 'korean', 'recommendation')


SENTENCE:	청결 한식집 있
정답:	('korean', 'recommendation', 'sanitation')
예측:	('recommendation', 'sanitation')


SENTENCE:	가성 비 좋 짬뽕 알려
정답:	('chinese', 'price', 'recommendation')
예측:	('chinese', 'price', 'recommendation')


SENTENCE:	줄 서 서 기다리 마라 탕 있
정답:	('chinese', 'prefer', 'recommendation')
예측:	('chinese', 'prefer', 'recommendation')


SENTENCE:	보쌈 위치 알려 십시오
정답:	('questionLOC',)
예측:	('questionLOC',)


SENTENCE:	언니 네 횟집 주소 요
정답:	('questionLOC',)
예측:	('questionLOC',)


SENTENCE:	언니 네 횟집 후기 알려
정답:	('questionREV',)
예측:	('questionREV',)


SENTENCE:	언니 네 횟집 사람 반응점
정답:	('questionREV',)
예측:	('questionLOC', 'questionREV')




훈련 데이터와 비슷한 문장은 성능이 좋지만, 말투가 다르거나 훈련데이터에 없는 카테고리가 들어간 문장에 대해서는 정확도가 조금 떨어진다. 
훈련 데이터에 더욱 다양한 말투를 넣으면 정확도가 높아질 것으로 예상된다.