# 질문 의도 파악 - 선형 모델

In [1]:
import numpy as np
import pandas as pd
import pickle
import re
import time
from konlpy.tag import Mecab

### 1. 데이터 탐색

In [2]:
with open("./questions/questions_pickle/X2_cate_train_tfidf.pickle", "rb") as f:
    X_train = pickle.load(f)
with open("./questions/questions_pickle/X2_cate_test_tfidf.pickle", "rb") as f:
    X_test = pickle.load(f)
with open("./questions/questions_pickle/tfidf_vocab_2_cate.pickle", "rb") as f:
    tfidf_vocab = pickle.load(f)
with open("./questions/questions_pickle/tfidf_reversed_vocab_2_cate.pickle", "rb") as f:
    tfidf_reversed_vocab = pickle.load(f)
with open("./questions/questions_pickle/y2_cate_train.pickle", "rb") as f:
    y_train = pickle.load(f)
with open("./questions/questions_pickle/y2_cate_test.pickle", "rb") as f:
    y_test = pickle.load(f)
with open("./questions/questions_pickle/lb_2_cate.pickle", "rb") as f:
    lb = pickle.load(f)
with open("./questions/questions_pickle/tfidf_vectorizer_2_cate.pickle", "rb") as f:
    tfidf_vectorizer = pickle.load(f)

In [3]:
X_train.shape, X_test.shape

((818496, 30941), (90944, 30941))

In [4]:
y_train.shape, y_test.shape

((818496, 7), (90944, 7))

In [5]:
lb.classes_

array(['asian', 'chicken', 'chinese', 'etc', 'japanese', 'korean',
       'western'], dtype='<U8')

In [6]:
tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=5, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(\\S+)', tokenizer=None,
                use_idf=True, vocabulary=None)

### 2. MultiLabel classifier

Classifier를 훈련시키는 함수 *train_classifier* 를 정의한다. 여기서는 *sklearn*의 One-vs-Rest approach 기법을 사용한다.<br>
[OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) 

이 방법으로 *k* (= number of tags) 개의 tag 중 하나로 분류해주는 classifier를 훈련시킬 수 있다. 

기본적인 분류 방법으로는 [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) 을 사용한다. 

가장 간단한 방법이지만, 텍스트 분류 작업에서 좋은 성능을 낸다. 분류할 tag가 많을 수록 시간이 많이 걸린다.

In [7]:
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [8]:
def train_classifier(X_train, y_train):
    """
      X_train, y_train — train 데이터
      return: 훈련된 classifier
    """

    return OneVsRestClassifier(LogisticRegression(penalty= 'l2' #L2 규제 사용
                                                , C = 10
                                                , max_iter = 1000)).fit(X_train, y_train)

In [9]:
%%time
classifier_tfidf = train_classifier(X_train, y_train)



CPU times: user 1min 54s, sys: 2.93 s, total: 1min 57s
Wall time: 30.8 s


In [10]:
classifier_tfidf

OneVsRestClassifier(estimator=LogisticRegression(C=10, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

이제 test 데이터의 태그를 예측할 수 있다: labels, scores

In [11]:
y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)

In [12]:
# decision function(X_test)를 실행하면 X_test와 y_test 각 태그의 거리값을 나타낸다.
y_test_predicted_scores[:5]

array([[ -7.74311698,   4.37514697,  -8.31386427,  -8.49680788,
         -8.29589319,  -5.32758244,  -7.85518873],
       [-10.48062112, -10.94394173, -10.50351368, -10.54001805,
        -10.43394091,   9.81329751, -10.08922556],
       [ -8.00614206,  -9.77548383,  -8.94172441,  -8.79230965,
         -8.67783023,   7.25569696,  -8.27121428],
       [ -7.78141896, -10.07305292,  -9.53992256,  -9.41115876,
         -9.32496851,   7.26932614,  -8.94065276],
       [ -7.07798798,  -9.82664152,  -8.27874702,  -8.93233926,
         -8.91145552,   6.41676474,  -8.378813  ]])

In [13]:
# tf-idf로 예측한 tag의 벡터화표현 
y_test_predicted_labels[:5]

array([[0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0]])

In [14]:
# 벡터화된 태그를 다시 텍스트 데이터로 변환시킨다.
y_test_pred_inversed = lb.inverse_transform(y_test_predicted_labels)  # 예측 태그
y_test_inversed = lb.inverse_transform(y_test) # 실제 태그
y_test_pred_inversed, y_test_inversed

(array(['chicken', 'korean', 'korean', ..., 'chinese', 'korean', 'western'],
       dtype='<U8'),
 array(['chicken', 'korean', 'korean', ..., 'chinese', 'korean', 'western'],
       dtype='<U8'))

### 3. Evaluation

모델 평가를 위해 사용할 classification metrics:
 - [Accuracy](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
 - [F1-score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)
 - [Area under ROC-curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)
 - [Area under precision-recall curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score) 
 
위 문서에서 micro/macro/weighted averaging 이 무엇인지 잘 살펴보자.

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

*print_evaluation_scores* 를 정의하여 정확도를 평가해본다.
 - *accuracy*
 - *F1-score macro/micro/weighted*
 - *Precision macro/micro/weighted*

In [16]:
def print_evaluation_scores(y_test, predicted):
    print('accuracy_score :', accuracy_score(y_test, predicted))

    f1_macro = f1_score(y_test, predicted, average='macro')
    f1_micro = f1_score(y_test, predicted, average='micro')
    f1_weighted = f1_score(y_test, predicted, average='weighted')

    print('F1-score macro :', f1_macro)
    print('F1-score micro :', f1_micro)
    print('F1-score weighted :', f1_weighted)

    print('Precision macro :', average_precision_score(y_test, predicted, average='macro'))
    print('Precision micro :', average_precision_score(y_test, predicted, average='micro'))
    print('Precision weighted :', average_precision_score(y_test, predicted, average='weighted'))

In [17]:
print('Tfidf')
print_evaluation_scores(y_test, y_test_predicted_labels)

Tfidf
accuracy_score : 0.9979217980295566
F1-score macro : 0.9944122749664175
F1-score micro : 0.9980152512274105
F1-score weighted : 0.9979984928176973
Precision macro : 0.9893532680683448
Precision micro : 0.9963187611054002
Precision weighted : 0.9964631301702154


정확도가 99%이다.

##### LinearSVC 모델
- C값 변화 : 0.1 / 1 / 10 / 100 

In [18]:
from sklearn.svm import LinearSVC
classifier_tfidf = OneVsRestClassifier(LinearSVC(C = 0.1)).fit(X_train, y_train) # C값 변화시켜본다.

y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
y_test_pred_inversed = lb.inverse_transform(y_test_predicted_labels)
y_test_inversed = lb.inverse_transform(y_test)

print_evaluation_scores(y_test, y_test_predicted_labels)

accuracy_score : 0.998042751583392
F1-score macro : 0.9945188497527134
F1-score micro : 0.9980922011160898
F1-score weighted : 0.998061221127528
Precision macro : 0.9896587373299272
Precision micro : 0.9964629366403088
Precision weighted : 0.9963079312747474


In [19]:
from sklearn.svm import LinearSVC
classifier_tfidf = OneVsRestClassifier(LinearSVC(C = 1)).fit(X_train, y_train) # C값 변화시켜본다.

y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
y_test_pred_inversed = lb.inverse_transform(y_test_predicted_labels)
y_test_inversed = lb.inverse_transform(y_test)
    
print_evaluation_scores(y_test, y_test_predicted_labels)

accuracy_score : 0.9978668191414497
F1-score macro : 0.9943423075389798
F1-score micro : 0.9979767991643301
F1-score weighted : 0.9979590337570081
Precision macro : 0.9892254169348298
Precision micro : 0.9962451528342281
Precision weighted : 0.9963791337913465


In [20]:
from sklearn.svm import LinearSVC
classifier_tfidf = OneVsRestClassifier(LinearSVC(C = 10)).fit(X_train, y_train) # C값 변화시켜본다.

y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
y_test_pred_inversed = lb.inverse_transform(y_test_predicted_labels)
y_test_inversed = lb.inverse_transform(y_test)
    
print_evaluation_scores(y_test, y_test_predicted_labels)

accuracy_score : 0.9979217980295566
F1-score macro : 0.9943898180236239
F1-score micro : 0.9979768214105073
F1-score weighted : 0.9979676099632657
Precision macro : 0.9892822937178768
Precision micro : 0.9962436267724283
Precision weighted : 0.9965381759535495


In [21]:
from sklearn.svm import LinearSVC
classifier_tfidf = OneVsRestClassifier(LinearSVC(C = 100)).fit(X_train, y_train) # C값 변화시켜본다.

y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
y_test_pred_inversed = lb.inverse_transform(y_test_predicted_labels)
y_test_inversed = lb.inverse_transform(y_test)
    
print_evaluation_scores(y_test, y_test_predicted_labels)



accuracy_score : 0.997932793807178
F1-score macro : 0.9944768564365951
F1-score micro : 0.9980153821626288
F1-score weighted : 0.9980102240932788
Precision macro : 0.9894309779754914
Precision micro : 0.9963096011162498
Precision weighted : 0.99664838505653


### 4. 직접 만든 질문 테스트

4-1) 질문, 답 데이터 만들기

In [29]:
X_chatbot = [
    '맛있는 파스타집 추천해주세요'
    , '종로에 분위기 좋은 순대국밥집 있나요?'
    , '서울에 데이트하기 좋은 피자집 추천해주세요.'
    , '청결한 한식집 있어요?'
    , '가성비 좋은 짬뽕 집 알려주세요'
    , '줄 서서 기다리는 마라탕 집 있을까요'
    , '인기 많은 내장탕 집 알려주십시오'
    , '많은 사람들이 좋아하는 일식 횟집 알려주십시오'
    , '많은 사람들이 좋아하는 횟집 알려주십시오'
]
y_chatbot = [
    'western','korean','western'
    ,'korean','chinese','chinese'
    ,'korean','japanese','korean'
]

4-2) 필요 함수 정의

In [23]:
QUESTION_RE = re.compile('[^ ㄱ-ㅣ가-힣a-zA-Z]+')

STOPWORDS = set(['은','는','이','가','하','아','것','들','의','그','수','한','나','같','그렇'
                ,'문제','그리고','크','중','나오','지금','생각하','집','어떤','명','생각','이런'
                ,'인','지','을','를','에','스러운','스러워','주','할','만','게','도','져','된','로','고','던','로운','면서'
                ,'사실','이렇','점','싶','말','좀','식당','가게','집','음식점'
                ,'는지','나요','해요','해','는가요','삼','게요','예','는가','습니까','죠','려고요','는지요','서요','였어요','겠'
                ,'인가요','요' '라는','데','해서','세요','어요','을까요','건가요','겠죠','실래요','네요','으세요','지요','인데요'
                ,'드려요','려구요','합니다'])

def text_prepare(text):
    
    # 한글과 스페이스바를 제외한 다른 문자들은 모두 지운다.
    text = QUESTION_RE.sub('', text) 
    
    # Mecab 토크나이저
    mecab = Mecab()
    
    # mecab으로 text를 형태소 단위로 나누어 morphs_tokens 리스트에 저장한다.
    # 불용어를 지운다.
    text = ' '.join(token for token in mecab.morphs(text) if token not in STOPWORDS)

    return text

In [30]:
# SENTENCE 데이터 전처리
X_chatbot = [text_prepare(x) for x in X_chatbot]

# tf-idf 벡터화
X_chatbot_tfidf = tfidf_vectorizer.transform(X_chatbot)

# tag 데이터 전처리
y_chatbot = lb.transform(y_chatbot)

# tag 예측
y_chatbot_predicted_labels_tfidf = classifier_tfidf.predict(X_chatbot_tfidf)
y_chatbot_predicted_scores_tfidf = classifier_tfidf.decision_function(X_chatbot_tfidf)

# 예측된 tag를 텍스트로 변환
y_chatbot_pred_inversed = lb.inverse_transform(y_chatbot_predicted_labels_tfidf)
y_chatbot_inversed = lb.inverse_transform(y_chatbot)

In [31]:
for i in range(0,len(y_chatbot)):
    print('SENTENCE:\t{}\n정답:\t{}\n예측:\t{}\n\n'.format(
        X_chatbot[i],
        y_chatbot_inversed[i],
        y_chatbot_pred_inversed[i]
    ))

SENTENCE:	맛있 파스타 추천
정답:	western
예측:	western


SENTENCE:	종로 분위기 좋 순대 국밥 있
정답:	korean
예측:	korean


SENTENCE:	서울 데이트 기 좋 피자 추천
정답:	western
예측:	western


SENTENCE:	청결 한식집 있
정답:	korean
예측:	korean


SENTENCE:	가성 비 좋 짬뽕 알려
정답:	chinese
예측:	chinese


SENTENCE:	줄 서 서 기다리 마라 탕 있
정답:	chinese
예측:	chinese


SENTENCE:	인기 많 내 장탕 알려 십시오
정답:	korean
예측:	korean


SENTENCE:	많 사람 좋 일식 횟집 알려 십시오
정답:	japanese
예측:	japanese


SENTENCE:	많 사람 좋 횟집 알려 십시오
정답:	korean
예측:	korean




좋은 성능이다. 이번에는 훈련 데이터에 없던 메뉴를 질문 문장에 넣어서 예측도를 조사해본다.

In [32]:
X_chatbot = [
      '맛있는 훠궈 추천해주세요'
    , '종로에 분위기 좋은 갈비찜 가게 있나요?'
    , '서울에 데이트하기 좋은 타코집 추천해주세요.'
    , '청결한 낙곱새 식당 있어요?'
    , '가성비 좋은 도가니탕이 먹고 싶어요'
    , '줄 서서 기다리는 비빔국수 가게 있을까요'
]
y_chatbot = [
    'chinese','korean','western'
    ,'korean','korean','korean'
]

In [33]:
# SENTENCE 데이터 전처리
X_chatbot = [text_prepare(x) for x in X_chatbot]

# tf-idf 벡터화
X_chatbot_tfidf = tfidf_vectorizer.transform(X_chatbot)

# tag 데이터 전처리
y_chatbot = lb.transform(y_chatbot)

# tag 예측
y_chatbot_predicted_labels_tfidf = classifier_tfidf.predict(X_chatbot_tfidf)
y_chatbot_predicted_scores_tfidf = classifier_tfidf.decision_function(X_chatbot_tfidf)

# 예측된 tag를 텍스트로 변환
y_chatbot_pred_inversed = lb.inverse_transform(y_chatbot_predicted_labels_tfidf)
y_chatbot_inversed = lb.inverse_transform(y_chatbot)

for i in range(0,len(y_chatbot)):
    print('SENTENCE:\t{}\n정답:\t{}\n예측:\t{}\n\n'.format(
        X_chatbot[i],
        y_chatbot_inversed[i],
        y_chatbot_pred_inversed[i]
    ))

SENTENCE:	맛있 훠궈 추천
정답:	chinese
예측:	korean


SENTENCE:	종로 분위기 좋 갈비찜 있
정답:	korean
예측:	korean


SENTENCE:	서울 데이트 기 좋 타코 추천
정답:	western
예측:	korean


SENTENCE:	청결 낙 곱새 있
정답:	korean
예측:	korean


SENTENCE:	가성 비 좋 도가니 탕 먹
정답:	korean
예측:	asian


SENTENCE:	줄 서 서 기다리 비빔국수 있
정답:	korean
예측:	korean




IndexError: list index out of range

예상대로 훈련 데이터에 없던 메뉴는 정확도가 좋지 않다.

이를 해결하려면 Bi-LSTM을 이용한 Named Entity Recognition 모델이 필요하다.