# 질문 의도 파악 - 선형 모델

In [1]:
import numpy as np
import pandas as pd
import pickle
import re
import time
from konlpy.tag import Mecab

### 1. 데이터 탐색

In [2]:
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at *path*."""
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # produced by a trusted source.
    with open(path, "rb") as f:
        return pickle.load(f)


# Train / test feature matrices (the file names indicate TF-IDF features).
X_train = _load_pickle("./questions/questions_pickle/X1_train_tfidf.pickle")
X_test = _load_pickle("./questions/questions_pickle/X1_test_tfidf.pickle")

# Vocabulary and reversed-vocabulary objects stored next to the features.
tfidf_vocab = _load_pickle("./questions/questions_pickle/tfidf_vocab_1.pickle")
tfidf_reversed_vocab = _load_pickle("./questions/questions_pickle/tfidf_reversed_vocab_1.pickle")

# Train / test targets.
y_train = _load_pickle("./questions/questions_pickle/y1_train.pickle")
y_test = _load_pickle("./questions/questions_pickle/y1_test.pickle")

# Label encoder and vectorizer objects reused later in the notebook.
lb = _load_pickle("./questions/questions_pickle/lb_1.pickle")
tfidf_vectorizer = _load_pickle("./questions/questions_pickle/tfidf_vectorizer_1.pickle")

In [3]:
X_train.shape, X_test.shape

((2707430, 117126), (300826, 117126))

In [4]:
y_train.shape, y_test.shape

((2707430, 3), (300826, 3))

In [5]:
lb.classes_

array(['questionLOC', 'questionREV', 'recommendation'], dtype='<U14')

In [6]:
tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=5, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(\\S+)', tokenizer=None,
                use_idf=True, vocabulary=None)

### 2. MultiLabel classifier

Classifier를 훈련시키는 함수 *train_classifier* 를 정의한다. 여기서는 *sklearn*의 One-vs-Rest approach 기법을 사용한다.<br>
[OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) 

이 방법은 tag마다 이진(binary) classifier를 하나씩, 총 *k* (= tag 개수) 개를 훈련시켜 입력을 *k* 개의 tag 중 하나로 분류한다. 

기본적인 분류 방법으로는 [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) 을 사용한다. 

가장 간단한 방법이지만, 텍스트 분류 작업에서 좋은 성능을 낸다. 분류할 tag가 많을 수록 시간이 많이 걸린다.

In [7]:
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [8]:
def train_classifier(X_train, y_train, C=10, max_iter=1000):
    """Fit a one-vs-rest logistic-regression classifier.

    Args:
        X_train: training feature matrix.
        y_train: training targets matching ``X_train`` row by row.
        C: value forwarded to ``LogisticRegression(C=...)``.  Defaults to 10,
            the value this notebook originally hard-coded.
        max_iter: value forwarded to ``LogisticRegression(max_iter=...)``.
            Defaults to 1000, the value originally hard-coded.

    Returns:
        The fitted ``OneVsRestClassifier``.
    """
    # L2 penalty, as in the original experiment.
    base_estimator = LogisticRegression(penalty='l2', C=C, max_iter=max_iter)
    return OneVsRestClassifier(base_estimator).fit(X_train, y_train)

In [9]:
%%time
classifier_tfidf = train_classifier(X_train, y_train)



CPU times: user 2min 43s, sys: 3.46 s, total: 2min 47s
Wall time: 54.9 s


In [10]:
classifier_tfidf

OneVsRestClassifier(estimator=LogisticRegression(C=10, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

이제 test 데이터의 태그를 예측할 수 있다: labels, scores

In [11]:
# Predicted labels for the test set.
y_test_predicted_labels = classifier_tfidf.predict(X_test)
# Raw per-class decision scores for the same test samples.
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)

In [12]:
# decision_function(X_test) yields one confidence score per sample and class;
# for linear models this is the signed distance of the sample to that class's
# separating hyperplane (it is not a distance between X_test and y_test).
y_test_predicted_scores[:5]

array([[ 10.5572586 , -12.55584067, -10.72234289],
       [  6.76742896,  -8.09138899,  -7.77241751],
       [  9.41760449, -10.89320321,  -9.80423098],
       [ -6.64364537,  -5.49941548,   5.40844863],
       [  6.1515729 ,  -6.42303731,  -7.59892587]])

In [13]:
# Indicator-encoded tags predicted from the TF-IDF features
# ('questionLOC': [1,0,0], 'questionREV': [0,1,0], 'recommendation': [0,0,1]).
y_test_predicted_labels[:5]

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0]])

In [14]:
# Convert the indicator-encoded tags back into their text labels.
y_test_pred_inversed = lb.inverse_transform(y_test_predicted_labels)  # predicted tags
y_test_inversed = lb.inverse_transform(y_test) # true tags
y_test_pred_inversed, y_test_inversed

(array(['questionLOC', 'questionLOC', 'questionLOC', ..., 'questionLOC',
        'questionLOC', 'questionLOC'], dtype='<U14'),
 array(['questionLOC', 'questionLOC', 'questionLOC', ..., 'questionLOC',
        'questionLOC', 'questionLOC'], dtype='<U14'))

### 3. Evaluation

모델 평가를 위해 사용할 classification metrics:
 - [Accuracy](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
 - [F1-score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)
 - [Area under ROC-curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)
 - [Area under precision-recall curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score) 
 
위 문서에서 micro/macro/weighted averaging 이 무엇인지 잘 살펴보자.

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

*print_evaluation_scores* 를 정의하여 정확도를 평가해본다.
 - *accuracy*
 - *F1-score macro/micro/weighted*
 - *Average precision macro/micro/weighted* (`average_precision_score`로 계산)

In [20]:
def print_evaluation_scores(y_test, predicted, scores=None):
    """Print accuracy, F1 and average-precision metrics for a set of predictions.

    Args:
        y_test: ground-truth targets.
        predicted: predicted labels, in the same format as ``y_test``.
        scores: optional continuous decision scores (for example the output
            of ``decision_function``).  When given, they are used for the
            average-precision lines; when omitted, ``predicted`` is used,
            which reproduces the previous numbers.

    Returns:
        None.  Every metric is written with ``print``.
    """
    averagings = ('macro', 'micro', 'weighted')

    print('accuracy_score :', accuracy_score(y_test, predicted))

    for averaging in averagings:
        print('F1-score {} :'.format(averaging),
              f1_score(y_test, predicted, average=averaging))

    # These values come from average_precision_score, so they are labelled
    # "Average precision"; they are not plain precision.
    ap_input = predicted if scores is None else scores
    for averaging in averagings:
        print('Average precision {} :'.format(averaging),
              average_precision_score(y_test, ap_input, average=averaging))

In [21]:
# Report the test-set metrics for the predictions stored in y_test_predicted_labels.
print('Tfidf')
print_evaluation_scores(y_test, y_test_predicted_labels)

Tfidf
accuracy_score : 0.999674230285946
F1-score macro : 0.9997388920427711
F1-score micro : 0.9997390643879397
F1-score weighted : 0.9997390738881349
Precision macro : 0.9995488113127032
Precision micro : 0.9995491150412575
Precision weighted : 0.9995491118697507


정확도가 약 99.97%이다.

##### LinearSVC 모델
- C값 변화 : 0.1 / 1 / 10 / 100 

In [26]:
from sklearn.svm import LinearSVC

# One-vs-rest linear SVM; C is the value being varied across these cells (here 0.1).
classifier_tfidf = OneVsRestClassifier(LinearSVC(C=0.1))
classifier_tfidf.fit(X_train, y_train)

# Ground-truth tags as text, then predictions as scores, indicators and text.
y_test_inversed = lb.inverse_transform(y_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_pred_inversed = lb.inverse_transform(y_test_predicted_labels)

print_evaluation_scores(y_test, y_test_predicted_labels)

accuracy_score : 0.9994648068983399
F1-score macro : 0.9995809799626073
F1-score micro : 0.9995811866378594
F1-score weighted : 0.9995812384795967
Precision macro : 0.9992753634513681
Precision micro : 0.9992755771846008
Precision weighted : 0.9992757490024857


In [27]:
from sklearn.svm import LinearSVC

# One-vs-rest linear SVM; C is the value being varied across these cells (here 1).
classifier_tfidf = OneVsRestClassifier(LinearSVC(C=1))
classifier_tfidf.fit(X_train, y_train)

# Ground-truth tags as text, then predictions as scores, indicators and text.
y_test_inversed = lb.inverse_transform(y_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_pred_inversed = lb.inverse_transform(y_test_predicted_labels)

print_evaluation_scores(y_test, y_test_predicted_labels)

accuracy_score : 0.99969417537048
F1-score macro : 0.9997571808212807
F1-score micro : 0.9997573452910029
F1-score weighted : 0.9997573521248041
Precision macro : 0.9995809399280239
Precision micro : 0.9995812349448392
Precision weighted : 0.9995812263886947


In [28]:
from sklearn.svm import LinearSVC

# One-vs-rest linear SVM; C is the value being varied across these cells (here 10).
classifier_tfidf = OneVsRestClassifier(LinearSVC(C=10))
classifier_tfidf.fit(X_train, y_train)

# Ground-truth tags as text, then predictions as scores, indicators and text.
y_test_inversed = lb.inverse_transform(y_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_pred_inversed = lb.inverse_transform(y_test_predicted_labels)

print_evaluation_scores(y_test, y_test_predicted_labels)

accuracy_score : 0.9997274171780365
F1-score macro : 0.9997521655044327
F1-score micro : 0.9997523489453222
F1-score weighted : 0.9997523472986352
Precision macro : 0.9995864270695255
Precision micro : 0.9995867556830901
Precision weighted : 0.9995867484146275


In [29]:
from sklearn.svm import LinearSVC

# One-vs-rest linear SVM; C is the value being varied across these cells (here 100).
classifier_tfidf = OneVsRestClassifier(LinearSVC(C=100))
classifier_tfidf.fit(X_train, y_train)

# Ground-truth tags as text, then predictions as scores, indicators and text.
y_test_inversed = lb.inverse_transform(y_test)
y_test_predicted_scores = classifier_tfidf.decision_function(X_test)
y_test_predicted_labels = classifier_tfidf.predict(X_test)
y_test_pred_inversed = lb.inverse_transform(y_test_predicted_labels)

print_evaluation_scores(y_test, y_test_predicted_labels)



accuracy_score : 0.9996310159361226
F1-score macro : 0.9996856543629943
F1-score micro : 0.9996858696176085
F1-score weighted : 0.9996858678492753
Precision macro : 0.9994711913808424
Precision micro : 0.9994715635594027
Precision weighted : 0.9994715672966115


### 4. 직접 만든 질문 테스트

4-1) 질문, 답 데이터 만들기

In [36]:
# Hand-written questions used as a small sanity check of the classifier.
X_chatbot = [
    '맛있는 파스타집 추천해주세요',
    '종로에 분위기 좋은 순대국밥집 있나요?',
    '서울에 데이트하기 좋은 파스타집 추천해주세요.',
    '그 순대국밥 집 주소가 어떻게 되나요?',
    '그 파스타집 위치 좀 알려주세요.',
    '그 순대국밥 집 어때요?',
    '원할머니보쌈 사람들 후기가 어떤가요',
]

# Expected tag for each question above, in the same order.
y_chatbot = (
    ['recommendation'] * 3
    + ['questionLOC'] * 2
    + ['questionREV'] * 2
)

4-2) 필요 함수 정의

In [31]:
# Matches every run of characters that is NOT a space, a Hangul character
# (ranges ㄱ-ㅣ and 가-힣) or an ASCII letter.
QUESTION_RE = re.compile('[^ ㄱ-ㅣ가-힣a-zA-Z]+')

# Tokens dropped after tokenization.
# Fix: the original list had `'요' '라는'` with no comma between the two
# literals; Python concatenates adjacent string literals, so the set held the
# single entry '요라는' and neither '요' nor '라는' was filtered.
# NOTE(review): if the training corpus was preprocessed with the old
# (concatenated) list, confirm that filtering these two tokens at inference
# time does not introduce a train/inference mismatch.
STOPWORDS = {
    '은', '는', '이', '가', '하', '아', '것', '들', '의', '그', '수', '한', '나', '같', '그렇',
    '문제', '그리고', '크', '중', '나오', '지금', '생각하', '집', '어떤', '명', '생각', '이런',
    '인', '지', '을', '를', '에', '스러운', '스러워', '주', '할', '만', '게', '도', '져', '된',
    '로', '고', '던', '로운', '면서',
    '사실', '이렇', '점', '싶', '말', '좀', '식당', '가게', '음식점',
    '는지', '나요', '해요', '해', '는가요', '삼', '게요', '예', '는가', '습니까', '죠', '려고요',
    '는지요', '서요', '였어요', '겠',
    '인가요', '요', '라는', '데', '해서', '세요', '어요', '을까요', '건가요', '겠죠', '실래요',
    '네요', '으세요', '지요', '인데요',
    '드려요', '려구요', '합니다',
}

def text_prepare(text):
    """Clean a question and return its stopword-filtered tokens as one string.

    Args:
        text: the raw question string.

    Returns:
        The tokens produced by ``Mecab().morphs`` on the cleaned text, minus
        any token found in ``STOPWORDS``, joined with single spaces.
    """
    # Reuse one tagger across calls instead of constructing a new Mecab
    # object for every sentence, as the original did.
    tagger = getattr(text_prepare, "_tagger", None)
    if tagger is None:
        tagger = Mecab()
        text_prepare._tagger = tagger

    # Remove everything QUESTION_RE matches, i.e. all characters other than
    # spaces, Hangul and ASCII letters.
    cleaned = QUESTION_RE.sub('', text)

    # Tokenize and drop the stopwords.
    return ' '.join(token for token in tagger.morphs(cleaned) if token not in STOPWORDS)

In [41]:
# Preprocess every question.
# NOTE: this cell overwrites X_chatbot and y_chatbot in place, so running it a
# second time without re-creating them would preprocess / encode them twice.
X_chatbot = list(map(text_prepare, X_chatbot))

# Turn the preprocessed questions into TF-IDF features with the loaded vectorizer.
X_chatbot_tfidf = tfidf_vectorizer.transform(X_chatbot)

# Encode the expected tags with lb, and keep a text version for display.
y_chatbot = lb.transform(y_chatbot)
y_chatbot_inversed = lb.inverse_transform(y_chatbot)

# Predict decision scores and labels, then map the labels back to text.
y_chatbot_predicted_scores_tfidf = classifier_tfidf.decision_function(X_chatbot_tfidf)
y_chatbot_predicted_labels_tfidf = classifier_tfidf.predict(X_chatbot_tfidf)
y_chatbot_pred_inversed = lb.inverse_transform(y_chatbot_predicted_labels_tfidf)

In [40]:
# Print each sentence in X_chatbot next to its expected and predicted tag.
# zip walks the three sequences in lockstep instead of indexing by position.
for sentence, expected_tag, predicted_tag in zip(
        X_chatbot, y_chatbot_inversed, y_chatbot_pred_inversed):
    print('SENTENCE:\t{}\n정답:\t{}\n예측:\t{}\n\n'.format(
        sentence,
        expected_tag,
        predicted_tag
    ))

SENTENCE:	맛있 파스타 추천
정답:	recommendation
예측:	recommendation


SENTENCE:	종로 분위기 좋 순대 국밥 있
정답:	recommendation
예측:	recommendation


SENTENCE:	서울 트 기 좋 파스타 추천
정답:	recommendation
예측:	recommendation


SENTENCE:	순대 국밥 주소 어떻게 되
정답:	questionLOC
예측:	questionLOC


SENTENCE:	파스타 위치 알려
정답:	questionLOC
예측:	questionLOC


SENTENCE:	순대 국밥 어때요
정답:	questionREV
예측:	questionREV


SENTENCE:	원 할머니 보쌈 사람 후기 가요
정답:	questionREV
예측:	questionREV




좋은 성능이다!

**pickle 저장**

In [11]:
import pickle


def _dump_pickle(obj, path):
    """Serialize *obj* into the file at *path* with the highest pickle protocol."""
    with open(path, "wb") as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


# Persist the current classifier together with the label encoder and the
# vectorizer it is used with.
_dump_pickle(classifier_tfidf, './questions_model/models/classifier_1.pickle')
_dump_pickle(lb, './questions_model/models/lb_1.pickle')
_dump_pickle(tfidf_vectorizer, './questions_model/models/tfidf_vectorizer_1.pickle')