## 네이버 무비 리뷰 분류 모형 

약 20만 건의 네이버 무비 리뷰 데이터를 활용해 Sentiment Classification을 하는 모형을 만들어 본다. 

In [1]:
import pandas as pd
import numpy as np
from konlpy.tag import Mecab
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd 
import mxnet as mx
import multiprocessing as mp
import time
import itertools
from tqdm import tqdm


# Shared Mecab morphological analyzer instance, reused by the tokenizing helpers.
mecab = Mecab()


### Vocab 생성 

학습셋 전체의 문장을 이용해 전처리를 한 뒤, Vocab을 생성한다. `Mecab` 형태소 분석기로 형태소만으로 Vocab을 생성 

In [2]:
# keep_default_na=False keeps empty or "NA"-like reviews as plain strings instead
# of float NaN, so str(comment) never turns a missing review into the text 'nan'.
rating = pd.read_csv("ratings.txt", sep='\t', keep_default_na=False)

In [3]:
# Preview the first rows of the raw review table.
rating.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [4]:
# Pair every review text with its sentiment label: [(document, label), ...].
dataset = list(zip(rating['document'], rating['label']))

In [5]:
# Target token count per review; passed to PadSequence as the sequence length.
seq_len = 30

In [6]:
# Pads each token list with "<pad>" (or clips it) to exactly seq_len tokens.
length_clip = nlp.data.PadSequence(seq_len, pad_val="<pad>")

# Leftover from the disabled POS-based preprocessing experiment:
#import re
#REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
#REPLACE_WITH_SPACE = re.compile('<')

def preprocess(data):
    """Tokenize one (comment, label) pair into a fixed-length morpheme list.

    Args:
        data: A (comment, label) tuple. The comment is converted with
            ``str()`` and stripped before it is tokenized.

    Returns:
        A tuple of (tokens padded/clipped by ``length_clip``, label).
    """
    # Experiment notes (disabled variants kept for reference):
    #   1) Nouns only -> 74% accuracy
    #        nouns = mecab.nouns(str(comment).strip())
    #        return (length_clip(nouns), label)
    #   2) POS-tagged tokens -> TypeError: '<' not supported between
    #      instances of 'str' and 'tuple'
    #        comment = [REPLACE_WITH_SPACE.sub(" ", line) for line in comment]
    #        pos = mecab.pos(str(comment).strip())
    #        return (length_clip(pos), label)
    text, label = data
    cleaned = str(text).strip()
    # Morpheme-level tokenization is the active variant.
    tokens = mecab.morphs(cleaned)
    return length_clip(tokens), label


def preprocess_dataset(dataset):
    """Tokenize every sample in parallel and print the elapsed time.

    Args:
        dataset: Sequence of (comment, label) pairs accepted by ``preprocess``.

    Returns:
        A ``gluon.data.SimpleDataset`` holding the preprocessed samples in
        the same order as the input.
    """
    started_at = time.time()
    with mp.Pool() as workers:
        # Pool.map preserves input order, so samples stay aligned with labels.
        tokenized = workers.map(preprocess, dataset)
        processed = gluon.data.SimpleDataset(tokenized)
    elapsed = time.time() - started_at
    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'
          .format(elapsed, len(processed)))
    return processed

In [7]:
# Tokenize the full (document, label) list; the result is a gluon SimpleDataset.
preprocessed = preprocess_dataset(dataset)

Done! Tokenizing Time=6.70s, #Sentences=200000


101번째 문장(인덱스 100)의 첫 11개 토큰 출력  

In [8]:
# Peek at the first 11 tokens of the sample at index 100.
preprocessed[100][0][:11]

['대박', '이', '였', '지', '이건', '한마디', '로', '.', '.', '<pad>', '<pad>']

학습셋 전체의 토큰 빈도를 집계해 `counter`를 만들고, 이를 바탕으로 `vocab`을 생성. 
문장 생성이나 seq2seq가 아니기 때문에 `bos_token`, `eos_token` 표현은 생략 

In [13]:
# Count token frequencies over every preprocessed sentence.
all_tokens = itertools.chain.from_iterable(tokens for tokens, _ in preprocessed)
counter = nlp.data.count_tokens(all_tokens)
# Only tokens seen at least 10 times get their own index (min_freq=15 was also
# tried); bos/eos tokens are omitted because this is a classification task.
vocab = nlp.Vocab(counter, bos_token=None, eos_token=None, min_freq=10)

In [14]:
# Load the pre-trained fastText embedding published under the 'wiki.ko' source.
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko')

In [15]:
# Attach the fastText vectors to the vocabulary tokens.
vocab.set_embedding(fasttext_simple)

### 학습셋 생성 

토큰을 `index`로 변환 하여 학습을 위한 데이터로 변환 

In [16]:
# Map each token list to vocabulary indices, keeping its label alongside.
preprocessed_encoded = [(vocab[tokens], target) for tokens, target in preprocessed]

In [17]:
# Hold out 10% of the encoded samples (valid_ratio=0.1) as the test split.
train, test = nlp.data.train_valid_split(preprocessed_encoded, valid_ratio=0.1)

In [18]:
# Stack the token-index lists into one batch array and the labels into a
# float32 array.
batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Stack(),
                                      nlp.data.batchify.Stack('float32'))

# Training: reshuffle every epoch and drop the incomplete final batch.
train_dataloader = gluon.data.DataLoader(train, batch_size=100, batchify_fn=batchify_fn,
                                         shuffle=True, last_batch='discard')
# Evaluation: keep a fixed order and keep the last partial batch so that every
# test sample contributes to the reported loss and accuracy.
test_dataloader = gluon.data.DataLoader(test, batch_size=100, batchify_fn=batchify_fn,
                                        shuffle=False, last_batch='keep')

### 모델 정의 

In [19]:
class SentClassificationModelAtt(gluon.HybridBlock):
    """Bidirectional-LSTM sentence classifier with an MLP attention layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        num_embed: Embedding dimension.
        hidden_size: Hidden units of the LSTM (per direction).
        **kwargs: Forwarded to ``gluon.HybridBlock``.
    """

    def __init__(self, vocab_size, num_embed, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        with self.name_scope():
            self.embed = nn.Embedding(input_dim=vocab_size, output_dim=num_embed)
            self.drop = nn.Dropout(0.3)
            self.lstm = rnn.LSTM(hidden_size, dropout=0.2,
                                 bidirectional=True, layout="NTC")
            # First argument is the attention cell's hidden unit count.
            self.attention = nlp.model.MLPAttentionCell(30, dropout=0.2)
            # Two output units, one per sentiment class.
            self.dense = nn.Dense(2)

    def hybrid_forward(self, F, inputs):
        """Return the two-unit class scores for a batch of token indices."""
        embedded = self.embed(inputs)
        encoded = self.lstm(self.drop(embedded))
        # Self-attention: the LSTM outputs serve as both query and key.
        context, _ = self.attention(encoded, encoded)
        return self.dense(context)

In [21]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook also runs on machines without a GPU.
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

# Create the model instance; the embedding width matches the pre-trained
# vectors attached to the vocabulary.
model = SentClassificationModelAtt(vocab_size=len(vocab.idx_to_token),
                                   num_embed=vocab.embedding.idx_to_vec.shape[1],
                                   hidden_size=60)


In [22]:
# Xavier-initialize all parameters on the chosen device.
model.initialize(mx.init.Xavier(),ctx=ctx)
# Apply the pre-trained embedding only after the model has been initialized.
model.embed.weight.set_data(vocab.embedding.idx_to_vec.as_in_context(ctx))
model.hybridize()

In [24]:
# Adam optimizer (no explicit hyper-parameters passed) over all model parameters.
trainer = gluon.Trainer(model.collect_params(), 'adam')
# Softmax cross-entropy on the model's two-unit output.
loss = gluon.loss.SoftmaxCrossEntropyLoss()

In [25]:
def evaluate_accuracy(model, data_iter, ctx=ctx):
    """Return the classification accuracy of ``model`` over ``data_iter``.

    Args:
        model: Callable network returning per-class scores for a batch.
        data_iter: Iterable yielding (data, label) batches.
        ctx: Device context the batches are copied to before the forward pass.

    Returns:
        The accuracy value reported by ``mx.metric.Accuracy``.
    """
    metric = mx.metric.Accuracy()
    for batch, target in data_iter:
        batch = batch.as_in_context(ctx)
        target = target.as_in_context(ctx)
        scores = model(batch)
        # Reduce the class scores to the predicted class index per sample.
        predicted = nd.argmax(scores, axis=1)
        metric.update(preds=predicted, labels=target)
    _, value = metric.get()
    return value

In [26]:
def calculate_loss(model, data_iter, loss_obj, ctx=ctx):
    """Return the mean of the per-batch average losses over ``data_iter``.

    Args:
        model: Callable network returning per-class scores for a batch.
        data_iter: Iterable yielding (data, label) batches.
        loss_obj: Loss callable invoked as ``loss_obj(output, label)``.
        ctx: Device context the batches are copied to before the forward pass.

    Returns:
        ``np.mean`` of the per-batch mean losses.
    """
    batch_losses = []
    for batch, target in data_iter:
        batch = batch.as_in_context(ctx)
        target = target.as_in_context(ctx)
        per_sample = loss_obj(model(batch), target)
        # asscalar() copies the batch mean back to the host as a Python number.
        batch_losses.append(nd.mean(per_sample).asscalar())
    return np.mean(batch_losses)

In [27]:
epochs = 10

# Per-epoch history of the tracked metrics.
tot_test_loss = []
tot_test_accu = []
tot_train_loss = []
for e in range(epochs):
    train_loss = []
    # Mini-batch training pass over the whole training loader.
    for data, label in tqdm(train_dataloader):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)

        with autograd.record():
            output = model(data)
            loss_ = loss(output, label)
            loss_.backward()
        # The batch size is passed so the trainer can normalize the gradients.
        trainer.step(data.shape[0])

        train_loss.append(nd.mean(loss_).asscalar())

    # Evaluate on the held-out split after every epoch.
    epoch_train_loss = np.mean(train_loss)
    test_loss = calculate_loss(model, test_dataloader, loss_obj=loss, ctx=ctx)
    test_accu = evaluate_accuracy(model, test_dataloader, ctx=ctx)

    print("Epoch %s. Train Loss: %s, Test Loss : %s, Test Accuracy : %s" % (e, epoch_train_loss, test_loss, test_accu))
    tot_test_loss.append(test_loss)
    tot_train_loss.append(epoch_train_loss)
    tot_test_accu.append(test_accu)
    

100%|██████████| 1800/1800 [00:50<00:00, 35.99it/s]
  0%|          | 0/1800 [00:00<?, ?it/s]

Epoch 0. Train Loss: 0.3649794, Test Loss : 0.32480225, Test Accuracy : 0.8605


100%|██████████| 1800/1800 [00:49<00:00, 36.14it/s]
  0%|          | 5/1800 [00:00<00:37, 47.53it/s]

Epoch 1. Train Loss: 0.29139355, Test Loss : 0.30886558, Test Accuracy : 0.8652


100%|██████████| 1800/1800 [00:49<00:00, 36.47it/s]
  0%|          | 5/1800 [00:00<00:37, 47.61it/s]

Epoch 2. Train Loss: 0.25976864, Test Loss : 0.3031374, Test Accuracy : 0.87005


100%|██████████| 1800/1800 [00:33<00:00, 53.36it/s]
  0%|          | 6/1800 [00:00<00:30, 59.47it/s]

Epoch 3. Train Loss: 0.2343237, Test Loss : 0.32367703, Test Accuracy : 0.8704


100%|██████████| 1800/1800 [00:29<00:00, 61.41it/s]
  0%|          | 6/1800 [00:00<00:30, 59.75it/s]

Epoch 4. Train Loss: 0.20986067, Test Loss : 0.3312329, Test Accuracy : 0.8678


100%|██████████| 1800/1800 [00:29<00:00, 61.46it/s]
  0%|          | 6/1800 [00:00<00:30, 59.57it/s]

Epoch 5. Train Loss: 0.18674502, Test Loss : 0.34211674, Test Accuracy : 0.8672


100%|██████████| 1800/1800 [00:29<00:00, 61.41it/s]
  0%|          | 6/1800 [00:00<00:30, 59.24it/s]

Epoch 6. Train Loss: 0.16510928, Test Loss : 0.36338326, Test Accuracy : 0.86695


100%|██████████| 1800/1800 [00:29<00:00, 61.40it/s]
  0%|          | 6/1800 [00:00<00:30, 59.31it/s]

Epoch 7. Train Loss: 0.14514303, Test Loss : 0.4163906, Test Accuracy : 0.8622


100%|██████████| 1800/1800 [00:29<00:00, 61.16it/s]
  0%|          | 6/1800 [00:00<00:30, 59.31it/s]

Epoch 8. Train Loss: 0.1281185, Test Loss : 0.44837943, Test Accuracy : 0.86255


100%|██████████| 1800/1800 [00:29<00:00, 61.48it/s]


Epoch 9. Train Loss: 0.112834066, Test Loss : 0.51143116, Test Accuracy : 0.8595


## TODO 

- 테스트 정확도를 87% 이상 올려본다.(Optimizer, RNN, Convolution, 데이터 전처리 방식 변경(명사만 사용?), ...) 
- 학습된 임베딩 레이어를 기반으로 단어간의 유사도를 구해본다. 
- 토큰이 아닌 char 기반으로 학습하면 어떨까? 성능이 좋아지나? 


## 명사 기준으로 전처리 변경1 -> 74%
    # nouns = mecab.nouns(str(comment).strip()) 
    # return (length_clip(nouns), label)
    
    # 품사 기준으로 전처리 변경2 -> TypeError: '<' not supported between instances of 'str' and 'tuple' 
    #comment = [REPLACE_WITH_SPACE.sub(" ", line) for line in comment]
    #pos = mecab.pos(str(comment).strip()) 
    #return (length_clip(pos), label)

## optimizer sgd로 변경1 -> 81%
    # trainer = gluon.Trainer(model.collect_params(), 'sgd', {'learning_rate': .1})
    
    # optimizer sgd로 변경2 -> 50%?????
    # args_lr = 1.0
    # trainer = gluon.Trainer(model.collect_params(),'sgd', {'learning_rate': args_lr, 'momentum': 0, 'wd': 0})

## fasttext add -> 84% 그대로
# Same fastText source as before, but created with load_ngrams=True, then
# re-attached to the vocabulary.
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko', load_ngrams=True)
vocab.set_embedding(fasttext_simple)

## Sentiment Analysis (SA) with pre-trained Language Model (LM) -> 87%
[Epoch 0] train avg loss 0.001483, test acc 0.87, test avg loss 0.313094, throughput 9.40K wps

## Sentiment Analysis (SA) with pre-trained Language Model (LM)_ko -> 55%
[Epoch 0] train avg loss 0.022161, test acc 0.55, test avg loss 0.660891, throughput 4.59K wps

## Sentiment Analysis (SA) with pre-trained Language Model (LM)_ko_sejong_dataset -> 81%
[Epoch 0] train avg loss 0.013356, test acc 0.81, test avg loss 0.367918, throughput 4.44K wps


# exp1. vocab을 sejong_dataset으로 변경해서 진행 -> 82%

In [28]:
#sejong_dataset = nlp.data.dataset.CorpusDataset('KoWordSpacing/input.txt', tokenizer=lambda x:mecab.morphs(x.strip()))
#counter = nlp.data.count_tokens(itertools.chain.from_iterable(sejong_dataset))
#vocab = nlp.Vocab(counter, unknown_token='<unk>', padding_token=None, bos_token=None, eos_token=None, min_freq=15)

# exp2. Attaching word embeddings -> 83%


In [29]:
#fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko', load_ngrams=True)
#vocab.set_embedding(fasttext_simple)


# exp3. Creating Vocabulary from Pre-trained Word Embeddings -> 55%


In [30]:
#glove_6b50d = nlp.embedding.create('glove', source='glove.6B.50d')
#vocab = nlp.Vocab(nlp.data.Counter(glove_6b50d.idx_to_token))
#vocab.set_embedding(glove_6b50d)

# exp4. scikit-learn


In [2]:
# keep_default_na=False keeps empty/NA-like review text as strings rather than NaN.
df = pd.read_csv("ratings.txt", sep='\t', keep_default_na=False)

In [3]:
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1


In [4]:
# as_matrix() is deprecated and was removed in pandas 1.0; .values returns the
# same NumPy array on both old and new pandas versions.
text = df['document'].values
y = df['label'].values

  """Entry point for launching an IPython kernel.
  


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; random_state fixes the split.
text_train, text_test, y_train, y_test = train_test_split(text, y, test_size=0.33, random_state=42)

In [7]:
from konlpy.tag import Twitter
# Shared Twitter tagger instance used by twitter_tokenizer.
twitter_tag = Twitter()

In [8]:
def twitter_tokenizer(text):
    """Split ``text`` into morphemes with the shared Twitter tagger."""
    morphemes = twitter_tag.morphs(text)
    return morphemes

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [12]:
def mecab_tokenizer(text):
    """Split ``text`` into morphemes with the shared Mecab analyzer."""
    morphemes = mecab.morphs(text)
    return morphemes

In [None]:
# Search space: TF-IDF min_df and n-gram range, plus logistic-regression C.
mecab_param_grid = {
    'tfidfvectorizer__min_df': [3, 5, 7],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'logisticregression__C': [0.1, 1, 10, 100],
}
# TF-IDF features from Mecab morphemes feeding a logistic-regression classifier.
mecab_pipe = make_pipeline(
    TfidfVectorizer(tokenizer=mecab_tokenizer),
    LogisticRegression(),
)
# n_jobs=-1 runs the search on all available cores.
mecab_grid = GridSearchCV(mecab_pipe, mecab_param_grid, n_jobs=-1)

mecab_grid.fit(text_train, y_train)
print("최상의 교차 검증 점수 {:.3f}".format(mecab_grid.best_score_))
print("최적의 교차 검증 매개변수 ", mecab_grid.best_params_)

In [None]:
# Score the best Mecab pipeline on the held-out set, one pipeline step at a time.
best_mecab_steps = mecab_grid.best_estimator_.named_steps
X_test_mecab = best_mecab_steps['tfidfvectorizer'].transform(text_test)
score = best_mecab_steps['logisticregression'].score(X_test_mecab, y_test)
print("테스트 세트 점수: :{:.3f}".format(score))

In [None]:
# Search space: TF-IDF min_df and n-gram range, plus logistic-regression C.
twit_param_grid = {
    'tfidfvectorizer__min_df': [3, 5, 7],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'logisticregression__C': [0.1, 1, 10],
}
# TF-IDF features from Twitter-tagger morphemes feeding logistic regression.
twit_pipe = make_pipeline(
    TfidfVectorizer(tokenizer=twitter_tokenizer),
    LogisticRegression(),
)
twit_grid = GridSearchCV(twit_pipe, twit_param_grid, n_jobs=-1)

# Only the first 1000 training samples are used for this search.
twit_grid.fit(text_train[:1000], y_train[:1000])
print("최상의 교차 검증 점수: {:.3f}".format(twit_grid.best_score_))
print("최적의 교차 검증 매개변수 ", twit_grid.best_params_)

In [None]:
# Score the best Twitter-tokenizer pipeline on the held-out set. This uses
# twit_grid: no object named `grid` is defined in the visible notebook, so the
# previous reference would raise NameError.
best_twit_steps = twit_grid.best_estimator_.named_steps
X_test_knolpy = best_twit_steps['tfidfvectorizer'].transform(text_test)
score = best_twit_steps['logisticregression'].score(X_test_knolpy, y_test)
print("테스트 세트 점수 : {:.3f}".format(score))
