<a href="https://colab.research.google.com/github/moonjune/test-repo/blob/master/knlp_korean_preproc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Clone the e9t/nsmc repository; later cells read ratings_train.txt and
# ratings_test.txt from /content/nsmc/ (assumes the clone runs in /content — confirm).
!git clone https://github.com/e9t/nsmc.git

In [0]:
# Move into the cloned dataset directory and list its contents.
import os 
os.chdir('/content/nsmc')
!ls
# Open Colab's file-upload dialog.
# NOTE(review): presumably used to upload NanumGothic.ttf, which the word-cloud
# cell loads from this directory — confirm.
from google.colab import files
files.upload()

In [0]:
os.chdir('/content/')
!git clone https://github.com/NLP-kr/tensorflow-ml-nlp.git
import os
os.chdir('/content/tensorflow-ml-nlp')
!pip install -r requirements.txt

In [0]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
%matplotlib inline

# Work from the dataset directory; relative paths used later in the notebook
# (e.g. the estimator's model_dir) resolve against it.
os.chdir('/content/nsmc')

In [0]:
# Print the size, in MB (10^6 bytes), of every directory entry whose name
# contains 'txt'.
DATA_IN_PATH = '/content/nsmc/'
print("파일크기: ")
txt_files = [name for name in os.listdir(DATA_IN_PATH) if 'txt' in name]
for file in txt_files:
    size_mb = round(os.path.getsize(DATA_IN_PATH + file) / 1000000, 2)
    print('{}{}MB'.format(file.ljust(30), size_mb))

In [0]:
# Load the training split: tab-separated, header on the first row, and
# quoting = 3 (csv.QUOTE_NONE) so quote characters stay literal text.
train_path = DATA_IN_PATH + 'ratings_train.txt'
train_data = pd.read_csv(train_path, sep = '\t', header = 0, quoting = 3)
train_data.head()

In [0]:
# Report how many rows the training frame holds.
num_train_rows = train_data.shape[0]
print('전체 학습 데이터의 개수: {}'.format(num_train_rows))

In [0]:
# Character length of every review. Casting to str first means non-string
# entries are measured by the length of their text form instead of raising.
documents_as_text = train_data['document'].astype(str)
train_length = documents_as_text.map(len)
train_length.head()

In [0]:
# Histogram of review lengths (characters) with a logarithmic count axis.
plt.figure(figsize = (12,5))
plt.hist(train_length, bins = 200, alpha = 0.5, color = 'r', label = 'word')
# No `nonposy` keyword: it is rejected by recent Matplotlib releases, and
# clipping non-positive values is the log scale's default behaviour anyway.
plt.yscale('log')
plt.title('Log-Histogram of length of review')
plt.xlabel('Length of review')
plt.ylabel('Number of review')

In [0]:
# Collect only the entries that are exactly of type str; anything else
# (such as missing values) is skipped.
train_review = []
for review in train_data['document']:
    if type(review) is str:
        train_review.append(review)

In [0]:
# Word cloud over all training reviews, rendered with the NanumGothic font
# file expected in the data directory.
font_file = DATA_IN_PATH + 'NanumGothic.ttf'
all_reviews_text = ' '.join(train_review)
wordcloud = WordCloud(font_path = font_file).generate(all_reviews_text)

plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.show()

In [0]:
# Bar chart of how many reviews carry each label.
fig, axe = plt.subplots(ncols = 1)
fig.set_size_inches(6, 3)
# Pass the column as `x` by keyword (seaborn no longer accepts it positionally)
# and draw on the axes created above rather than on the implicit current axes.
sns.countplot(x = train_data['label'], ax = axe)

In [0]:
# Number of space-separated tokens in each review (non-strings are cast to str first).
train_word_counts = train_data['document'].astype('str').apply(lambda x: len(x.split(' ')))

plt.figure(figsize = (15, 10))
plt.hist(train_word_counts, bins = 50, facecolor = 'r', label = 'train')
plt.title('Log-Hist', fontsize = 15)
# No `nonposy` keyword: it is rejected by recent Matplotlib releases, and
# clipping non-positive values is the log scale's default behaviour anyway.
plt.yscale('log')
plt.legend()
# Axis labels are drawn in white.
# NOTE(review): presumably intended for a dark notebook theme — confirm.
plt.xlabel('Num_word', color = 'w')
plt.ylabel('Num_review', color = 'w')

In [0]:
# 데이터 전처리

import numpy as np
import pandas as pd
import re
import json
from konlpy.tag import Okt
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

# Read the raw training file again for the preprocessing section
# (tab-separated, header row, quoting disabled).
DATA_IN_PATH = '/content/nsmc/'

train_path = DATA_IN_PATH + 'ratings_train.txt'
train_data = pd.read_csv(train_path, sep = '\t', header = 0, quoting = 3)

In [0]:
# Show the first five raw reviews.
train_data['document'].head(5)

In [0]:
# Strip everything except Hangul and whitespace from the first review.
# The character class lists three separate ranges: complete syllables (가-힣),
# consonant jamo (ㄱ-ㅎ) and vowel jamo (ㅏ-ㅣ). Written as one merged range
# it would also let hyphens and unrelated code points through.
review_text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", train_data['document'][0])
print(review_text)

In [0]:
# Split the cleaned sentence into morphemes with Okt (stemming enabled).
# NOTE: review_text is rebound here from a string to the token list returned by
# morphs(), so re-running this cell alone feeds that list back into morphs().
okt = Okt()
review_text = okt.morphs(review_text, stem=True)
print(review_text)

In [0]:
# Drop tokens that appear in a small hand-written stop-word set.
stop_words = {'은','는','이','가','하','아','것','들','의','있','되','수','보','주','등','한'}
clean_review = list(filter(lambda token: token not in stop_words, review_text))
clean_review

In [0]:
def preprocessing(review, okt, remove_stopwords = False, stop_words = ()):
  """Clean one review and split it into morphemes.

  Args:
    review: Text to preprocess. A value that is not a ``str`` (for example a
      missing entry) produces an empty list instead of raising.
    okt: Object exposing ``morphs(text, stem=...)``. It is passed in so the
      caller can create it once and reuse it for every review.
    remove_stopwords: When true, tokens contained in ``stop_words`` are dropped.
    stop_words: Collection of tokens to drop; it must be supplied by the caller.

  Returns:
    The token list produced by ``okt.morphs``, without stop words when
    ``remove_stopwords`` is set.
  """
  if not isinstance(review, str):
    return []

  # 1. Remove every character that is neither Hangul nor whitespace.
  review_text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", review)

  # 2. Split into morphemes with the supplied tokenizer (stemming enabled).
  word_review = okt.morphs(review_text, stem = True)

  if remove_stopwords:
    # Use a set for constant-time membership tests; reuse the caller's set when given one.
    if isinstance(stop_words, (set, frozenset)):
      blocked = stop_words
    else:
      blocked = set(stop_words)
    word_review = [token for token in word_review if token not in blocked]

  return word_review

In [0]:
stop_words = {'은','는','이','가','하','아','것','들','의','있','되','수','보','주','등','한'}
okt = Okt()

# Tokenise every training review. Rows that are not strings become an empty
# list, so the result keeps one entry per row of train_data.
clean_train_review = [
    preprocessing(review, okt, remove_stopwords = True, stop_words = stop_words)
    if type(review) == str
    else []
    for review in train_data['document']
]

clean_train_review[:4]

In [0]:
# clean_train_review_csv = pd.DataFrame(clean_train_review)
# clean_train_review_csv = clean_train_review_csv.to_csv('/content/nsmc/clean_train_review_csv.csv', encoding='ms949')

In [0]:
# List the files in the current working directory.
!ls

In [0]:
# from google.colab import files
# files.download('/content/nsmc/clean_train_review_csv.csv') 

In [0]:
# Load the test split with the same CSV settings as the training split.
test_path = DATA_IN_PATH + 'ratings_test.txt'
test_data = pd.read_csv(test_path, sep = '\t', header = 0, quoting = 3)

# Tokenise every test review. Rows that are not strings become an empty list,
# so the result keeps one entry per row of test_data.
clean_test_review = [
    preprocessing(review, okt, remove_stopwords = True, stop_words = stop_words)
    if type(review) == str
    else []
    for review in test_data['document']
]

In [0]:
# Show the token lists of the first ten test reviews.
clean_test_review[:10]

In [0]:
# Fit the vocabulary on the training tokens only; both splits are indexed with it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_review)
word_vocab = tokenizer.word_index

# Every review is padded at the end, or truncated, to this many token ids.
MAX_SEQUENCE_LENGTH = 8

# Training split: token ids, fixed-length inputs, labels.
train_sequences = tokenizer.texts_to_sequences(clean_train_review)
train_inputs = pad_sequences(train_sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)
train_labels = np.array(train_data['label'])

# Test split: same steps with the already-fitted tokenizer.
test_sequences = tokenizer.texts_to_sequences(clean_test_review)
test_inputs = pad_sequences(test_sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)
test_labels = np.array(test_data['label'])

In [0]:
# Save the preprocessed arrays and the vocabulary configuration.
DATA_IN_PATH = '/content/nsmc/'
TRAIN_INPUT_DATA = 'nsmc_train_input.npy'
TRAIN_LABEL_DATA = 'nsmc_train_label.npy'
TEST_INPUT_DATA = 'nsmc_test_input.npy'
TEST_LABEL_DATA = 'nsmc_test_label.npy'
DATA_CONFIGS = 'data_configs.json'

data_configs = {}

data_configs['vocab'] = word_vocab
# One more than the number of known words.
# NOTE(review): presumably reserves index 0 for padding — confirm.
data_configs['vocab_size'] = len(word_vocab)+1 

import os
# Create the output directory when it is missing.
os.makedirs(DATA_IN_PATH, exist_ok = True)

# Hand np.save the path itself so NumPy opens and closes each file.
np.save(DATA_IN_PATH + TRAIN_INPUT_DATA, train_inputs)
np.save(DATA_IN_PATH + TRAIN_LABEL_DATA, train_labels)

np.save(DATA_IN_PATH + TEST_INPUT_DATA, test_inputs)
np.save(DATA_IN_PATH + TEST_LABEL_DATA, test_labels)

# The context manager guarantees the JSON file is flushed and closed.
with open(DATA_IN_PATH + DATA_CONFIGS, 'w') as config_file:
  json.dump(data_configs, config_file, ensure_ascii = False)

In [0]:
# cnn 방법을 적용할 예정
import os
from datetime import datetime
import tensorflow as tf
import numpy as np
import json
from sklearn.model_selection import train_test_split

In [0]:
# Locations of the artefacts written by the preprocessing section.
DATA_IN_PATH = '/content/nsmc/'
DATA_OUT_PATH = '/content/nsmc/data_out/'
INPUT_TRAIN_DATA_FILE_NAME = 'nsmc_train_input.npy'
LABEL_TRAIN_DATA_FILE_NAME = 'nsmc_train_label.npy'
DATA_CONFIGS_FILE_NAME = 'data_configs.json'

# Pass paths (not open file objects) so NumPy closes the files itself.
input_data = np.load(DATA_IN_PATH + INPUT_TRAIN_DATA_FILE_NAME)
label_data = np.load(DATA_IN_PATH + LABEL_TRAIN_DATA_FILE_NAME)
# The context manager closes the JSON file once it has been parsed.
with open(DATA_IN_PATH + DATA_CONFIGS_FILE_NAME, 'r') as config_file:
  prepro_configs = json.load(config_file)

In [0]:
# Hyper-parameters for the split and the model.
TEST_SPLIT = 0.1        # fraction of the loaded data held out for evaluation
RNG_SEED = 13371447     # fixed seed for the split
VOCAB_SIZE = prepro_configs['vocab_size']
EMB_SIZE = 128
BATCH_SIZE = 16
NUM_EPOCHS = 1

# train_test_split returns (X_train, X_eval, y_train, y_eval) in that order.
split_parts = train_test_split(
    input_data,
    label_data,
    random_state = RNG_SEED,
    test_size = TEST_SPLIT,
)
input_train, input_eval, label_train, label_eval = split_parts

In [0]:
def mapping_fn(X, Y):
  """Wrap the inputs in a feature dict under the key 'x'; labels pass through unchanged."""
  return {'x': X}, Y

def train_input_fn():
  """Input function for training.

  Slices (input_train, label_train), shuffles with a buffer covering the whole
  set, batches by BATCH_SIZE, maps each batch through mapping_fn and repeats
  for NUM_EPOCHS. Returns the next element of a one-shot iterator.
  """
  pipeline = (
      tf.data.Dataset.from_tensor_slices((input_train, label_train))
      .shuffle(buffer_size = len(input_train))
      .batch(BATCH_SIZE)
      .map(mapping_fn)
      .repeat(count = NUM_EPOCHS)
  )
  return pipeline.make_one_shot_iterator().get_next()

def eval_input_fn():
  """Input function for validation.

  Slices (input_eval, label_eval), shuffles with a buffer covering the whole
  set, batches by 16 and maps each batch through mapping_fn. Returns the next
  element of a one-shot iterator.
  """
  pipeline = (
      tf.data.Dataset.from_tensor_slices((input_eval, label_eval))
      .shuffle(buffer_size = len(input_eval))
      .batch(16)
      .map(mapping_fn)
  )
  return pipeline.make_one_shot_iterator().get_next()

In [0]:
def model_fn(features, labels, mode, params):
    """Estimator model function for a small 1-D CNN binary classifier.

    Network: Embedding(VOCAB_SIZE, EMB_SIZE) -> Dropout(0.2) ->
    Conv1D(32 filters, kernel 3, 'same', ReLU) -> GlobalMaxPool1D ->
    Dense(250, ReLU) -> Dropout(0.2) -> Dense(1) producing one logit.

    Args:
        features: Mapping whose 'x' entry holds the token-id inputs.
        labels: Target labels, or None; when given they are reshaped to [-1, 1].
        mode: One of the tf.estimator.ModeKeys values.
        params: Accepted as part of the model_fn signature; not used here.

    Returns:
        A tf.estimator.EstimatorSpec for the given mode: loss and train_op for
        TRAIN, loss and an 'acc' metric for EVAL, and a 'prob' prediction for
        PREDICT.
    """
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    embedding_layer = tf.keras.layers.Embedding(
                    VOCAB_SIZE,
                    EMB_SIZE)(features['x'])

    # Pass `training` explicitly, as is done for the hidden-layer dropout below,
    # so that dropout is active in TRAIN mode only.
    dropout_emb = tf.keras.layers.Dropout(rate = 0.2)(embedding_layer, training = TRAIN)

    conv = tf.keras.layers.Conv1D(
           filters=32,
           kernel_size=3,
           padding='same',
           activation=tf.nn.relu)(dropout_emb)

    pool = tf.keras.layers.GlobalMaxPool1D()(conv)

    hidden = tf.keras.layers.Dense(units=250, activation=tf.nn.relu)(pool)

    dropout_hidden = tf.keras.layers.Dropout(rate=0.2)(hidden, training = TRAIN)
    logits = tf.keras.layers.Dense(units=1)(dropout_hidden)

    # Give the labels the same [batch, 1] layout as the logits.
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    if TRAIN:
        global_step = tf.train.get_global_step()
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss = loss)

    elif EVAL:
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        pred = tf.nn.sigmoid(logits)
        # Rounding the sigmoid output thresholds the probability at 0.5.
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})

    elif PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'prob': tf.nn.sigmoid(logits),
            }
        )

In [0]:
# Build the checkpoint directory from DATA_OUT_PATH so its location does not
# depend on the kernel's current working directory.
est = tf.estimator.Estimator(model_fn, model_dir = DATA_OUT_PATH + 'checkpoint/cnn_model')

In [0]:
# Train the estimator and report wall-clock start, end and duration (UTC).
separator = "......................................."

time_start = datetime.utcnow()
started_at = time_start.strftime("%H:%M:%S")
print("Experiment started at {}".format(started_at))
print(separator)

est.train(train_input_fn)

time_end = datetime.utcnow()
finished_at = time_end.strftime("%H:%M:%S")
print(separator)
print("Experiment finished at {}".format(finished_at))
print("")

time_elapsed = time_end - time_start
elapsed_seconds = time_elapsed.total_seconds()
print("Experiment elapsed time: {} seconds".format(elapsed_seconds))

In [0]:
# Evaluate the trained estimator on the held-out validation split.
valid = est.evaluate(eval_input_fn)

In [0]:
# Load the preprocessed test arrays written by the preprocessing section.
INPUT_TEST_DATA = 'nsmc_test_input.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'

# Pass paths (not open file objects) so NumPy closes the files itself.
test_input_data = np.load(DATA_IN_PATH + INPUT_TEST_DATA)
test_label_data = np.load(DATA_IN_PATH + LABEL_TEST_DATA)

In [0]:
def test_input_fn():
    """Input function for the test split.

    Slices (test_input_data, test_label_data), batches by 16 and maps each
    batch through mapping_fn. Returns the next element of a one-shot iterator.
    """
    pipeline = (
        tf.data.Dataset.from_tensor_slices((test_input_data, test_label_data))
        .batch(16)
        .map(mapping_fn)
    )
    return pipeline.make_one_shot_iterator().get_next()

In [0]:
# Evaluate on the test split. Despite its name, `predict` holds the result of
# est.evaluate (evaluation metrics), not per-example predictions.
predict = est.evaluate(test_input_fn)

In [0]:
# Inspect the type of the object returned by est.evaluate.
type(predict)

In [0]:
# Display the test-set evaluation result.
predict