In [27]:
# 데이터를 전처리 하기 위하여, 아래에서 사용할 라이브러리는 다음과 같다.

import re # 정규표현식
from bs4 import BeautifulSoup # 
from nltk.corpus import stopwords
import nltk

import numpy as np
import pandas as pd

from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing import sequence

In [28]:
# Load the competition data with pandas.
# Both files are tab-separated with a header row; quoting=3 (csv.QUOTE_NONE)
# keeps the double quotes embedded in the reviews as literal characters.
DEFAULT_PATH = '~/.kaggle/competitions/word2vec-nlp-tutorial/'

train = pd.read_csv(
    DEFAULT_PATH + "labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)
test = pd.read_csv(
    DEFAULT_PATH + "testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [29]:
# The training data has the columns "id", "sentiment" and "review"; the task
# is to predict "sentiment" from the text in "review".
# NOTE: only the last expression of a cell is rendered, so this preview is
# not shown unless it is moved to its own cell.
train.head()
# The raw text still contains HTML-related markup and special characters
# (e.g. HTML tags and escaped quotes), as the sample below shows.
train["review"][1]

'"\\"The Classic War of the Worlds\\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \\"critics\\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \\"critics\\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \\"critics\\" perceive to be its shortcomings."'

In [30]:
# Cache for the NLTK English stop-word set. It is filled on first use so the
# stop-word list is loaded once instead of once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocessing(review, remove_stopwords=False):
    """Clean one raw review into a lowercase, space-separated string.

    Steps: strip HTML markup, replace every non-ASCII-letter character with a
    space, lowercase, split on whitespace, optionally drop English stop words,
    and join the remaining words with single spaces.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When True, words found in the NLTK English
            stop-word list are removed. Defaults to False.

    Returns:
        The cleaned review as a single string (empty if no letters remain).
    """
    # 1. Remove HTML tags and keep only the text content.
    review_text = BeautifulSoup(review, "html5lib").get_text()

    # 2. Replace everything that is not an English letter with a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Lowercase the text and split it on whitespace into a word list.
    words = review_text.lower().split()

    # 4. Optionally remove stop words (the set is cached across calls).
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    # 5. Join the word list back into one space-separated string.
    return ' '.join(words)

In [31]:
# Preprocess every training review, removing stop words.
clean_train_reviews = [
    preprocessing(raw_review, remove_stopwords=True)
    for raw_review in train['review']
]

# Show the first preprocessed training review.
clean_train_reviews[0]

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

In [32]:
# Preprocess every test review, removing stop words.
clean_test_reviews = [
    preprocessing(raw_review, remove_stopwords=True)
    for raw_review in test['review']
]

# Show the first preprocessed test review.
clean_test_reviews[0]

'naturally film main themes mortality nostalgia loss innocence perhaps surprising rated highly older viewers younger ones however craftsmanship completeness film anyone enjoy pace steady constant characters full engaging relationships interactions natural showing need floods tears show emotion screams show fear shouting show dispute violence show anger naturally joyce short story lends film ready made structure perfect polished diamond small changes huston makes inclusion poem fit neatly truly masterpiece tact subtlety overwhelming beauty'

In [33]:
# Number of reviews in the training set.
num_reviews = train["review"].size

# Raw review texts and their sentiment labels as plain Python lists.
reviews, sentiments = list(train['review']), list(train['sentiment'])

print('전체 리뷰 갯수: ', num_reviews)

전체 리뷰 갯수:  25000


In [34]:
# Fit a single tokenizer on the cleaned train and test reviews together so
# both sets share one word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_reviews + clean_test_reviews)

# Word -> integer index mapping learned by the tokenizer.
word_vocab = tokenizer.word_index
print("전체 단어 갯수: ", len(word_vocab))

# Convert each cleaned review into its sequence of word indices.
train_text_sequences = tokenizer.texts_to_sequences(clean_train_reviews)  # train
test_text_sequences = tokenizer.texts_to_sequences(clean_test_reviews)  # test

전체 단어 갯수:  101246


In [35]:
# Sanity check: number of tokenized test reviews.
len(test_text_sequences)

25000

In [36]:
# Fixed length that every index sequence is padded or cut to.
MAX_SEQUENCE_LENGTH = 50

# Sentiment labels of the training reviews as a NumPy array.
train_labels = np.array(train['sentiment'])

# padding='post' puts the padding at the end of sequences shorter than
# MAX_SEQUENCE_LENGTH; longer sequences are cut using pad_sequences' default
# truncation mode.
train_inputs = pad_sequences(
    train_text_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)
test_inputs = pad_sequences(
    test_text_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)

print('Shape of input train data tensor:', train_inputs.shape)
print('Shape of label train tensor:', train_labels.shape)
print('Shape of input test data tensor:', test_inputs.shape)

Shape of input train data tensor: (25000, 50)
Shape of label train tensor: (25000,)
Shape of input test data tensor: (25000, 50)


In [37]:
# Directory that receives all preprocessed artifacts.
FILE_DIR_PATH = "./data/"

# NumPy files for the padded inputs and the training labels.
INPUT_TRAIN_DATA_FILE_NAME = "train_input.npy"
LABEL_TRAIN_DATA_FILE_NAME = "train_label.npy"
INPUT_TEST_DATA_FILE_NAME = "test_input.npy"

# JSON file for the vocabulary and related configuration values.
DATA_CONFIGS_FILE_NAME = "data_configs.json"

In [38]:
import json
import os

# Create the output directory if needed; exist_ok keeps re-runs of this cell safe.
os.makedirs(FILE_DIR_PATH, exist_ok=True)

# Hand np.save the target path so NumPy opens and closes each file itself
# (the previous open(...) handles were never closed). The file names already
# end in ".npy", so no extension is appended.
np.save(FILE_DIR_PATH + INPUT_TRAIN_DATA_FILE_NAME, train_inputs)
np.save(FILE_DIR_PATH + LABEL_TRAIN_DATA_FILE_NAME, train_labels)
np.save(FILE_DIR_PATH + INPUT_TEST_DATA_FILE_NAME, test_inputs)

# Copy the vocabulary before adding the extra key. Assigning word_vocab
# directly would alias tokenizer.word_index, so 'vocab_size' would leak into
# the tokenizer's vocabulary and the stored size would grow on every re-run.
data_configs = dict(word_vocab)
data_configs['vocab_size'] = len(word_vocab)  # add the vocabulary size

with open(FILE_DIR_PATH + DATA_CONFIGS_FILE_NAME, 'w') as f:
    json.dump(data_configs, f)