In [15]:
import re

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing import sequence

In [2]:
# Directory that holds the Kaggle "word2vec-nlp-tutorial" competition files.
default_path = '~/.kaggle/competitions/word2vec-nlp-tutorial/'

# Load the labelled training set (tab-separated, column names in the first row).
# quoting=3 is csv.QUOTE_NONE: double quotes inside reviews are kept as
# literal characters instead of being treated as field delimiters.
train = pd.read_csv(
    default_path + "labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

In [8]:
# Preview the first five rows (columns: id, sentiment, review).
train.head(n=5)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [21]:
# Copy the raw review texts and their sentiment labels out of the DataFrame
# into two plain Python lists that stay aligned by position.
reviews, sentiments = (list(train[column]) for column in ('review', 'sentiment'))

In [4]:
# Number of rows in the review column; reused by the progress messages in
# the cleaning loop. The printed label reads "total number of reviews".
num_reviews = len(train["review"])
print('전체 리뷰 갯수: ', num_reviews)

전체 리뷰 갯수:  25000


In [13]:
def review_to_words(raw_review):
    """Normalise one raw review into a lower-case, space-separated string.

    The review is stripped of HTML markup, every character that is not an
    ASCII letter is turned into a space, and the remaining words are
    lower-cased and re-joined with single spaces. No stop-word filtering
    is performed.

    Args:
        raw_review: Review text, possibly containing HTML tags.

    Returns:
        A string of lower-case ASCII words separated by single spaces.
    """
    # Drop HTML tags and keep only the text content.
    plain_text = BeautifulSoup(raw_review, "html5lib").get_text()

    # Digits, punctuation and any other non-letter become a blank so that
    # they act as word separators in the split below.
    lowered = re.sub("[^a-zA-Z]", " ", plain_text).lower()

    # split() with no argument collapses runs of whitespace, so the joined
    # result has exactly one space between words and none at the ends.
    return " ".join(lowered.split())

In [14]:
# Clean every review in order, reporting progress after each 1000 reviews.
clean_train_reviews = []
for position, raw_text in enumerate(reviews, start=1):
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_train_reviews.append(review_to_words(raw_text))

Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000



In [27]:
# Fit a Keras Tokenizer on the cleaned reviews, then convert every review
# into a list of integer word indices using the fitted vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_reviews)
text_sequences = tokenizer.texts_to_sequences(clean_train_reviews)

# word_index is the tokenizer's word -> integer index mapping; its length is
# the number of distinct words seen (printed label: "total number of words").
word_vocab = tokenizer.word_index
print("전체 단어 갯수: ", len(word_vocab))

전체 단어 갯수:  74218


In [23]:
# Fixed length (in tokens) that every review is padded or truncated to.
MAX_SEQUENCE_LENGTH = 50

# Shorter reviews are zero-padded at the end (padding='post').
# NOTE(review): `truncating` is left at its default, which per the Keras docs
# is 'pre' -- longer reviews would keep their LAST 50 tokens. Confirm this is
# the intended behaviour.
inputs = pad_sequences(
    text_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)

# Sentiment labels as a 1-D array aligned row-by-row with `inputs`.
labels = np.array(sentiments)

print('Shape of input data tensor:', inputs.shape)
print('Shape of label tensor:', labels.shape)

Shape of input data tensor: (25000, 50)
Shape of label tensor: (25000,)


In [37]:
# Output directory and file names for the preprocessed artifacts written
# by the cells below.
FILE_DIR_PATH = './data/'  # joined by plain string concatenation, so keep the trailing slash
INPUT_TRAIN_DATA_FILE_NAME = 'input.npy'  # padded index sequences (`inputs`)
LABEL_TRAIN_DATA_FILE_NAME = 'label.npy'  # sentiment labels (`labels`)
DATA_CONFIGS_FILE_NAME = 'data_configs.json'  # vocabulary and its size

In [33]:
import os

# Create the output directory if needed. exist_ok=True replaces the separate
# exists() check, which could race with another process creating the directory.
os.makedirs(FILE_DIR_PATH, exist_ok=True)

# Persist the padded inputs and the labels in NumPy's .npy format.
# The files are opened in `with` blocks so the handles are flushed and closed
# deterministically instead of being left for the garbage collector.
with open(FILE_DIR_PATH + INPUT_TRAIN_DATA_FILE_NAME, 'wb') as input_file:
    np.save(input_file, inputs)

with open(FILE_DIR_PATH + LABEL_TRAIN_DATA_FILE_NAME, 'wb') as label_file:
    np.save(label_file, labels)

In [34]:
# Metadata that downstream code needs to interpret the saved index sequences:
# the word -> index mapping and the number of distinct words.
# NOTE(review): Keras Tokenizer indices start at 1 (0 is reserved, and is the
# padding value here), so an embedding sized from 'vocab_size' may need +1 --
# confirm in the consuming code.
data_prepro_configs = {}
data_prepro_configs['vocab'] = word_vocab
data_prepro_configs['vocab_size'] = len(word_vocab)

In [39]:
import json

# Write the preprocessing metadata next to the saved arrays as JSON.
config_path = FILE_DIR_PATH + DATA_CONFIGS_FILE_NAME
with open(config_path, 'w') as config_file:
    json.dump(data_prepro_configs, config_file)