In [2]:
# 데이터를 전처리 하기 위하여, 아래에서 사용할 라이브러리는 다음과 같다.

import re # 정규표현식
from bs4 import BeautifulSoup # 
from nltk.corpus import stopwords

import numpy as np
import pandas as pd

from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing import sequence

In [4]:
#필요한 데이터를 Pandas 라이브러리로 불러온다.
DEFAULT_PATH ='~/.kaggle/competitions/word2vec-nlp-tutorial/'
train = pd.read_csv(DEFAULT_PATH+"labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [7]:
#데이터의 구조는 "id", "sentiment", "review"로 구분되어 있으며, "review"를 보고 "sentiment"를 판단하는 문장

train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [19]:
#raw text는 다음과 같은 구조로 되어 있다. 내부에 html과 관련된 특수기호 및 문자 (html 태그) ]들이 들어가 있다.
train.review[1]

'"\\"The Classic War of the Worlds\\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \\"critics\\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \\"critics\\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \\"critics\\" perceive to be its shortcomings."'

In [20]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
example_sent = "This is a sample sentence, showing off the stop words filtration."
 
stop_words = set(stopwords.words('english'))
 
word_tokens = word_tokenize(example_sent)
 
filtered_sentence = [w for w in word_tokens if not w in stop_words]
 
filtered_sentence = []
 
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
 
print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [21]:
reviews = list(train['review'])
sentiments = list(train['sentiment'])

num_reviews = train["review"].size
print('전체 리뷰 갯수: ', num_reviews)

전체 리뷰 갯수:  25000


In [59]:
%%time

def review_to_words( raw_review, stop_words = False ):
    # 1. HTML 태그 지우기
    reviews_text = list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text(), reviews))
    
    # 2. 영어가 아닌 특수문자들을 공백(" ")으로 바꾸기
    reviews_text = list(map(lambda x: re.sub("[^a-zA-Z]"," ", x), reviews_text))
    
    # 3. 대문자들을 소문자로 바꾸고 공백단위로 텍스트들을 나누기
    words = list(map(lambda x: x.lower().split(), reviews_text))
    
    if stop_words:
        # 4. 불용어들을 제거 (Optional)
        set_of_stopwords = set(stopwords.words("english")) #영어에 관련된 불용어 불러오기
        filtered_sentence = list(map(lambda x: [w for w in x if not w in set_of_stopwords], words))
    
        # 5. 불용어가 아닌 텍스트를 공백 단위로 나누어서 리턴    
        clean_review = list(map(lambda x: ' '.join(x), filtered_sentence))
        
    else:
        clean_review = list(map(lambda x: ' '.join(x), words))
    
    return clean_review

clean_test_reviews = review_to_words(list(train['review'].values), stop_words = True)

CPU times: user 8.69 s, sys: 219 ms, total: 8.91 s
Wall time: 8.94 s


In [57]:
clean_train_reviews[0]

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

In [41]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_reviews)
text_sequences = tokenizer.texts_to_sequences(clean_train_reviews)

word_vocab = tokenizer.word_index
print("전체 단어 갯수: ", len(word_vocab))

전체 단어 갯수:  74065


In [42]:
word_vocab

{'movie': 1,
 'film': 2,
 'one': 3,
 'like': 4,
 'good': 5,
 'time': 6,
 'even': 7,
 'would': 8,
 'story': 9,
 'really': 10,
 'see': 11,
 'well': 12,
 'much': 13,
 'get': 14,
 'bad': 15,
 'people': 16,
 'also': 17,
 'first': 18,
 'great': 19,
 'made': 20,
 'way': 21,
 'make': 22,
 'could': 23,
 'movies': 24,
 'think': 25,
 'characters': 26,
 'character': 27,
 'watch': 28,
 'two': 29,
 'films': 30,
 'seen': 31,
 'many': 32,
 'life': 33,
 'plot': 34,
 'acting': 35,
 'never': 36,
 'love': 37,
 'little': 38,
 'best': 39,
 'show': 40,
 'know': 41,
 'ever': 42,
 'man': 43,
 'better': 44,
 'end': 45,
 'still': 46,
 'say': 47,
 'scene': 48,
 'scenes': 49,
 'go': 50,
 'something': 51,
 'back': 52,
 'real': 53,
 'watching': 54,
 'though': 55,
 'thing': 56,
 'old': 57,
 'years': 58,
 'actors': 59,
 'director': 60,
 'work': 61,
 'another': 62,
 'new': 63,
 'nothing': 64,
 'funny': 65,
 'actually': 66,
 'makes': 67,
 'look': 68,
 'find': 69,
 'going': 70,
 'part': 71,
 'lot': 72,
 'every': 73,
 'wo

In [46]:
MAX_SEQUENCE_LENGTH = 50

inputs = pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
labels = np.array(sentiments)

print('Shape of input data tensor:', inputs.shape)
print('Shape of label tensor:', labels.shape)

Shape of input data tensor: (25000, 50)
Shape of label tensor: (25000,)


In [47]:
FILE_DIR_PATH = './data/'
INPUT_TRAIN_DATA_FILE_NAME = 'train_input.npy'
LABEL_TRAIN_DATA_FILE_NAME = 'train_label.npy'
DATA_CONFIGS_FILE_NAME = 'data_configs.json'

In [48]:
import os

if not os.path.exists(FILE_DIR_PATH):
    os.makedirs(FILE_DIR_PATH)

np.save(open(FILE_DIR_PATH + INPUT_TRAIN_DATA_FILE_NAME, 'wb'), inputs)
np.save(open(FILE_DIR_PATH + LABEL_TRAIN_DATA_FILE_NAME, 'wb'), labels)

In [49]:
data_prepro_configs = {'vocab': word_vocab, 'vocab_size': len(word_vocab)}

In [50]:
import json

with open(FILE_DIR_PATH + DATA_CONFIGS_FILE_NAME, 'w') as f:
    json.dump(data_prepro_configs, f)

In [37]:
%%time

def clean_reviews(reviews, remove_stopwords=False, stem=False):
    """
    to clean review strings
    review: a list of review strings
    remove_stop_words: whether to remove stop words
    output: a list of clean reviews
    """
    # 1. Remove HTML
    reviews_text = list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text(), reviews))
    #
    # 2. Remove non-letters
    reviews_text = list(map(lambda x: re.sub("[^a-zA-Z]"," ", x), reviews_text))
    #
    # 3. Convert words to lower case and split them
    words = list(map(lambda x: x.lower().split(), reviews_text))
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        set_of_stopwords = set(stopwords.words("english"))
        meaningful_words = list(map(lambda x: [w for w in x if not w in set_of_stopwords], words))
    
    # 5. Optionally stem the words
    if stem:
        porter_stemmer = PorterStemmer()
        wordnet_lemmatizer = WordNetLemmatizer()
        stemmed_words = list(map(lambda x: [porter_stemmer.stem(w) for w in x], meaningful_words))
        stemmed_words = list(map(lambda x:[wordnet_lemmatizer.lemmatize(w) for w in x], stemmed_words))
    
        # 6. Join the words to a single string
        clean_review = map(lambda x: ' '.join(x), stemmed_words)
    else:
        clean_review = list(map(lambda x: ' '.join(x), meaningful_words))
    
    return clean_review

clean_test_reviews = clean_reviews(list(train['review'].values), remove_stopwords=True)

%%time

def review_to_words( raw_review ):
    # 1. HTML 태그 지우기
    review_text = BeautifulSoup(raw_review, "html5lib").get_text()
    
    # 2. 영어가 아닌 특수문자들을 공백(" ")으로 바꾸기
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    
    # 3. 대문자들을 소문자로 바꾸고 공백단위로 텍스트들을 나누기
    words = letters_only.lower().split()
    
    # 4. 불용어들을 제거
    filtered_sentence = [w for w in words if not w in stop_words]
    
    # 5. 불용어가 아닌 텍스트를 공백 단위로 나누어서 리턴
    return( " ".join( filtered_sentence ))

clean_train_reviews = list()
for i, r in enumerate(reviews):
    if((i+1)%1000 == 0):
        print ("Review %d of %d\n" % (i+1, num_reviews))
    clean_train_reviews.append(review_to_words(r))

Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000

CPU times: user 46.6 s, sys: 187 ms, total: 46.8 s
Wall time: 47 s
