In [1]:
import requests, os
import pandas as pd


base_path = os.path.expanduser('~/dataset/naver_movie_review')
if not os.path.isdir(base_path):
    os.mkdir(base_path)

with open(os.path.join(base_path, 'train.txt'), 'wb') as f:
    f.write(requests.get('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt').content)
with open(os.path.join(base_path, 'test.txt'), 'wb') as f:
    f.write(requests.get('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt').content)
    
train_df = pd.read_table(os.path.join(base_path, 'train.txt'))
test_df = pd.read_table(os.path.join(base_path, 'test.txt'))
# train_df['document'] = train_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")

In [2]:
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [3]:
print(len(train_df))
print(len(test_df))

150000
50000


In [4]:
# define field
import re
from torchtext import data
from konlpy.tag import Mecab


stopwords = {'의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다'}
tokenizer = Mecab()

def preprocessing(sentence):
    sentence = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', sentence)
    sentence = tokenizer.morphs(sentence)
    sentence = [word for word in sentence if word not in stopwords]
    return sentence


ID = data.Field(sequential=False, use_vocab=False)
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=preprocessing, lower=True, batch_first=True, fix_length=20)
LABEL = data.Field(sequential=False, use_vocab=False, is_target=True)

In [5]:
# make Dataset
from torchtext.data import TabularDataset
import time

start = time.time()
train_data, test_data = TabularDataset.splits(
    path=base_path, train='train.txt', test='test.txt', format='tsv',
    fields=[('id', ID), ('text', TEXT), ('label', LABEL)], skip_header=True)
print(time.time() - start)
print(len(train_data), len(test_data))

26.6509051322937
150000 50000


In [6]:
vars(train_data[0])

{'id': '9976970',
 'text': ['아', '더', '빙', '진짜', '짜증', '나', '네요', '목소리'],
 'label': '0'}

In [7]:
# make Vocabulary
TEXT.build_vocab(train_data, min_freq=10, max_size=10000)
print(len(TEXT.vocab))

9626


In [11]:
print(TEXT.vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f2c4471b128>>, {'<unk>': 0, '<pad>': 1, '영화': 2, '다': 3, '고': 4, '하': 5, '을': 6, '보': 7, '게': 8, '지': 9, '있': 10, '없': 11, '좋': 12, '나': 13, '었': 14, '만': 15, '는데': 16, '너무': 17, '봤': 18, '적': 19, '안': 20, '정말': 21, '로': 22, '음': 23, '것': 24, '재밌': 25, '네요': 26, '아': 27, '어': 28, '지만': 29, '같': 30, '진짜': 31, '에서': 32, '했': 33, '기': 34, '네': 35, '점': 36, '않': 37, '거': 38, '았': 39, '수': 40, '되': 41, '면': 42, 'ㅋㅋ': 43, '인': 44, '말': 45, '연기': 46, '최고': 47, '주': 48, '내': 49, '평점': 50, '이런': 51, '던': 52, '어요': 53, '할': 54, '왜': 55, '겠': 56, '스토리': 57, 'ㅋㅋㅋ': 58, '습니다': 59, '해': 60, '듯': 61, '아니': 62, '드라마': 63, '생각': 64, '더': 65, '그': 66, '싶': 67, '사람': 68, '감동': 69, '때': 70, '함': 71, '배우': 72, '본': 73, '까지': 74, '뭐': 75, '볼': 76, '알': 77, '만들': 78, '내용': 79, '보다': 80, '감독': 81, '라': 82, '재미': 83, '그냥': 84, '시간': 85, '재미있': 86, '지루': 87, '중': 88, '잼': 89, '재미없': 90, '였': 91, '년': 92, '쓰레기': 93, '사랑': 9

In [14]:
# make Dataloader
from torchtext.data import Iterator


batch_size = 5
train_loader = Iterator(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = Iterator(dataset=test_data, batch_size=batch_size, shuffle=False)
print(len(train_loader))
print(len(test_loader))

30000
10000


In [18]:
batch = next(iter(train_loader))  # first mini-batch
print(batch.text.size())
print(batch.text)  # batch_size x fix_length
print(batch.label)

torch.Size([5, 20])
tensor([[  21, 3886, 1546,  117,  753,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [4103, 3523,  115, 2533, 2220,  185,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [ 205,  419,  109,   18,  505,   43,   43, 1723,  111,   21,  165,   33,
           97,  266,  267,    3,   43,   43, 1723,    1],
        [ 288,  188, 2231,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1],
        [1813,   73,    2,   88,  366,   83,   11,   14,    3, 1397,  226,   11,
            4,  414,  273,    2,   13,  337,   27, 1382]])
tensor([1, 0, 0, 1, 0])
