In [1]:
import requests, os, re, random
import pandas as pd
import torch
from torchtext import data
from torchtext.data import TabularDataset, BucketIterator
from konlpy.tag import Mecab

path_base = os.path.expanduser('~/dataset/naver_movie_review')
if not os.path.isdir(path_base):
    os.makedirs(path_base)
data_ls = os.listdir(path_base)
stopwords = {'의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다'}
tokenizer = Mecab()
SEED = 5
random.seed(SEED)
torch.manual_seed(SEED)
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

In [2]:
def preprocessing(sentence):
    # sentence = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', sentence)
    sentence = tokenizer.morphs(sentence)
    # sentence = [word for word in sentence if word not in stopwords]
    return sentence

In [4]:
# define Field
ID = data.Field(sequential=False, use_vocab=False)
# fix_length of TEXT is not necessary: refer to BucketIterator
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=preprocessing, lower=True, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False, is_target=True, batch_first=True)
fields = {
    'ID': ID,
    'TEXT': TEXT,
    'LABEL': LABEL
}

# make Dataset
trainset, testset = TabularDataset.splits(path=path_base, train='train_pre.csv', test='test_pre.csv',
                                          format='csv', fields=[('id', ID), ('text', TEXT), ('label', LABEL)],
                                          skip_header=True)
trainset, valset = trainset.split(split_ratio=0.8)
print(f'{"="*10} Loading data succeeded {"="*10}')
print(f'# train: {len(trainset)}\n# val: {len(valset)}\n# test:{len(testset)}')



# train: 119996
# val: 29999
# test:49997


NameError: name 'min_req' is not defined

In [43]:
# make Vocab
TEXT.build_vocab(trainset, min_freq=10, max_size=10000)
# TEXT.build_vocab(valset, min_freq=min_req, max_size=max_size)
print(f'# vocab: {len(TEXT.vocab)}')

# vocab: 8372


In [10]:
vars(trainset[0])

{'id': '5947113', 'text': ['평점', '조절', '위원회'], 'label': '0'}

In [46]:
vars(valset[0])

{'id': '2920987',
 'text': ['인생', '자신', '선택', '의해', '결정', '된다', '많이', '감동', '받', '음'],
 'label': '1'}

In [34]:
vars(testset[0])

{'id': '6270596', 'text': ['굳', 'ㅋ'], 'label': '1'}

In [18]:
TEXT.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f7c41575860>>,
            {'<unk>': 0,
             '<pad>': 1,
             '영화': 2,
             '다': 3,
             '고': 4,
             '하': 5,
             '을': 6,
             '보': 7,
             '게': 8,
             '지': 9,
             '있': 10,
             '없': 11,
             '좋': 12,
             '나': 13,
             '었': 14,
             '만': 15,
             '는데': 16,
             '너무': 17,
             '봤': 18,
             '적': 19,
             '안': 20,
             '정말': 21,
             '로': 22,
             '음': 23,
             '것': 24,
             '재밌': 25,
             '네요': 26,
             '아': 27,
             '어': 28,
             '지만': 29,
             '같': 30,
             '진짜': 31,
             '에서': 32,
             '기': 33,
             '했': 34,
             '네': 35,
             '점': 36,
             '않': 37,
             '았': 38,
             '거': 39,
        

In [36]:
# make Dataloader
train_loader, val_loader, test_loader = BucketIterator.splits((trainset, valset, testset), batch_sizes=(64, 64 , 64),
                                                              sort_key=lambda x: len(x.text), shuffle=True, repeat=False)
print(f'# train_batch: {len(train_loader)}\n# val_batch: {len(val_loader)}\n# test_batch:{len(test_loader)}')

# train_batch: 1875
# val_batch: 469
# test_batch:782


In [44]:
itr_train = iter(train_loader)
d1 = next(itr_train)
print(d1)
print(d1.text)


[torchtext.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.text]:[torch.LongTensor of size 64x51]
	[.label]:[torch.LongTensor of size 64]
tensor([[  31,  233,  117,  ...,    1,    1,    1],
        [4041,   87, 3342,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        ...,
        [   0,    4,  582,  ...,    1,    1,    1],
        [2294,  170,    0,  ...,    1,    1,    1],
        [  92,  170,   13,  ...,    1,    1,    1]])


In [45]:
itr_val = iter(val_loader)
d2 = next(itr_val)
print(d2)
print(d2.text)


[torchtext.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.text]:[torch.LongTensor of size 64x0]
	[.label]:[torch.LongTensor of size 64]
tensor([], size=(64, 0), dtype=torch.int64)


In [25]:
d2.label

tensor([1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
        1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
        1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0])

In [26]:
itr_test = iter(test_loader)
d3 = next(itr_test)
print(d3)
print(d3.text)


[torchtext.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.text]:[torch.LongTensor of size 64x0]
	[.label]:[torch.LongTensor of size 64]
tensor([], size=(64, 0), dtype=torch.int64)


In [29]:
def splits(datasets, batch_sizes=None, **kwargs):
    if batch_sizes is None:
        batch_sizes = [kwargs.pop('batch_size')] * len(datasets)
    print(batch_sizes)

In [32]:
splits((1,2,3), batch_size=123)

[123, 123, 123]


In [23]:
# define Field
ID = data.Field(sequential=False, use_vocab=False)
# fix_length of TEXT is not necessary: refer to BucketIterator
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=preprocessing, lower=True, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False, is_target=True, batch_first=True)
fields = {
    'ID': ID,
    'TEXT': TEXT,
    'LABEL': LABEL
}

# make Dataset
trainset, testset = TabularDataset.splits(path=path_base, train='train_pre.csv', test='test_pre.csv',
                                          format='csv', fields=[('id', ID), ('text', TEXT), ('label', LABEL)],
                                          skip_header=True)
print(f'{"="*10} Loading data succeeded {"="*10}')
print(f'# train: {len(trainset)}\n# test:{len(testset)}')

# train: 149995
# test:49726


In [24]:
print(f'{"="*10} Loading data succeeded {"="*10}')
print(f'# train: {len(trainset)}\n# test:{len(testset)}')

# train: 149995
# test:49726


In [25]:
TEXT.build_vocab(trainset, min_freq=10, max_size=10000)
# TEXT.build_vocab(valset, min_freq=min_req, max_size=max_size)
print(f'# vocab: {len(TEXT.vocab)}')

# vocab: 9643


In [26]:
vars(testset[0])

{'id': '6270596', 'text': ['굳', 'ㅋ'], 'label': '1'}

In [27]:
# make Dataloader
train_loader, test_loader = BucketIterator.splits((trainset, testset), batch_size=64,
                                                              sort_key=lambda x: len(x.text), shuffle=True, repeat=False)
print(f'# train_batch: {len(train_loader)}\n# test_batch:{len(test_loader)}')

# train_batch: 2344
# test_batch:777


In [8]:
itr_train = iter(train_loader)
d1 = next(itr_train)
print(d1)
print(d1.text)


[torchtext.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.text]:[torch.LongTensor of size 64x57]
	[.label]:[torch.LongTensor of size 64]
tensor([[ 175,   36,   24,  ...,    1,    1,    1],
        [   0, 6951,    2,  ...,    1,    1,    1],
        [ 335,    0,    0,  ...,    1,    1,    1],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [ 419,    4,    3,  ...,    1,    1,    1],
        [ 176,  103, 2900,  ...,    1,    1,    1]])


In [9]:
d1 = next(itr_train)
print(d1)
print(d1.text)


[torchtext.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.text]:[torch.LongTensor of size 64x70]
	[.label]:[torch.LongTensor of size 64]
tensor([[248, 173,   5,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [422,   7,   5,  ...,   1,   1,   1],
        ...,
        [105,  67,   1,  ...,   1,   1,   1],
        [ 47, 143, 603,  ...,   1,   1,   1],
        [ 38,  60,   5,  ...,   1,   1,   1]])


In [28]:
itr_test = iter(test_loader)
d2 = next(itr_test)
print(d2)
print(d2.text)


[torchtext.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.text]:[torch.LongTensor of size 64x0]
	[.label]:[torch.LongTensor of size 64]
tensor([], size=(64, 0), dtype=torch.int64)


In [29]:
d2.id

tensor([ 8079794,  4161885,  8393522,  6067246, 10138267,  1982992, 10177712,
         1797055, 10021751,  7337012,  5286121,  7348741,  8077864,  5483647,
         6435140,  9996875,  1123323,  9534025,  8543790,   113251,  9006553,
         7992510,  6413247,  3256591,  8714325,  8775828,  3043867,  9304560,
        10011010,  5141299,  7035146,  4565988,  4759306,  9581799, 10180530,
        10106669,  6670130,  8088501,  5868751,  9692300,  9241145,  6765042,
         3883426,  9072044,  9999726,  6067158,  6124663,  6257695,  8556238,
         4779226,  8090803, 10014334,  4140836,  5615319, 10060164,  8751085,
         5834795,  8205328,  8274861,  8635456,  8181742,  8768753,  8845712,
         7545542])

In [30]:
test_df.loc[test_df['id'] == 8079794]

Unnamed: 0,id,document,label
19651,8079794,,1


In [35]:
a = test_df.loc[test_df['id'] == 8079794]

In [41]:
s = a['document'].item()

  """Entry point for launching an IPython kernel.


In [44]:
s

'   '

In [47]:
print(type(s))
print(len(s))

<class 'str'>
3


In [49]:
test_df['document'] = test_df['document'].str.strip()

In [58]:
a2 = test_df.loc[test_df['id'] == 8079794]
a2

Unnamed: 0,id,document,label
19651,8079794,,1


In [59]:
s2 = a2['document'].item()

  """Entry point for launching an IPython kernel.


In [60]:
s2

''

In [63]:
test_df['document'].isnull().any()

False

In [69]:
import numpy as np
df = test_df.replace('', np.nan)

In [70]:
df.isnull().sum()

id            0
document    151
label         0
dtype: int64

In [71]:
df = df.dropna(how='any')
df.isnull().sum()

id          0
document    0
label       0
dtype: int64

In [102]:
print(d2.id)
print(d2.label)

tensor([9432243, 9281106, 3503793, 6890509, 5528346, 4629298, 5887056,  188817,
        9371277, 6495140, 9741170, 3126712, 7780993, 9314805, 8821940, 2105025,
        1691071, 6924329, 2240325, 6383599, 4643032, 9230572,  384233, 5568528,
        7208026, 8102061, 1600635, 4222450, 7579335, 1560439, 8994071, 7658045,
        2594701, 7074469, 2134828, 5446440, 4338906, 8868146, 9930234, 6997368,
        9904826, 3233787, 9791935, 8783145, 5155237, 9444900, 6129753, 7220072,
        4889225,  253713, 4214496, 8709469, 5944245, 1643613, 7448690, 7309008,
         681714, 5724567, 5914072, 7851504, 6396984, 6600712, 9165068, 6272271])
tensor([1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
        1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
        1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0])


In [15]:
test_df = pd.read_csv(os.path.join(path_base, 'test_pre.csv'))

In [16]:
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,,0
2,8544678,뭐야 이 평점들은 나쁘진 않지만 점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임 돈주고 보기에는,0
4,6723715,만 아니었어도 별 다섯 개 줬을텐데 왜 로 나와서 제 심기를 불편하게 하죠,0


In [18]:
test_df['document'] = test_df['document'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '')
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,,0
2,8544678,뭐야 이 평점들은 나쁘진 않지만 점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임 돈주고 보기에는,0
4,6723715,만 아니었어도 별 다섯 개 줬을텐데 왜 로 나와서 제 심기를 불편하게 하죠,0


In [19]:
test_df = test_df.dropna(how='any')
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
2,8544678,뭐야 이 평점들은 나쁘진 않지만 점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임 돈주고 보기에는,0
4,6723715,만 아니었어도 별 다섯 개 줬을텐데 왜 로 나와서 제 심기를 불편하게 하죠,0
5,7898805,음악이 주가 된 최고의 음악영화,1


In [21]:
test_df.to_csv(os.path.join(path_base, 'test_pre.csv'), index=False)


In [22]:
test_df2 = pd.read_csv(os.path.join(path_base, 'test_pre.csv'))
test_df2.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,8544678,뭐야 이 평점들은 나쁘진 않지만 점 짜리는 더더욱 아니잖아,0
2,6825595,지루하지는 않은데 완전 막장임 돈주고 보기에는,0
3,6723715,만 아니었어도 별 다섯 개 줬을텐데 왜 로 나와서 제 심기를 불편하게 하죠,0
4,7898805,음악이 주가 된 최고의 음악영화,1


In [99]:
d2 = next(itr_val)
print(d2)
print(d2.text)


[torchtext.data.batch.Batch of size 64]
	[.id]:[torch.LongTensor of size 64]
	[.text]:[torch.LongTensor of size 64x3]
	[.label]:[torch.LongTensor of size 64]
tensor([[4848, 6381,  688],
        [   0,  871,    3],
        [  25,   53,   43],
        [ 286,  162,   59],
        [  15,   48,    9],
        [ 107,  276,  107],
        [  85,   60, 1759],
        [ 214,  210, 1940],
        [  47,  167,  568],
        [  21,   90,   53],
        [2155, 2759, 3107],
        [   7,    9,  594],
        [   0,   27, 1731],
        [1587,   94,    2],
        [ 110, 1334,   43],
        [  55,   18,  266],
        [5750,   15, 1171],
        [ 110,   62,  276],
        [1561, 2832,   28],
        [ 120,  170,  268],
        [ 430,    5,   26],
        [  85,   23,   43],
        [ 315,  555,  137],
        [1323, 5117,  114],
        [ 120,  207,  123],
        [  47,  416, 1550],
        [2687,    5,   35],
        [ 120,   25,   23],
        [2891, 2891, 2891],
        [ 115,   90,   35],
 