In [12]:
# download data
import requests, os
import pandas as pd



# Remote source of the IMDb review CSV and the local layout it is cached in.
data_url = "https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv"
path_root = os.path.expanduser('~/dataset/IMDB_review')
path_raw = os.path.join(path_root, 'IMDB_reviews.csv')
path_train = os.path.join(path_root, 'train.csv')
path_test = os.path.join(path_root, 'test.csv')

# makedirs (unlike mkdir) also creates the missing parent ~/dataset and is a
# no-op when the directory already exists, so the cell can be re-run safely.
os.makedirs(path_root, exist_ok=True)

# Download only when there is no usable local copy (missing or empty file).
if not os.path.isfile(path_raw) or os.path.getsize(path_raw) == 0:
    response = requests.get(data_url, timeout=60)
    # Fail loudly instead of saving an HTTP error page as the dataset.
    response.raise_for_status()
    # Open the target only after the request succeeded, so a failed download
    # does not leave a truncated file behind.
    with open(path_raw, 'wb') as f:
        f.write(response.content)

In [6]:
# Load the downloaded review CSV (decoded as latin1) into a DataFrame.
raw_csv_path = os.path.join(path_root, 'IMDB_reviews.csv')
df = pd.read_csv(raw_csv_path, encoding='latin1')
# Preview the first rows.
df.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [7]:
# Split data into train and test sets.
# The first TRAIN_SIZE rows form the training set and the remainder the test
# set; the rows are taken in file order (no shuffling).
TRAIN_SIZE = 45000
print(len(df))
train_df = df.iloc[:TRAIN_SIZE]
test_df = df.iloc[TRAIN_SIZE:]
assert len(train_df) + len(test_df) == len(df)

# Write only the review text and its label, in that order. The raw CSV also
# carries an "Unnamed: 0" index column; writing it would shift the columns that
# the positional (text, label) field list of the CSV loader is matched against.
csv_columns = ['review', 'sentiment']
train_df[csv_columns].to_csv(path_train, index=False)
# Bug fix: the test split was previously written to path_train, overwriting the
# training data and never creating test.csv.
test_df[csv_columns].to_csv(path_test, index=False)

50000


In [11]:
# define Field
from torchtext import data


# str.split('abc abcd adf', sep=' ')
"""
sequential : whether the data is a sequence. (default: True)
use_vocab : whether to build a vocabulary. (default: True)
tokenize : which tokenization function to use. (default: str.split)
lower : convert all English text to lower case. (default: False)
batch_first : whether to load data with the mini-batch dimension first. (default: False)
is_target : whether the field holds label data. (default: False)
fix_length : maximum allowed length; sequences are padded to this length.

- If you are passing a field that is already numericalized by default and is not sequential, 
you should pass use_vocab=False and sequential=False.
- Summary: when a column is
a sequence and needs mapping to indices => sequential=True, use_vocab=True
not a sequence but needs mapping to indices => sequential=False, use_vocab=True, then call build_vocab later
not a sequence and needs no index mapping (already mapped to integers) => sequential=False, use_vocab=False
"""
# Review text field: a token sequence, split on whitespace, lower-cased and
# mapped through a vocabulary. Every sequence is fixed to 20 tokens and the
# batch dimension comes first.
text_field_options = dict(
    sequential=True,
    use_vocab=True,
    tokenize=str.split,
    lower=True,
    batch_first=True,
    fix_length=20,
)
TEXT = data.Field(**text_field_options)

# Label field: a single non-sequential target value that is used as-is,
# without building a vocabulary for it.
label_field_options = dict(
    sequential=False,
    use_vocab=False,
    batch_first=False,
    is_target=True,
)
LABEL = data.Field(**label_field_options)

In [14]:
# make Dataset doing tokenizing
from torchtext.data import TabularDataset


# Pair each CSV column with the Field that processes it.
# NOTE(review): a list of fields is presumably matched to columns by position,
# so the CSVs must be laid out as (text, label) - confirm against the files.
csv_fields = [('text', TEXT), ('label', LABEL)]

# Read both splits from path_root, skipping the header row of each CSV.
train_data, test_data = TabularDataset.splits(
    path=path_root,
    train='train.csv',
    test='test.csv',
    format='csv',
    fields=csv_fields,
    skip_header=True,
)

In [26]:
# Number of examples in the (train, test) datasets.
tuple(len(split) for split in (train_data, test_data))

(45000, 5000)

In [27]:
# Inspect the attributes stored on the first training example.
first_example = train_data[0]
vars(first_example)
# Equivalent: first_example.__dict__

{'text': ['my',
  'family',
  'and',
  'i',
  'normally',
  'do',
  'not',
  'watch',
  'local',
  'movies',
  'for',
  'the',
  'simple',
  'reason',
  'that',
  'they',
  'are',
  'poorly',
  'made,',
  'they',
  'lack',
  'the',
  'depth,',
  'and',
  'just',
  'not',
  'worth',
  'our',
  'time.<br',
  '/><br',
  '/>the',
  'trailer',
  'of',
  '"nasaan',
  'ka',
  'man"',
  'caught',
  'my',
  'attention,',
  'my',
  'daughter',
  'in',
  "law's",
  'and',
  "daughter's",
  'so',
  'we',
  'took',
  'time',
  'out',
  'to',
  'watch',
  'it',
  'this',
  'afternoon.',
  'the',
  'movie',
  'exceeded',
  'our',
  'expectations.',
  'the',
  'cinematography',
  'was',
  'very',
  'good,',
  'the',
  'story',
  'beautiful',
  'and',
  'the',
  'acting',
  'awesome.',
  'jericho',
  'rosales',
  'was',
  'really',
  'very',
  'good,',
  "so's",
  'claudine',
  'barretto.',
  'the',
  'fact',
  'that',
  'i',
  'despised',
  'diether',
  'ocampo',
  'proves',
  'he',
  'was',
  'effect

In [30]:
# Build the vocabulary (set of words) from the training data only.
# Tokens seen fewer than 10 times are left out and at most 10000 tokens are kept.
vocab_options = {'min_freq': 10, 'max_size': 10000}
TEXT.build_vocab(train_data, **vocab_options)

# The printed size also counts the special '<unk>' and '<pad>' entries.
vocab_size = len(TEXT.vocab)
print(vocab_size)

10002


In [31]:
# Token -> integer index mapping of the vocabulary built above.
token_to_index = TEXT.vocab.stoi
token_to_index

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fc1b3ea3860>>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             'a': 3,
             'and': 4,
             'of': 5,
             'to': 6,
             'is': 7,
             'in': 8,
             'i': 9,
             'this': 10,
             'that': 11,
             'it': 12,
             '/><br': 13,
             'was': 14,
             'as': 15,
             'for': 16,
             'with': 17,
             'but': 18,
             'on': 19,
             'movie': 20,
             'his': 21,
             'are': 22,
             'not': 23,
             'you': 24,
             'film': 25,
             'have': 26,
             'he': 27,
             'be': 28,
             'at': 29,
             'one': 30,
             'by': 31,
             'an': 32,
             'they': 33,
             'from': 34,
             'all': 35,
             'who': 36,
             'like

In [32]:
# Make Dataloader that helps to load mini-batch
from torchtext.data import Iterator

# Wrap each dataset in an Iterator that yields mini-batches of `batch_size` examples.
batch_size = 5
train_loader, test_loader = [
    Iterator(dataset=split, batch_size=batch_size)
    for split in (train_data, test_data)
]

# Number of mini-batches per pass over each dataset.
for loader in (train_loader, test_loader):
    print(len(loader))

9000
1000


In [33]:
# Pull the first mini-batch from the training iterator.
train_iter = iter(train_loader)
batch = next(train_iter)

# The batch is a torchtext Batch object, not a torch.Tensor; show its type
# and then its summary, one per line.
print(type(batch), batch, sep='\n')

<class 'torchtext.data.batch.Batch'>

[torchtext.data.batch.Batch of size 5]
	[.text]:[torch.LongTensor of size 5x20]
	[.label]:[torch.LongTensor of size 5]


In [35]:
# Show the token-index tensor, its shape, and the label tensor of the batch,
# one per line.
print(batch.text, batch.text.size(), batch.label, sep='\n')

tensor([[  54,    8,    3,   97,  150,   51,    7,  585,   46, 1640, 8667,  141,
           11,   24,   64,  140, 1656,   12,   14, 2367],
        [  10,   25,  175,    6,   28,    3, 2910,  136,    5,   99,  124,   11,
           26, 3265,   17,    2,  164,  871,    8,    2],
        [   0, 1427,    0,  637,   11,  175,    6,   28,    0,    8,    3,    0,
            3,    0, 1884,   36,   64,  180,   12,    4],
        [  10,   25, 3727,    3,  168,    5,    0,   76,   12, 2721,   19, 1450,
            4,  122, 4032,    2,  130,    7,   50,  666],
        [  10, 2525,   65,    7,    0,    0,    8,    2,  966,    0,    7,  687,
           30,    5,    2,  223,  545,  124,  189,  120]])
torch.Size([5, 20])
tensor([0, 0, 0, 1, 0])
