In [1]:
import os
import sys
sys.path.append("/home/pervinco/DL-workspace/NLP/VocabPrediction")

import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from utils.util import read_file
from data.datasets import download_wikitext

In [2]:
data_dir = "/home/pervinco/Datasets/wikitext"
batch_size = 128
seq_len = 50

In [3]:
## Download Dataset
download_wikitext(data_dir)

## Define Dataset Dir
data_dir = f"{data_dir}/wikitext-2"
train_file = f"{data_dir}/wiki.train.tokens"
valid_file = f"{data_dir}/wiki.valid.tokens"
test_file = f"{data_dir}/wiki.test.tokens"

## Read File
train_text = read_file(train_file)
valid_text = read_file(valid_file)
test_text = read_file(test_file)

Dataset already exist.


In [4]:
for data in train_text[:5]:
    print(data)

 

 = Valkyria Chronicles III = 

 

 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 

 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more <unk> for series newcomers . 

In [5]:
def tokenize_file(text):
    """
    데이터셋에 포함된 각각의 문장들을 토큰화(문장을 단어 단위로 분리)하며, 마지막에 <eos> 토큰을 추가한다.
    """
    tokenized_data = []
    for line in text:
        tokens = tokenizer(line.strip()) + ['<eos>']
        tokenized_data.append(tokens)
    return tokenized_data

## Tokenize
tokenizer = get_tokenizer('basic_english')
train_data_tokens = tokenize_file(train_text)
valid_data_tokens = tokenize_file(valid_text)
test_data_tokens = tokenize_file(test_text)

In [6]:
for token_data in train_data_tokens[:5]:
    print(token_data)

['<eos>']
['=', 'valkyria', 'chronicles', 'iii', '=', '<eos>']
['<eos>']
['senjō', 'no', 'valkyria', '3', '<unk>', 'chronicles', '(', 'japanese', '戦場のヴァルキュリア3', ',', 'lit', '.', 'valkyria', 'of', 'the', 'battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'valkyria', 'chronicles', 'iii', 'outside', 'japan', ',', 'is', 'a', 'tactical', 'role', '@-@', 'playing', 'video', 'game', 'developed', 'by', 'sega', 'and', 'media', '.', 'vision', 'for', 'the', 'playstation', 'portable', '.', 'released', 'in', 'january', '2011', 'in', 'japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'valkyria', 'series', '.', '<unk>', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', '@-@', 'time', 'gameplay', 'as', 'its', 'predecessors', ',', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the', 'nameless', ',', 'a', 'penal', 'military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia', 'during', 'the', 'second', 'europan', 'war', 'who', 'per

In [7]:
def build_vocab(data_tokens):
    """
    토큰화된 데이터셋을 이용해 단어 사전을 생성한다.
    """
    return build_vocab_from_iterator(data_tokens, specials=['<unk>', '<eos>'], min_freq=3)

## 단어집 생성.
vocab = build_vocab(train_data_tokens + valid_data_tokens + test_data_tokens)
vocab.set_default_index(vocab['<unk>'])
vocab_size = len(vocab)
print(vocab_size) ## 28783

28783


In [8]:
def get_data(tokenized_data, vocab, batch_size):
    """
    토큰들이 Vocab내 index로 mapping하고 리스트에 저장.
    결과적으로 전체 문장들이 하나의 문장으로 모두 연결된다.

    데이터의 총 원소 수(numel())를 batch_size로 나누어 전체 데이터를 몇 개의 배치로 나눌 수 있는지 계산한다. 
    이를 통해, 데이터의 길이를 배치 크기에 맞게 조정합니다. 즉, 모든 배치의 길이는 통일합니다.

    ex) 전체 데이터의 수가 2086708일 때, batch_size=128 ---> 2086708 // 128 = 16302 따라서 num_batches = 16302
    """
    data = []
    for tokens in tokenized_data:
        token_indices = [vocab[token] for token in tokens]
        data.extend(token_indices)

    data = torch.LongTensor(data) ## [2086708]
    num_batches = data.numel() // batch_size ## 16302
    data = data[:num_batches * batch_size] ## 2086656. 남은 52개의 단어는 제외하여 모든 데이터들의 길이를 통일시킨다.
    data = data.view(batch_size, -1) ## [batch_size, 16302]로 reshape
    
    return data

# 전체 토큰의 수, 전체 batch 단위의 수
train_data = get_data(train_data_tokens, vocab, batch_size) ## data.numel() : 2086708, num_batches : 16302, data : 218177
valid_data = get_data(valid_data_tokens, vocab, batch_size) 
test_data = get_data(test_data_tokens, vocab, batch_size)

In [9]:
"""
전체 문장을 토큰화하고, 맵핑한 길이는 2086708
batch_size 크기로 나누면 16302의 batch가 만들어질 수 있다. [batch_size, num_batches]로 변환함으로 인해 세로축이 하나의 문장이 된다.
"""
print(train_data.shape) ## 128개의 단어가 담긴 텐서가 16302개 있다. We have 16302 batches, each of 128 words

sample = train_data[0, :].numpy()
print(sample.shape)
sample_str = ""

for s in sample:
    char = vocab.lookup_token(s)
    print(char)
    sample_str += f"{char} "

print(sample_str)

torch.Size([128, 16302])
(16302,)
<eos>
=
valkyria
chronicles
iii
=
<eos>
<eos>
senjō
no
valkyria
3
<unk>
chronicles
(
japanese
戦場のヴァルキュリア3
,
lit
.
valkyria
of
the
battlefield
3
)
,
commonly
referred
to
as
valkyria
chronicles
iii
outside
japan
,
is
a
tactical
role
@-@
playing
video
game
developed
by
sega
and
media
.
vision
for
the
playstation
portable
.
released
in
january
2011
in
japan
,
it
is
the
third
game
in
the
valkyria
series
.
<unk>
the
same
fusion
of
tactical
and
real
@-@
time
gameplay
as
its
predecessors
,
the
story
runs
parallel
to
the
first
game
and
follows
the
nameless
,
a
penal
military
unit
serving
the
nation
of
gallia
during
the
second
europan
war
who
perform
secret
black
operations
and
are
pitted
against
the
imperial
unit
<unk>
raven
.
<eos>
the
game
began
development
in
2010
,
carrying
over
a
large
portion
of
the
work
done
on
valkyria
chronicles
ii
.
while
it
retained
the
standard
features
of
the
series
,
it
also
underwent
multiple
adjustments
,
such
as
making
the
game

In [10]:
"""
data 텐서의 column이 각각의 문장에 해당한다.
따라서 row단위로 indexing을 하는 것은 각각의 문장에서 단어를 가져오는 형태가 된다.

"""

def get_batch(data, seq_len, idx):
    src = data[:, idx:idx+seq_len] ## 각 문장의 첫번째 글자                   
    target = data[:, idx+1:idx+seq_len+1]
    return src, target

In [14]:
data = train_data
num_batches = data.shape[-1] ## 16302
data = data[:, :num_batches - (num_batches -1) % seq_len] ## 16302 - (16302 - 1) % 50, [128, 16301]
num_batches = data.shape[-1] ## 16301

for idx in range(0, num_batches - 1, seq_len): ## 0 ~ 16300까지 step = 50
    src, target = get_batch(data, seq_len, idx)
    print(src.shape, target.shape)

    break

torch.Size([128, 50]) torch.Size([128, 50])


In [None]:
(16302 - 1) % 50