In [1]:
import os
import math
import zipfile
import requests

import torch
import torch.nn as nn

from tqdm import tqdm
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
download_folder = "/home/pervinco/Datasets/wikitext"
save_folder = "/home/pervinco/Models/Wikit-Text"

batch_size = 1
num_epochs = 100
learning_rate = 0.01

n_epochs = 50
seq_len = 50
clip = 0.25
saved = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
if not os.path.isdir(save_folder):
    os.makedirs(save_folder)

## Download WikiText Dataset

In [4]:
def download_wikitext(folder_dir):
    wikitext2_url = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip?ref=blog.salesforceairesearch.com"
    wikitext103_url = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip?ref=blog.salesforceairesearch.com"

    def download_file(url, filename):
        response = requests.get(url)
        with open(filename, 'wb') as file:
            file.write(response.content)

    def unzip_file(zip_filename, extract_to):
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(extract_to)

    if not os.path.isdir(folder_dir) and not os.path.isfile(f"{folder_dir}/wikitext-2/wiki.train.tokens"):
        print("Download Dataset.")
        os.makedirs(folder_dir)

        wikitext2_zip = os.path.join(download_folder, "wikitext-2-v1.zip")
        wikitext103_zip = os.path.join(download_folder, "wikitext-103-v1.zip")

        download_file(wikitext2_url, wikitext2_zip)
        unzip_file(wikitext2_zip, download_folder)

        download_file(wikitext103_url, wikitext103_zip)
        unzip_file(wikitext103_zip, download_folder)
        print("Done")

    else:
        print("Dataset already exist.")

In [5]:
download_wikitext(download_folder)

Dataset already exist.


## Read File

In [6]:
data_dir = f"{download_folder}/wikitext-2"

train_file = f"{data_dir}/wiki.train.tokens"
valid_file = f"{data_dir}/wiki.valid.tokens"
test_file = f"{data_dir}/wiki.test.tokens"

In [7]:
def read_file(path):
    with open(path, 'r', encoding='utf-8') as f:
        text = f.readlines()

    return text

In [8]:
train_text = read_file(train_file)
valid_text = read_file(valid_file)
test_text = read_file(test_file)

In [9]:
print(train_text[88])

 This ammunition , and that which I brought with me , was rapidly prepared for use at the Laboratory established at the Little Rock Arsenal for that purpose . As illustrating as the <unk> <unk> of material in the country , the fact may be stated that it was found necessary to use public documents of the State Library for cartridge paper . <unk> were employed or conscripted , tools purchased or impressed , and the repair of the damaged guns I brought with me and about an equal number found at Little Rock commenced at once . But , after inspecting the work and observing the spirit of the men I decided that a garrison 500 strong could hold out against Fitch and that I would lead the remainder - about 1500 - to <unk> 'l <unk> as soon as shotguns and rifles could be obtained from Little Rock instead of <unk> and lances , with which most of them were armed . Two days <unk> before the change could be effected . " 



## Tokenize

In [10]:
## 데이터셋에 포함된 각각의 문장들을 토큰화(문장을 단어 단위로 분리)하며, 마지막에 <eos> 토큰을 추가한다.
tokenizer = get_tokenizer('basic_english')
def tokenize_file(text):
    tokenized_data = []
    for line in text:
        tokens = tokenizer(line.strip()) + ['<eos>']
        tokenized_data.append(tokens)
    return tokenized_data

train_data_tokens = tokenize_file(train_text)
valid_data_tokens = tokenize_file(valid_text)
test_data_tokens = tokenize_file(test_text)

print(train_data_tokens[88])

['this', 'ammunition', ',', 'and', 'that', 'which', 'i', 'brought', 'with', 'me', ',', 'was', 'rapidly', 'prepared', 'for', 'use', 'at', 'the', 'laboratory', 'established', 'at', 'the', 'little', 'rock', 'arsenal', 'for', 'that', 'purpose', '.', 'as', 'illustrating', 'as', 'the', '<unk>', '<unk>', 'of', 'material', 'in', 'the', 'country', ',', 'the', 'fact', 'may', 'be', 'stated', 'that', 'it', 'was', 'found', 'necessary', 'to', 'use', 'public', 'documents', 'of', 'the', 'state', 'library', 'for', 'cartridge', 'paper', '.', '<unk>', 'were', 'employed', 'or', 'conscripted', ',', 'tools', 'purchased', 'or', 'impressed', ',', 'and', 'the', 'repair', 'of', 'the', 'damaged', 'guns', 'i', 'brought', 'with', 'me', 'and', 'about', 'an', 'equal', 'number', 'found', 'at', 'little', 'rock', 'commenced', 'at', 'once', '.', 'but', ',', 'after', 'inspecting', 'the', 'work', 'and', 'observing', 'the', 'spirit', 'of', 'the', 'men', 'i', 'decided', 'that', 'a', 'garrison', '500', 'strong', 'could', 'ho

In [11]:
## 토큰화된 데이터셋을 이용해 단어 사전을 생성한다.
def build_vocab(data_tokens):
    return build_vocab_from_iterator(data_tokens, specials=['<unk>', '<eos>'], min_freq=3)

vocab = build_vocab(train_data_tokens + valid_data_tokens + test_data_tokens)
vocab.set_default_index(vocab['<unk>'])

print(len(vocab))
print(vocab.get_itos()[:10])

28783
['<unk>', '<eos>', 'the', ',', '.', 'of', 'and', 'in', 'to', 'a']


In [12]:
def get_data(tokenized_data, vocab, batch_size):
    data = []
    for tokens in tokenized_data:
        token_indices = [vocab[token] for token in tokens]
        data.extend(token_indices)
    data = torch.LongTensor(data)
    num_batches = data.numel() // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, -1)
    return data

batch_size = 128
train_data = get_data(train_data_tokens, vocab, batch_size) ## data.numel() : 2086708, num_batches : 16302, data : 218177
valid_data = get_data(valid_data_tokens, vocab, batch_size) 
test_data = get_data(test_data_tokens, vocab, batch_size)

In [13]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size)

        if tie_weights:
            assert embedding_dim == hidden_dim, 'If tying weights then embedding_dim must equal hidden_dim'
            self.embedding.weight = self.fc.weight
        self.init_weights()

    def forward(self, src, hidden):
        embedding = self.dropout(self.embedding(src))
        output, hidden = self.rnn(embedding, hidden)
        output = self.dropout(output)
        prediction = self.fc(output)
        return prediction, hidden

    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hidden_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.rnn.all_weights[i][0] = torch.FloatTensor(self.embedding_dim,
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 
            self.rnn.all_weights[i][1] = torch.FloatTensor(self.hidden_dim, 
                    self.hidden_dim).uniform_(-init_range_other, init_range_other)

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return hidden

    # We don't learn the hidden state so we can detach it from the computation graph
    def detach_hidden(self, hidden):
        hidden = hidden.detach()
        return hidden


In [14]:
vocab_size = len(vocab)
embedding_dim = 1024
hidden_dim = 1024
num_layers = 2
dropout_rate = 0.65              
tie_weights = True                  
lr = 1e-3

In [15]:
model = RNN(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 33,700,975 trainable parameters


In [16]:
def get_batch(data, seq_len, num_batches, idx):
    src = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]             # The target is the src shifted by one batch
    return src, target

In [17]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):  # The last batch can't be a src
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, num_batches, idx)
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)                 # model output

        prediction = prediction.reshape(batch_size * seq_len, -1)   
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [18]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, num_batches, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [19]:
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

if saved:
    model.load_state_dict(torch.load(f'{save_folder}/best.pt',  map_location=device))
    test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
    print(f'Test Perplexity: {math.exp(test_loss):.3f}')
else:
    best_valid_loss = float('inf')

    for epoch in range(n_epochs):
        train_loss = train(model, train_data, optimizer, criterion, batch_size, seq_len, clip, device)
        valid_loss = evaluate(model, valid_data, criterion, batch_size, seq_len, device)
        
        lr_scheduler.step(valid_loss)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f'{save_folder}/best.pt')

        print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
        print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

	Train Perplexity: 1583.341
	Valid Perplexity: 999.094


                                                           

	Train Perplexity: 1528.389
	Valid Perplexity: 1038.771


                                                           

	Train Perplexity: 1444.870
	Valid Perplexity: 969.895


                                                           

	Train Perplexity: 1416.791
	Valid Perplexity: 967.056


                                                           

	Train Perplexity: 1400.655
	Valid Perplexity: 961.473


                                                           

	Train Perplexity: 1385.607
	Valid Perplexity: 955.227


                                                           

	Train Perplexity: 1371.803
	Valid Perplexity: 955.130


                                                           

	Train Perplexity: 1319.617
	Valid Perplexity: 897.961


                                                           

	Train Perplexity: 1300.886
	Valid Perplexity: 894.876


                                                           

	Train Perplexity: 1288.621
	Valid Perplexity: 891.613


                                                           

	Train Perplexity: 1280.950
	Valid Perplexity: 892.703


                                                           

	Train Perplexity: 1256.983
	Valid Perplexity: 861.054


                                                           

	Train Perplexity: 1242.104
	Valid Perplexity: 859.829


                                                           

	Train Perplexity: 1233.416
	Valid Perplexity: 856.768


                                                           

	Train Perplexity: 1223.515
	Valid Perplexity: 853.330


                                                           

	Train Perplexity: 1164.255
	Valid Perplexity: 748.053


                                                           

	Train Perplexity: 892.080
	Valid Perplexity: 545.562


                                                           

	Train Perplexity: 685.837
	Valid Perplexity: 450.191


                                                           

	Train Perplexity: 578.492
	Valid Perplexity: 395.802


                                                           

	Train Perplexity: 505.119
	Valid Perplexity: 353.790


                                                           

	Train Perplexity: 451.620
	Valid Perplexity: 324.502


                                                           

	Train Perplexity: 410.847
	Valid Perplexity: 303.102


                                                           

	Train Perplexity: 377.319
	Valid Perplexity: 283.471


                                                           

	Train Perplexity: 348.825
	Valid Perplexity: 268.780


                                                           

	Train Perplexity: 325.091
	Valid Perplexity: 252.083


                                                           

	Train Perplexity: 304.497
	Valid Perplexity: 241.909


                                                           

	Train Perplexity: 286.438
	Valid Perplexity: 230.796


                                                           

	Train Perplexity: 271.973
	Valid Perplexity: 223.234


                                                           

	Train Perplexity: 259.396
	Valid Perplexity: 216.808


                                                           

	Train Perplexity: 248.490
	Valid Perplexity: 210.276


                                                           

	Train Perplexity: 238.444
	Valid Perplexity: 206.806


                                                           

	Train Perplexity: 229.572
	Valid Perplexity: 199.854


                                                           

	Train Perplexity: 221.671
	Valid Perplexity: 195.606


                                                           

	Train Perplexity: 213.954
	Valid Perplexity: 189.946


                                                           

	Train Perplexity: 207.312
	Valid Perplexity: 188.432


                                                           

	Train Perplexity: 201.064
	Valid Perplexity: 184.011


                                                           

	Train Perplexity: 195.015
	Valid Perplexity: 180.722


                                                           

	Train Perplexity: 189.734
	Valid Perplexity: 178.406


                                                           

	Train Perplexity: 184.917
	Valid Perplexity: 175.036


                                                           

	Train Perplexity: 180.373
	Valid Perplexity: 172.380


                                                           

	Train Perplexity: 176.056
	Valid Perplexity: 169.446


                                                           

	Train Perplexity: 171.931
	Valid Perplexity: 170.324


                                                           

	Train Perplexity: 165.732
	Valid Perplexity: 165.028


                                                           

	Train Perplexity: 163.636
	Valid Perplexity: 164.803


                                                           

	Train Perplexity: 161.376
	Valid Perplexity: 164.062


                                                           

	Train Perplexity: 159.630
	Valid Perplexity: 162.407


                                                           

	Train Perplexity: 158.016
	Valid Perplexity: 161.786


                                                           

	Train Perplexity: 156.218
	Valid Perplexity: 161.440


                                                           

	Train Perplexity: 154.624
	Valid Perplexity: 160.332


                                                           

	Train Perplexity: 153.231
	Valid Perplexity: 160.164


In [20]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  # temperature is unfair
            prediction = torch.multinomial(probs, num_samples=1).item()     # take one sample from the distribution
            
            while prediction == vocab['<unk>']:
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:
                break

            indices.append(prediction)

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [21]:
prompt = 'Think about'
max_seq_len = 30
seed = 0

# convert the code above into a for loop
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
think about a few of each other . we is a very lot of the whole of the season , but it is not only . it is a good to the

0.7
think about his free , . the story is the series of a person , as the third technology of the lifestream , which is a only sign of the way ,

0.75
think about his free , . the story is the series of a person , as the third technology of the lifestream , which is a only sign that the bones are

0.8
think about his free , . the story is the series of a person , as the third technology of the lifestream , which is a only sign that the bones are

1.0
think about his free drivers .

