In [12]:
from google.colab import drive
drive.mount("/content/drive")
import os
import sys
from datetime import datetime

drive_project_root = "/content/drive/MyDrive/FastCampus_DL_Study"
sys.path.append(drive_project_root)
!pip install -r "/content/drive/MyDrive/FastCampus_DL_Study/requirements.txt"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Tue Apr 26 13:12:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [14]:
# For data loading.
from typing import List
from typing import Dict
from typing import Union
from typing import Any
from typing import Optional
from typing import Iterable
from abc import abstractmethod
from abc import ABC
from datetime import datetime
from functools import partial
from collections import Counter
from collections import OrderedDict
import random
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pprint import pprint

from torchtext import data
from torchtext import datasets
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import Vocab, build_vocab_from_iterator, vocab
import spacy

# For configuration
from omegaconf import DictConfig
from omegaconf import OmegaConf
import hydra
from hydra.core.config_store import ConfigStore

# For logger
from torch.utils.tensorboard import SummaryWriter
import wandb
# Configure wandb through its WANDB_START_METHOD environment variable ("thread").
os.environ["WANDB_START_METHOD"]="thread"

In [15]:
from data_utils import dataset_split
from config_utils import flatten_dict
from config_utils import register_config
from config_utils import configure_optimizers_from_cfg
from config_utils import get_loggers
from config_utils import get_callbacks
from custom_math import softmax

In [16]:
# Download the English and German spaCy models, requested both through the short
# language codes ("en", "de") and through the full model names.
!python -m spacy download en
!python -m spacy download en_core_web_sm
!python -m spacy download de
!python -m spacy download de_core_news_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 1.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 15.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.

In [17]:
# Data configuration for this practice notebook: everything the data pipeline
# below needs is collected in one config object.

# German -> English translation data, tokenized with spaCy.
data_spacy_de_en_cfg = {
    "name": "spacy_de_en",
    "data_root": os.path.join(os.getcwd(), "data"),
    "tokenizer": "spacy",
    "src_lang": "de",
    "tgt_lang": "en",
    "src_index": 0,
    "tgt_index": 1,
    "vocab": {
        "special_symbol2index": {
            # Define special symbols and indices
            # UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
            # Make sure the tokens are in order of their indices to properly insert them in vocab
            '<unk>': 0, # unknown token
            '<pad>': 1, # padding
            '<bos>': 2, # beginning of sentence
            '<eos>': 3, # end of sentence
        },
        "special_first": True,
        "min_freq": 2
    }
}

# Wrap the plain dict in an OmegaConf config and print it as YAML for inspection.
data_cfg = OmegaConf.create(data_spacy_de_en_cfg)
print(OmegaConf.to_yaml(data_cfg))

name: spacy_de_en
data_root: /content/data
tokenizer: spacy
src_lang: de
tgt_lang: en
src_index: 0
tgt_index: 1
vocab:
  special_symbol2index:
    <unk>: 0
    <pad>: 1
    <bos>: 2
    <eos>: 3
  special_first: true
  min_freq: 2



In [18]:
# Load the Multi30k splits under the configured data root. The language pair is
# passed explicitly so that the (source, target) order of each sample follows
# data_cfg (src_index / tgt_index) instead of relying on the dataset's default.
train_data, valid_data, test_data = Multi30k(
    root=data_cfg.data_root,
    language_pair=(data_cfg.src_lang, data_cfg.tgt_lang),
)

# Convert the test split into a map-style dataset.
# NOTE: this may break when the dataset is very large; proper code should guard
# this step with error handling.
test_data = to_map_style_dataset(test_data)

In [19]:
# Peek at the first (source, target) sentence pair of the test split, if any.
first_pair = next(iter(test_data), None)
if first_pair is not None:
  print(first_pair)

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.\n', 'A man in an orange hat starring at something.\n')


In [20]:
# 1. token transform
def get_token_transform(data_cfg: DictConfig) -> dict:
  """
    Build one tokenizer per language.

    Args:
      data_cfg: data config providing `tokenizer`, `src_lang` and `tgt_lang`.

    Returns:
      A dict mapping each language code (source first, then target) to the
      callable returned by `get_tokenizer` for that language; the callable is
      used to split a sentence into tokens.
  """
  return {
      lang: get_tokenizer(data_cfg.tokenizer, language=lang)
      for lang in (data_cfg.src_lang, data_cfg.tgt_lang)
  }

token_transform = get_token_transform(data_cfg)

In [21]:
# 2. vocab transformation
def yield_tokens(
    data_iter: Iterable,
    lang: str,
    lang2index: Dict[str, int],
    tokenizers: Optional[Dict[str, Any]] = None,
) -> Iterable[List[str]]:
  """
    Lazily tokenize one language column of a parallel corpus.

    Args:
      data_iter: iterable of samples; each sample is indexable and holds one
        sentence per language.
      lang: language code whose sentences should be tokenized.
      lang2index: maps a language code to the position of its sentence inside
        a sample.
      tokenizers: optional mapping from language code to a tokenizer callable.
        Defaults to the notebook-level `token_transform`, which keeps existing
        three-argument calls working unchanged.

    Yields:
      The list of tokens of the `lang` sentence of every sample, in order.

    Raises:
      KeyError: if `lang` is missing from `tokenizers` or `lang2index`.
  """
  if tokenizers is None:
    # Fall back to the tokenizers built earlier in the notebook.
    tokenizers = token_transform
  # Both lookups are loop-invariant, so resolve them once up front.
  tokenize = tokenizers[lang]
  column = lang2index[lang]
  for data_sample in data_iter:
    yield tokenize(data_sample[column])

def get_vocab_transform(data_cfg: DictConfig) -> dict:
  """
    Build a torchtext vocabulary for the source and the target language.

    Args:
      data_cfg: data config providing `src_lang`, `tgt_lang`, `src_index`,
        `tgt_index` and the `vocab` section (`special_symbol2index`,
        `special_first`, `min_freq`).

    Returns:
      A dict mapping each language code to the vocabulary built from the
      Multi30k training split, with `<unk>` set as the default index.
  """
  language_pair = (data_cfg.src_lang, data_cfg.tgt_lang)
  # Position of each language's sentence inside a (src, tgt) sample.
  lang2index = {
      data_cfg.src_lang: data_cfg.src_index, # str: int
      data_cfg.tgt_lang: data_cfg.tgt_index # str: int
  }

  symbol2index = data_cfg.vocab.special_symbol2index
  # The specials are handed over as a plain list, so their order is what decides
  # where they land in the vocabulary. Sort them by their configured index rather
  # than trusting the insertion order of the config mapping; the same indices are
  # used later for padding, BOS/EOS and the default index.
  specials = [
      symbol
      for symbol, _ in sorted(symbol2index.items(), key=lambda item: item[1])
  ]

  vocab_transform: dict = {}
  # Create one vocabulary object each for the source and the target language.
  for ln in language_pair:
    # A fresh training-data iterator is created for every language.
    train_iter = Multi30k(split='train', language_pair=language_pair)
    vocab_transform[ln] = build_vocab_from_iterator(
        # must yield list or iterator of tokens
        iterator=yield_tokens(train_iter, ln, lang2index),
        min_freq=data_cfg.vocab.min_freq,
        specials=specials,
        special_first=data_cfg.vocab.special_first
    )
    # Set UNK_IDX as the default index. This index is returned when the token is not found.
    # If not set, it throws RuntimeError when the queried token is not found in the Vocabulary.
    vocab_transform[ln].set_default_index(symbol2index["<unk>"])
  return vocab_transform

vocab_transform = get_vocab_transform(data_cfg)

In [22]:
# Both vocabularies should map the unknown token to the same index.
for lang in ("de", "en"):
  print(vocab_transform[lang]["<unk>"])

# Look up two ordinary English words.
en_vocab = vocab_transform["en"]
print(en_vocab["hello"], en_vocab["world"])

0
0
5466 1871


In [25]:
# 3. integrated transform
# text transformation: [token_transformation --> vocab_transformation --> torch.tensor transformation]

# Helper used to build the text pipelines that the collate function applies.
def sequential_transforms(*transforms):
  """
    Chain several single-argument callables into one.

    Args:
      *transforms: callables applied left to right; each one receives the
        result of the previous one.

    Returns:
      A function that feeds its input through every transform in order and
      returns the final result (the input itself when no transform is given).
  """
  def func(txt_input):
    result = txt_input
    for step in transforms:
      result = step(result)
    return result
  return func

# function to add BOS/EOS and create tensor for input sequence indices
# BOS: beginning of sentence, EOS: end of sentence
def tensor_transform(token_ids: List[int], bos_index: int, eos_index: int):
  """
    Wrap a sequence of token indices with the BOS and EOS indices.

    Args:
      token_ids: vocabulary indices of one sentence (may be empty).
      bos_index: index of the beginning-of-sentence symbol.
      eos_index: index of the end-of-sentence symbol.

    Returns:
      A 1-D `torch.long` tensor laid out as `[bos_index, *token_ids, eos_index]`.
  """
  # The dtype is pinned explicitly: torch.tensor([]) would otherwise be float32,
  # so an empty sentence would no longer produce an integer index tensor.
  return torch.cat(
      (
          torch.tensor([bos_index], dtype=torch.long),
          torch.tensor(token_ids, dtype=torch.long),
          torch.tensor([eos_index], dtype=torch.long)
      )
  )

# src and tgt language text transforms to convert raw strings into tensors indices
def get_text_transform(data_cfg: DictConfig):
  """
    Build the raw-string -> index-tensor pipeline for both languages.

    Each pipeline chains, in order: the language's tokenizer, the language's
    vocabulary lookup, and `tensor_transform` bound to the configured
    `<bos>` / `<eos>` indices.

    Args:
      data_cfg: data config providing `src_lang`, `tgt_lang` and
        `vocab.special_symbol2index`.

    Returns:
      A dict mapping each language code to its pipeline callable.
  """
  special_index = data_cfg.vocab.special_symbol2index
  # Final stage, shared by both languages: add BOS/EOS and create the tensor.
  to_tensor = partial(
      tensor_transform,
      bos_index=special_index['<bos>'],
      eos_index=special_index['<eos>']
  )
  return {
      ln: sequential_transforms(token_transform[ln], vocab_transform[ln], to_tensor)
      for ln in (data_cfg.src_lang, data_cfg.tgt_lang)
  }

text_transform = get_text_transform(data_cfg)

In [28]:
# Run the English pipeline on progressively longer inputs to inspect the indices.
for sample_text in ("hello", "hello,", "hello, how", "hello, how are you ?"):
  print(text_transform["en"](sample_text))

tensor([   2, 5466,    3])
tensor([   2, 5466,   16,    3])
tensor([   2, 5466,   16,  890,    3])
tensor([   2, 5466,   16,  890,   18, 1329, 2471,    3])


In [35]:
# 4. collate func ==> batch preprocessing
def collate_fn(batch, data_cfg: DictConfig):
  """
    Turn a batch of raw (source, target) sentence pairs into padded tensors.

    Args:
      batch: iterable of (source sentence, target sentence) string pairs.
      data_cfg: data config providing `src_lang`, `tgt_lang` and the `<pad>`
        index under `vocab.special_symbol2index`.

    Returns:
      A `(src_batch, tgt_batch)` tuple as produced by `pad_sequence`, with the
      shorter sequences filled up with the `<pad>` index.
  """
  to_src = text_transform[data_cfg.src_lang]
  to_tgt = text_transform[data_cfg.tgt_lang]
  pad_index = data_cfg.vocab.special_symbol2index['<pad>']

  # Strip the trailing newline of every sentence before transforming it.
  pairs = [
      (to_src(src_sample.rstrip('\n')), to_tgt(tgt_sample.rstrip('\n')))
      for src_sample, tgt_sample in batch
  ]
  src_tensors = [pair[0] for pair in pairs]
  tgt_tensors = [pair[1] for pair in pairs]

  # Pad with the <pad> index (1 in this notebook's config).
  return (
      pad_sequence(src_tensors, padding_value=pad_index),
      pad_sequence(tgt_tensors, padding_value=pad_index),
  )

def get_collate_fn(cfg: DictConfig):
  """Return `collate_fn` with `data_cfg` bound to the `data` section of `cfg`."""
  data_section = cfg.data
  return partial(collate_fn, data_cfg=data_section)

# 5. Data loader definition
def get_multi30k_dataloader(
    split_mode: str, language_pair, batch_size: int, collate_fn
):
  """
    Create a DataLoader over one split of Multi30k.

    Args:
      split_mode: name of the split passed to `Multi30k` (e.g. 'test').
      language_pair: (source language, target language) pair passed to `Multi30k`.
      batch_size: number of sentence pairs per batch.
      collate_fn: callable that turns a list of raw samples into a batch.

    Returns:
      A `torch.utils.data.DataLoader` over the map-style version of the split.
  """
  split_iter = Multi30k(split=split_mode, language_pair=language_pair)
  return torch.utils.data.DataLoader(
      to_map_style_dataset(split_iter),
      batch_size=batch_size,
      collate_fn=collate_fn,
  )

# Small test-split loader (3 pairs per batch) for inspecting the pipeline output.
test_dataloader = get_multi30k_dataloader(
    split_mode='test',
    language_pair=(data_cfg.src_lang, data_cfg.tgt_lang),
    batch_size=3,
    collate_fn=partial(collate_fn, data_cfg=data_cfg),
)

In [37]:
# Show the first collated (source, target) batch of the test loader, if any.
first_batch = next(iter(test_dataloader), None)
if first_batch is not None:
  print(first_batch)

(tensor([[   2,    2,    2],
        [   6,    6,    6],
        [  13, 3690,   28],
        [  11, 4018,    8],
        [   7,   87,    7],
        [ 179,   44, 2874],
        [ 109,    0, 3030],
        [   9,  121,   21],
        [  17,   29,  296],
        [  79,    7,   11],
        [   0,   53,    7],
        [   5,  328, 4946],
        [   3,    5,    5],
        [   1,    3,    3]]), tensor([[   2,    2,    2],
        [   7,    7,    7],
        [  13, 3375,   34],
        [   8, 4933,    8],
        [  29,   11,  873],
        [  92,   83,  235],
        [  69,   10, 3454],
        [2671, 2603,    4],
        [  21,   52,  344],
        [ 123,  102,   15],
        [   6,    8,    4],
        [   3,   45,   45],
        [   1,   14,  841],
        [   1,    4,    6],
        [   1,   25,    3],
        [   1,  275,    1],
        [   1,    6,    1],
        [   1,    3,    1]]))
