<a href="https://colab.research.google.com/github/newfull5/AI-Project/blob/master/nlp_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wandb transformers datasets torch tqdm

In [None]:
import argparse

def _get_parser():
    """Build the command-line parser for the training hyper-parameters.

    Every option is optional; the defaults are the same values the notebook
    stores in its ``args`` namespace, so ``_get_parser().parse_args([])``
    yields an equivalent configuration.

    Returns:
        argparse.ArgumentParser: parser exposing ``--model_name``,
        ``--tokenizer_name``, ``--dataset_name``, ``--save_dir_path``,
        ``--val_check_step``, ``--batch_size``, ``--max_epochs``,
        ``--patient`` and ``--lr``.
    """
    parser = argparse.ArgumentParser(
        description="Fine-tune a sequence classifier."
    )
    parser.add_argument("--model_name", type=str, default="klue/bert-base",
                        help="name or path of the pretrained model")
    parser.add_argument("--tokenizer_name", type=str, default="klue/bert-base",
                        help="name or path of the pretrained tokenizer")
    parser.add_argument("--dataset_name", type=str, default="nsmc",
                        help="dataset identifier passed to load_dataset")
    parser.add_argument("--save_dir_path", type=str, default="./",
                        help="directory used to save / load model weights")
    parser.add_argument("--val_check_step", type=int, default=50,
                        help="run validation every N training steps")
    parser.add_argument("--batch_size", type=int, default=4,
                        help="mini-batch size of the data loaders")
    parser.add_argument("--max_epochs", type=int, default=3,
                        help="number of training epochs")
    # Spelling kept as 'patient' to match the attribute name used by `args`.
    parser.add_argument("--patient", type=int, default=3,
                        help="patience setting (stored only)")
    parser.add_argument("--lr", type=float, default=3e-5,
                        help="optimizer learning rate")
    return parser
   

# Notebook-wide configuration: model / tokenizer / dataset identifiers, the
# checkpoint directory, and the training hyper-parameters.
args = argparse.Namespace(**{
    "model_name": "klue/bert-base",
    "tokenizer_name": "klue/bert-base",
    "dataset_name": "nsmc",
    "save_dir_path": "./",
    "val_check_step": 50,
    "batch_size": 4,
    "max_epochs": 3,
    "patient": 3,
    "lr": 3e-5,
})

In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from datasets import load_dataset

class NsmcDataset(Dataset):
  """Map-style dataset that yields ``(encoded_document, label)`` pairs.

  The whole requested split is tokenized eagerly in the constructor.
  'train' uses the hub 'train' split; the hub 'test' split is cut in two:
  its first ``_VALID_SIZE`` rows serve as 'valid' and the rest as 'test'.

  Args:
    args: configuration object; reads ``dataset_name``, ``tokenizer_name``
      and ``save_dir_path``.
    stage: one of 'train', 'valid' or 'test'.

  Raises:
    ValueError: if ``stage`` is not one of the supported names.
  """

  _STAGES = ('train', 'valid', 'test')
  # Row index at which the hub 'test' split is divided into valid / test.
  _VALID_SIZE = 25000

  def __init__(self, args, stage):
    super().__init__()
    self.stage = stage
    self.args = args
    self.save_dir = args.save_dir_path
    self.documents, self.labels = self._get_data(args, stage)

  def _get_data(self, args, stage):
    """Load the split for ``stage`` and return ``(encoded_documents, labels)``."""
    # Reject an unknown stage before downloading the dataset and tokenizer.
    if stage not in self._STAGES:
      raise ValueError("you can set stage only 'train', 'test' or 'valid'")

    dataset = load_dataset(args.dataset_name)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

    if stage == 'train':
      documents = dataset['train']['document']
      labels = dataset['train']['label']
    else:
      cut = self._VALID_SIZE
      rows = slice(None, cut) if stage == 'valid' else slice(cut, None)
      documents = dataset['test']['document'][rows]
      labels = dataset['test']['label'][rows]

    return [self._preproc(tokenizer, text) for text in documents], labels

  def _preproc(self, tokenizer, text):
    """Tokenize one text, padded to the maximum length and truncated.

    The tokenizer is called with a single text and ``return_tensors='pt'``,
    so each returned tensor keeps a leading dimension of size 1.
    """
    return tokenizer(
      text=text,
      padding='max_length',
      truncation=True,
      return_tensors='pt'
    )

  def __len__(self):
    """Number of samples in the selected split."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return the ``(encoded_document, label)`` pair stored at ``idx``."""
    return self.documents[idx], self.labels[idx]

In [None]:
from torch.utils.data import DataLoader

# All three splits are served with the same batch size and with shuffling on.
# NOTE(review): shuffling is also applied to the valid/test loaders; consider
# shuffle=False there if a fixed evaluation order is wanted.
_loader_options = dict(batch_size=args.batch_size, shuffle=True)

train_loader = DataLoader(NsmcDataset(args, 'train'), **_loader_options)
valid_loader = DataLoader(NsmcDataset(args, 'valid'), **_loader_options)
test_loader = DataLoader(NsmcDataset(args, 'test'), **_loader_options)

In [None]:
from transformers import AutoModelForSequenceClassification
import torch
from torch import nn

class Model(nn.Module):
  """Two-label sequence classifier wrapping a pretrained transformer.

  Args:
    args: configuration object; reads ``model_name`` (checkpoint to load)
      and ``save_dir_path`` (directory used by ``save`` / ``load``).

  The wrapped model is moved to CUDA when it is available, otherwise it
  stays on the CPU.
  """

  def __init__(self, args):
    super().__init__()
    self.model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=2)
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.save_dir = args.save_dir_path
    self.criterion = nn.CrossEntropyLoss()
    self.model.to(self.device)

  def forward(self, batch):
    """Run the classifier on one batch.

    Args:
      batch: ``(inputs, labels)`` where ``inputs`` is a mapping holding
        'input_ids' and 'attention_mask' tensors and ``labels`` holds the
        target class indices.

    Returns:
      tuple: ``(logits, loss)`` with the cross-entropy loss of the batch.
    """
    inputs, labels = batch
    # Work on local tensors so the caller's batch mapping is left untouched.
    input_ids = self._to_batch_matrix(inputs['input_ids'])
    attention_mask = self._to_batch_matrix(inputs['attention_mask'])
    labels = self._move_to_cuda(labels)

    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )
    loss = self.criterion(outputs.logits, labels)

    return outputs.logits, loss

  def _to_batch_matrix(self, tensor):
    """Collapse extra singleton axes to ``(batch, seq_len)`` and move to the device.

    Reshaping on the last axis keeps the batch dimension even for a batch of
    one sample, which an unqualified ``squeeze()`` would drop.
    """
    return self._move_to_cuda(tensor.reshape(-1, tensor.size(-1)))

  def _move_to_cuda(self, inputs):
    """Recursively move tensors (also inside lists / dicts) to ``self.device``.

    Values that are not tensors, lists or dicts are returned unchanged.
    """
    if torch.is_tensor(inputs):
      return inputs.to(self.device)
    elif isinstance(inputs, list):
      return [self._move_to_cuda(x) for x in inputs]
    elif isinstance(inputs, dict):
      return {key: self._move_to_cuda(value) for key, value in inputs.items()}
    else:
      return inputs

  def save(self):
    """Write the wrapped model to ``self.save_dir`` with ``save_pretrained``."""
    self.model.save_pretrained(self.save_dir)

  def load(self):
    """Restore the weights previously written by ``save``.

    The checkpoint is read back through ``from_pretrained`` instead of a
    hard-coded weight file name, so it does not depend on which file format
    ``save_pretrained`` produced or on a trailing slash in ``save_dir``.
    The weights are copied into the existing model in place, so parameter
    objects already handed to an optimizer stay valid.
    """
    restored = AutoModelForSequenceClassification.from_pretrained(self.save_dir)
    self.model.load_state_dict(restored.state_dict())

In [None]:
from tqdm import tqdm
import wandb

class Trainer:
  """Minimal training loop with periodic validation and W&B logging.

  Args:
    args: configuration object; reads ``max_epochs``, ``val_check_step``
      and ``lr``.
    model: callable module that returns ``(logits, loss)`` for a batch.
    train_loader: iterable of training batches.
    valid_loader: iterable of validation batches.
  """

  def __init__(self, args, model, train_loader, valid_loader):
    self.model = model
    self.train_loader = train_loader
    self.valid_loader = valid_loader
    self.max_epochs = args.max_epochs
    self.val_check_step = args.val_check_step
    self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=args.lr)
    self.epoch = 0

  def train_step(self):
    """Train for one pass over ``train_loader``.

    Every ``val_check_step`` steps (except step 0) the running mean of the
    training loss is logged as 'train_loss' and a full validation pass runs.
    """
    self.model.train()
    total_train_loss = 0.0
    train_steps = 0
    # Wrap the loader itself (not enumerate) so tqdm can show the total.
    progress = tqdm(self.train_loader, desc=f'train epoch: {self.epoch}')
    for step, batch in enumerate(progress):
      _, loss = self.model(batch)
      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()
      # Accumulate, so that the logged value is the mean over the steps so far.
      total_train_loss += float(loss)
      train_steps += 1

      if step % self.val_check_step == 0 and step != 0:
        wandb.log({'train_loss': (total_train_loss/train_steps)})
        self.valid_step(self.epoch)
        # valid_step leaves the model in eval mode; resume training mode.
        self.model.train()

  def valid_step(self, epoch):
    """Evaluate on ``valid_loader`` and log the mean loss as 'val_loss'.

    Args:
      epoch: accepted for interface compatibility; not used.

    Returns:
      float | None: mean validation loss, or None when the loader produced
      no batch (nothing is logged in that case).
    """
    self.model.eval()
    total_val_loss = 0.0
    val_steps = 0
    with torch.no_grad():
      for batch in tqdm(self.valid_loader, desc='valid step'):
        _, loss = self.model(batch)
        total_val_loss += float(loss)
        val_steps += 1

    if val_steps == 0:
      return None

    mean_val_loss = total_val_loss/val_steps
    wandb.log({'val_loss': mean_val_loss})
    return mean_val_loss

  def fit(self):
    """Run ``train_step`` once per epoch for ``max_epochs`` epochs."""
    for epoch in range(self.max_epochs):
      self.epoch = epoch
      self.train_step()

In [None]:
# Instantiate the classifier wrapper; this loads the pretrained checkpoint
# named by args.model_name and moves it to the GPU when one is available.
model = Model(args)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [None]:
# Train on the training split and validate on the validation split.
# The test loader is deliberately not used here, so the test split stays
# unseen during training.
trainer = Trainer(
    args=args,
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader
)

In [None]:
import wandb

# Name the project through the `project` keyword; a bare positional argument
# is bound to a different parameter of wandb.init.
wandb.init(project='Topic Classification')

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Start training; train/validation losses are reported through wandb.log.
trainer.fit()

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
valid step: 6120it [13:29,  7.51it/s][A
valid step: 6121it [13:29,  7.49it/s][A
valid step: 6122it [13:29,  7.49it/s][A
valid step: 6123it [13:29,  7.49it/s][A
valid step: 6124it [13:29,  7.50it/s][A
valid step: 6125it [13:29,  7.51it/s][A
valid step: 6126it [13:30,  7.54it/s][A
valid step: 6127it [13:30,  7.55it/s][A
valid step: 6128it [13:30,  7.54it/s][A
valid step: 6129it [13:30,  7.57it/s][A
valid step: 6130it [13:30,  7.53it/s][A
valid step: 6131it [13:30,  7.53it/s][A
valid step: 6132it [13:30,  7.51it/s][A
valid step: 6133it [13:31,  7.52it/s][A
valid step: 6134it [13:31,  7.52it/s][A
valid step: 6135it [13:31,  7.57it/s][A
valid step: 6136it [13:31,  7.58it/s][A
valid step: 6137it [13:31,  7.59it/s][A
valid step: 6138it [13:31,  7.51it/s][A
valid step: 6139it [13:31,  7.49it/s][A
valid step: 6140it [13:31,  7.50it/s][A
valid step: 6141it [13:32,  7.52it/s][A
valid step: 6142it [13:32,  7.53it/s][A
valid s