<a href="https://colab.research.google.com/github/newfull5/AI-Project/blob/master/nlp_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wandb transformers datasets torch tqdm

In [None]:
import argparse

def _get_parser():  
    parser = argparse.ArgumentParser()
    parser.add_argument()
    return parser
   

# Run configuration shared by the dataset, model, trainer and loader cells.
args = argparse.Namespace(
  **{
    # Checkpoints handed to the model / tokenizer `from_pretrained` calls.
    "model_name": "klue/bert-base",
    "tokenizer_name": "klue/bert-base",
    # Name handed to `load_dataset`.
    "dataset_name": "nsmc",
    # Validation runs every `val_check_step` training steps.
    "val_check_step": 2000,
    "batch_size": 8,
    "max_epochs": 10,
    # NOTE(review): `patient` is not read anywhere in the visible notebook.
    "patient": 3,
    "lr": 3e-5,
  }
)

In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from datasets import load_dataset

class NsmcDataset(Dataset):
  """Tokenized documents and labels for one stage of the configured dataset.

  Stages:
    'train' -- the dataset's 'train' split, unsliced.
    'valid' -- the first 25000 rows of the 'test' split.
    'test'  -- the remaining rows of the 'test' split.

  All documents are tokenized eagerly in the constructor.

  Args:
    args: configuration object providing `dataset_name` and `tokenizer_name`.
    stage: one of 'train', 'valid' or 'test'.

  Raises:
    ValueError: if `stage` is not one of the supported names.
  """

  # Row index at which the 'test' split is divided into 'valid' and 'test'.
  _VALID_SIZE = 25000

  # stage name -> (split to read, slice applied to that split or None for all rows)
  _STAGES = {
    'train': ('train', None),
    'valid': ('test', slice(None, _VALID_SIZE)),
    'test': ('test', slice(_VALID_SIZE, None)),
  }

  def __init__(self, args, stage):
    super().__init__()
    self.stage = stage
    self.args = args
    self.documents, self.labels = self._get_data(args, stage)

  def _get_data(self, args, stage):
    """Load the rows for `stage` and return (tokenized documents, labels)."""
    # Validate first so a typo in the stage name fails immediately instead of
    # after the dataset and tokenizer have been fetched.
    if stage not in self._STAGES:
      raise ValueError("you can set stage only 'train', 'test' or 'valid'")
    split, part = self._STAGES[stage]

    dataset = load_dataset(args.dataset_name)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

    documents = dataset[split]['document']
    labels = dataset[split]['label']
    if part is not None:
      documents, labels = documents[part], labels[part]

    return [self._preproc(tokenizer, text) for text in documents], labels

  def _preproc(self, tokenizer, text):
    """Tokenize a single document and return the tokenizer's output.

    No explicit max_length is given, so padding/truncation fall back to the
    tokenizer's own maximum length (presumably the model's limit -- confirm
    against the tokenizer configuration).
    """
    tokenized = tokenizer(
      text=text,
      padding='max_length',
      truncation=True,
      return_tensors='pt'
    )
    return tokenized

  def __len__(self):
    """Number of examples in this stage."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return the (tokenized document, label) pair at position `idx`."""
    return self.documents[idx], self.labels[idx]

In [None]:
from transformers import AutoModelForSequenceClassification
import torch
from torch import nn

class Model(nn.Module):
  """Two-label sequence classifier bundled with its loss and device placement.

  Args:
    args: configuration object providing `model_name`; an optional `save_dir`
      attribute is used as the default checkpoint directory for save()/load().
  """

  def __init__(self, args):
    super(Model, self).__init__()
    self.model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=2)
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.criterion = nn.CrossEntropyLoss()
    # Default checkpoint directory; stays None unless args provides one or
    # save() is called with an explicit directory.
    self.save_dir = getattr(args, 'save_dir', None)
    self.model.to(self.device)

  def forward(self, batch):
    """Run the classifier on one batch and compute the cross-entropy loss.

    Args:
      batch: `(inputs, labels)` where `inputs` maps 'input_ids' and
        'attention_mask' to tensors and `labels` holds the class indices.
        The caller's `inputs` mapping is left unmodified.

    Returns:
      tuple: `(logits, loss)`.
    """
    inputs, labels = batch
    input_ids = self._move_to_cuda(self._drop_singleton_dim(inputs['input_ids']))
    attention_mask = self._move_to_cuda(self._drop_singleton_dim(inputs['attention_mask']))
    labels = self._move_to_cuda(labels)

    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )
    loss = self.criterion(outputs.logits, labels)

    return outputs.logits, loss

  @staticmethod
  def _drop_singleton_dim(tensor):
    """Turn a (batch, 1, seq) tensor into (batch, seq); leave other shapes alone.

    Only axis 1 is removed, so a batch of size 1 keeps its batch axis (a bare
    `squeeze()` would collapse it as well).
    """
    if tensor.dim() == 3 and tensor.size(1) == 1:
      return tensor.squeeze(1)
    return tensor

  def _move_to_cuda(self, inputs):
    """Recursively move tensors (inside lists/dicts too) to `self.device`."""
    if torch.is_tensor(inputs):
      return inputs.to(self.device)
    elif isinstance(inputs, list):
      return [self._move_to_cuda(x) for x in inputs]
    elif isinstance(inputs, dict):
      return {key: self._move_to_cuda(value) for key, value in inputs.items()}
    else:
      return inputs

  def _resolve_dir(self, save_dir):
    """Pick the explicit directory if given, else `self.save_dir`; fail if neither."""
    target = save_dir if save_dir is not None else self.save_dir
    if target is None:
      raise ValueError("no checkpoint directory: pass save_dir or set self.save_dir")
    return target

  def save(self, save_dir=None):
    """Write the wrapped model with `save_pretrained`.

    Args:
      save_dir: destination directory; defaults to `self.save_dir`.

    Raises:
      ValueError: if no directory is given and `self.save_dir` is unset.
    """
    target = self._resolve_dir(save_dir)
    self.model.save_pretrained(target)
    # Remember the location so a later load() without arguments restores it.
    self.save_dir = target

  def load(self, save_dir=None):
    """Restore the wrapped model's weights from a `save_pretrained` directory.

    Args:
      save_dir: checkpoint directory; defaults to `self.save_dir`.

    Raises:
      ValueError: if no directory is given and `self.save_dir` is unset.
    """
    source = self._resolve_dir(save_dir)
    # Read the checkpoint through from_pretrained so the on-disk weight file
    # name/format is not hard-coded, then copy the weights into the existing
    # module so its parameters keep their identity and device.
    restored = AutoModelForSequenceClassification.from_pretrained(source)
    self.model.load_state_dict(restored.state_dict())

In [None]:
from tqdm import tqdm
import wandb

class Trainer:
  """Training loop with periodic validation, wandb logging and checkpointing.

  Args:
    args: configuration object providing `max_epochs`, `val_check_step`, `lr`.
    model: module whose call returns `(logits, loss)` for a batch and which
      exposes `save()` reading its destination from `model.save_dir`.
    train_loader: iterable of training batches.
    valid_loader: iterable of validation batches of the form `(inputs, labels)`.
  """

  def __init__(self, args, model, train_loader, valid_loader):
    self.model = model
    self.args = args
    self.train_loader = train_loader
    self.valid_loader = valid_loader
    self.max_epochs = args.max_epochs
    self.val_check_step = args.val_check_step
    self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=args.lr)
    self.epoch = 0
    self.global_steps = 0

  def train_step(self):
    """Run one pass over `train_loader`.

    Every `val_check_step` steps (except step 0) the running mean training
    loss of the current epoch is logged and `valid_step` is executed.
    """
    self.model.train()
    total_train_loss = 0.0
    train_steps = 0
    # Wrap the loader itself (not enumerate) so tqdm can read its length.
    progress = tqdm(self.train_loader, desc=f'train epoch: {self.epoch}')
    for step, batch in enumerate(progress):
      _, loss = self.model(batch)
      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()
      # Accumulate so the logged value is the mean over the steps so far.
      total_train_loss += float(loss)
      train_steps += 1
      self.global_steps += 1

      if step % self.val_check_step == 0 and step != 0:
        wandb.log({
            'train_loss': (total_train_loss / train_steps)
            })
        self.valid_step()

    self.epoch += 1

  def valid_step(self):
    """Evaluate on `valid_loader`, log the metrics and save a checkpoint.

    The model's train/eval mode is restored afterwards, so calling this in the
    middle of an epoch does not leave the model in eval mode. Nothing is logged
    or saved when the validation loader yields no batches.
    """
    was_training = self.model.training
    self.model.eval()
    total_val_loss = 0.0
    val_steps = 0
    pred = []
    labels = []
    try:
      with torch.no_grad():
        for batch in self.valid_loader:
          logits, loss = self.model(batch)
          total_val_loss += float(loss)
          val_steps += 1

          _, label = batch
          labels += label.tolist()
          pred += logits.argmax(dim=1).tolist()
    finally:
      self.model.train(was_training)

    if val_steps == 0:
      return

    tp, tn, fp, fn = self._confusion_matrix(pred, labels)
    precision = self._calc_precision(tp, tn, fp, fn)
    recall = self._calc_recall(tp, tn, fp, fn)
    f1_score = self._calc_f1(precision, recall)

    wandb.log({
        'val_loss': (total_val_loss / val_steps),
        'val_precision': precision,
        'val_recall': recall,
        'val_f1score': f1_score,
        'steps': self.global_steps
        })

    # The model's save() takes its destination from `model.save_dir`, so the
    # checkpoint directory is set there before saving.
    self.model.save_dir = f"./nsmc_ckpt_steps_{self.global_steps}/"
    self.model.save()

  def _confusion_matrix(self, pred, labels):
    """Count (tp, tn, fp, fn) for binary predictions against binary labels.

    Predictions other than 0 or 1 are ignored.
    """
    tp, tn, fp, fn = 0, 0, 0, 0

    for predicted, actual in zip(pred, labels):
      correct = predicted == actual
      if predicted == 1:
        if correct:
          tp += 1
        else:
          fp += 1
      elif predicted == 0:
        if correct:
          tn += 1
        else:
          fn += 1

    return tp, tn, fp, fn

  def _calc_precision(self, tp, tn, fp, fn):
    """tp / (tp + fp); returns 1 when nothing was predicted positive."""
    if tp + fp == 0:
      return 1
    return tp / (tp + fp)

  def _calc_recall(self, tp, tn, fp, fn):
    """tp / (tp + fn); returns 1 when there are no positive labels."""
    if tp + fn == 0:
      return 1
    return tp / (tp + fn)

  def _calc_f1(self, precision, recall):
    """Harmonic mean of precision and recall; 0 when both are 0."""
    if precision + recall == 0:
      return 0
    return 2 * precision * recall / (precision + recall)

  def fit(self):
    """Run `train_step` once per epoch for `args.max_epochs` epochs."""
    for _ in range(self.args.max_epochs):
      self.train_step()

In [None]:
from torch.utils.data import DataLoader

def _make_loader(stage, shuffle):
  """Build a DataLoader over the NsmcDataset of `stage` using the shared `args`."""
  return DataLoader(
      dataset=NsmcDataset(args, stage),
      batch_size=args.batch_size,
      shuffle=shuffle,
  )


# Only the training data is shuffled. The evaluation metrics do not depend on
# batch order, so the validation/test loaders keep a fixed order.
train_loader = _make_loader('train', shuffle=True)
valid_loader = _make_loader('valid', shuffle=False)
test_loader = _make_loader('test', shuffle=False)



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Build the classifier wrapper from the shared run configuration.
model = Model(args)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [None]:
# Wire the configuration, the model and the train/validation loaders into the
# training loop (arguments in the order Trainer.__init__ declares them).
trainer = Trainer(args, model, train_loader, valid_loader)

In [None]:
import wandb

# Pass the name by keyword: the first positional parameter of wandb.init is
# not `project`, so a bare string would not select the project.
wandb.init(project='Topic Classification')

In [None]:
# Start training: one train_step per epoch for args.max_epochs epochs.
trainer.fit()

train epoch: 0: 2000it [40:46,  1.22s/it]


AttributeError: ignored