<a href="https://colab.research.google.com/github/newfull5/AI-Project/blob/master/nlp_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wandb transformers datasets torch tqdm

In [None]:
import argparse
  
def _get_parser():  
    parser = argparse.ArgumentParser()
    parser.add_argument()
    return parser


# Notebook-wide run configuration: model/tokenizer/dataset identifiers,
# output directory, and the training hyper-parameters.
args = argparse.Namespace(**{
  "model_name": "klue/bert-base",
  "tokenizer_name": "klue/bert-base",
  "dataset_name": "nsmc",
  "save_dir_path": "./",
  "val_check_step": 2000,
  "batch_size": 4,
  "max_epochs": 3,
  "patient": 3,
  "lr": 3e-5,
})

In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from datasets import load_dataset

class NsmcDataset(Dataset):
  """Map-style dataset of tokenized documents paired with their labels.

  The dataset named by ``args.dataset_name`` is loaded with ``load_dataset``
  and every document of the requested stage is tokenized up front with the
  tokenizer named by ``args.tokenizer_name``.

  Stages:
    * ``'train'`` - the whole ``train`` split.
    * ``'valid'`` - the first 25000 rows of the ``test`` split.
    * ``'test'``  - the remaining rows of the ``test`` split.

  Args:
    args: configuration object providing ``dataset_name``,
      ``tokenizer_name`` and ``save_dir_path``.
    stage: one of ``'train'``, ``'valid'`` or ``'test'``.

  Raises:
    ValueError: if ``stage`` is not one of the three supported values.
  """

  # Row index at which the 'test' split is cut into validation / test parts.
  _VALID_TEST_CUT = 25000

  def __init__(self, args, stage):
    super().__init__()
    self.stage = stage
    self.args = args
    self.save_dir = args.save_dir_path
    self.documents, self.labels = self._get_data(args, stage)

  def _get_data(self, args, stage):
    """Return ``(tokenized_documents, labels)`` for ``stage``."""
    # Validate first so a typo fails immediately instead of after the
    # dataset and tokenizer have been downloaded. ValueError is still an
    # Exception, so existing ``except Exception`` handlers keep working.
    if stage not in ('train', 'valid', 'test'):
      raise ValueError("you can set stage only 'train', 'test' or 'valid'")

    dataset = load_dataset(args.dataset_name)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

    cut = self._VALID_TEST_CUT
    if stage == 'train':
      documents = dataset['train']['document']
      labels = dataset['train']['label']
    elif stage == 'valid':
      documents = dataset['test']['document'][:cut]
      labels = dataset['test']['label'][:cut]
    else:  # stage == 'test'
      documents = dataset['test']['document'][cut:]
      labels = dataset['test']['label'][cut:]

    return [self._preproc(tokenizer, text) for text in documents], labels

  def _preproc(self, tokenizer, text):
    """Tokenize one document, padded to max length and truncated, as 'pt' tensors."""
    return tokenizer(
      text=text,
      padding='max_length',
      truncation=True,
      return_tensors='pt'
    )

  def __len__(self):
    """Number of examples, taken from the label list."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return the ``(tokenized_document, label)`` pair at ``idx``."""
    return self.documents[idx], self.labels[idx]

In [None]:
from torch.utils.data import DataLoader

# Build one shuffled DataLoader per stage, in the order train -> valid -> test.
# Each NsmcDataset is constructed (and therefore tokenized) as the generator
# is consumed by the tuple unpacking.
train_loader, valid_loader, test_loader = (
    DataLoader(
        dataset=NsmcDataset(args, stage),
        batch_size=args.batch_size,
        shuffle=True,
    )
    for stage in ('train', 'valid', 'test')
)

Downloading builder script:   0%|          | 0.00/3.18k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading and preparing dataset nsmc/default to /root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.33M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/150000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset nsmc downloaded and prepared to /root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from transformers import AutoModelForSequenceClassification
import torch
from torch import nn

class Model(nn.Module):
  """Two-label sequence classifier wrapping a pretrained transformer.

  The wrapped network is loaded with
  ``AutoModelForSequenceClassification.from_pretrained(args.model_name,
  num_labels=2)`` and placed on CUDA when available, otherwise on CPU.

  Args:
    args: configuration object providing ``model_name`` and ``save_dir_path``.
  """

  def __init__(self, args):
    super().__init__()
    self.model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=2)
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Inputs are moved to self.device in forward(); the network has to live
    # on the same device or the forward pass fails on a GPU machine.
    self.model.to(self.device)
    self.save_dir = args.save_dir_path
    self.criterion = nn.CrossEntropyLoss()

  def forward(self, batch):
    """Run the classifier on one batch and compute the cross-entropy loss.

    Args:
      batch: ``(inputs, labels)`` where ``inputs`` is a mapping with
        ``'input_ids'`` and ``'attention_mask'`` tensors and ``labels`` is a
        tensor of class indices.

    Returns:
      tuple: ``(logits, loss)``.
    """
    inputs, labels = batch
    # Both tensors get the same treatment so their shapes always agree, and
    # the caller's mapping is left untouched.
    input_ids = self._as_batch_matrix(self._move_to_cuda(inputs['input_ids']))
    attention_mask = self._as_batch_matrix(self._move_to_cuda(inputs['attention_mask']))
    labels = self._move_to_cuda(labels)

    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )
    loss = self.criterion(outputs.logits, labels)

    return outputs.logits, loss

  @staticmethod
  def _as_batch_matrix(tensor):
    """Collapse every leading axis into one, keeping the last axis.

    Batches may arrive with an extra singleton axis (presumably because each
    example was tokenized with ``return_tensors='pt'`` before collation).
    Unlike ``squeeze()``, this keeps the batch axis when the batch holds a
    single example.
    """
    return tensor.reshape(-1, tensor.size(-1))

  def _move_to_cuda(self, inputs):
    """Recursively move tensors (also inside lists/dicts) to ``self.device``."""
    if torch.is_tensor(inputs):
      return inputs.to(self.device)
    elif isinstance(inputs, list):
      return [self._move_to_cuda(x) for x in inputs]
    elif isinstance(inputs, dict):
      return {key: self._move_to_cuda(value) for key, value in inputs.items()}
    else:
      return inputs

  def save(self):
    """Write the wrapped network to ``self.save_dir`` via ``save_pretrained``."""
    self.model.save_pretrained(self.save_dir)

  def load(self):
    """Restore the weights previously written by :meth:`save`.

    The checkpoint is read with ``from_pretrained`` rather than a hard-coded
    ``pytorch_model.bin`` path, so it works whichever weight file format
    ``save_pretrained`` produced and whether or not ``save_dir`` ends with a
    path separator. Weights are copied into the existing network in place,
    so objects already holding its parameters (e.g. an optimizer) stay valid.
    """
    restored = AutoModelForSequenceClassification.from_pretrained(self.save_dir)
    self.model.load_state_dict(restored.state_dict())
    self.model.to(self.device)

In [None]:
from tqdm import tqdm

class Trainer:
  """Minimal training loop around a model that returns ``(logits, loss)``.

  Args:
    args: configuration object providing ``max_epochs``, ``val_check_step``
      and ``lr``.
    model: module called as ``model(batch)`` and returning ``(logits, loss)``.
    train_loader: iterable of training batches.
    valid_loader: iterable of validation batches; when it yields no batches
      validation is skipped.
  """

  def __init__(self, args, model, train_loader, valid_loader):
    self.model = model
    self.train_loader = train_loader
    self.valid_loader = valid_loader
    self.max_epochs = args.max_epochs
    self.val_check_step = args.val_check_step
    self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=args.lr)
    self.epoch = 0

  @staticmethod
  def _num_batches(loader):
    """Return ``len(loader)`` or ``None`` when the loader has no length."""
    try:
      return len(loader)
    except TypeError:
      return None

  def train_step(self):
    """Run one pass over ``train_loader``, validating every ``val_check_step`` steps."""
    self.model.train()
    progress = tqdm(
        enumerate(self.train_loader),
        total=self._num_batches(self.train_loader),
        desc=f'train epoch: {self.epoch}',
    )
    for step, batch in progress:
      _, loss = self.model(batch)
      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()

      if step % self.val_check_step == 0 and step != 0:
        metrics = self.valid_step()
        if metrics is not None:
          tqdm.write(f'step {step}: valid loss={metrics["loss"]:.4f} '
                     f'accuracy={metrics["accuracy"]:.4f}')
        # valid_step() switches the model to eval mode; switch back.
        self.model.train()

  def valid_step(self):
    """Evaluate on ``valid_loader`` without tracking gradients.

    Returns:
      dict | None: ``{'loss': mean batch loss, 'accuracy': fraction of
      correct predictions}``, or ``None`` when the loader yields no batches.
    """
    self.model.eval()
    loss_sum = 0.0
    num_correct = 0
    num_examples = 0
    num_batches = 0
    progress = tqdm(
        self.valid_loader,
        total=self._num_batches(self.valid_loader),
        desc='valid step',
    )
    with torch.no_grad():
      for batch in progress:
        logits, loss = self.model(batch)
        # The second element of a batch holds the labels, mirroring how the
        # model unpacks it.
        labels = batch[1].to(logits.device)
        loss_sum += loss.item()
        num_correct += (logits.argmax(dim=-1) == labels).sum().item()
        num_examples += labels.numel()
        num_batches += 1

    if num_batches == 0:
      return None
    return {'loss': loss_sum / num_batches, 'accuracy': num_correct / num_examples}

  def fit(self):
    """Train for ``max_epochs`` epochs, validating after each one."""
    for epoch in range(self.max_epochs):
      self.epoch = epoch
      self.train_step()
      metrics = self.valid_step()
      if metrics is not None:
        tqdm.write(f'epoch {epoch}: valid loss={metrics["loss"]:.4f} '
                   f'accuracy={metrics["accuracy"]:.4f}')
    return

In [None]:
# Instantiate the classifier wrapper from the notebook-wide `args` settings.
model = Model(args)

In [None]:
# Fit on the training split and validate on the validation split, so the
# test split stays held out and is never used for fitting.
trainer = Trainer(
    args=args,
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader
)

In [None]:
# Start training with the trainer configured above.
trainer.fit()

In [None]:
# Debug aid: display the parameter tensor at position 100 of the model's
# parameter list.
list(model.parameters())[100]

In [None]:
# Emits an empty line; placeholder cell.
print()




In [None]:
import random

['F']

In [None]:
# Simulation: keep drawing children at random and tally both sexes until one
# million females have been drawn.
# M -> male
# F -> female

num_male = 0
num_female = 0

while num_female < 1_000_000:
  # random.sample(..., 1) yields a one-element list holding 'M' or 'F'.
  element = random.sample(['M', 'F'], 1)
  if element[0] == 'F':
    num_female += 1
  else:
    num_male += 1

In [None]:
# Display the number of males drawn by the simulation.
num_male

1000123

In [None]:
# Display the number of females drawn (the loop's stopping target).
num_female

1000000