<a href="https://colab.research.google.com/github/obara13/ml-test/blob/master/pytorch-bert-classification-test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install pandas torch transformers

# Load training data

In [0]:
import pandas as pd

DATA_PATH = 'data/'
df_train = pd.read_csv(DATA_PATH + 'train.csv')
#df_train

# Remove unnecessary words 

In [0]:
import re

pattern = re.compile(r'https?://\S*|@\S*')

for i in range(df_train.shape[0]):
  df_train.loc[i, 'text']  = pattern.sub('', df_train.loc[i, 'text'])


# Check CPU or GPU

In [95]:
import torch
import torch.optim as optim
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cuda


# Load model

In [0]:
from transformers import BertTokenizer, BertConfig, BertModel, BertForSequenceClassification

#model_type = 'bert-base-multilingual-cased'
model_type = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_type)
#config = BertConfig.from_json_file(BERT_CONFIG_PATH)
model = BertForSequenceClassification.from_pretrained(model_type)

#print(model)


# Set classifier and bert last layer for trainable (require_grad = True), others for un-trainable.

In [0]:
for name, param in model.named_parameters():
    param.requires_grad = False

for name, param in model.bert.encoder.layer[-1].named_parameters():
    param.requires_grad = True

for name, param in model.classifier.named_parameters():
    param.requires_grad = True

# Prepare train data (pandas dataframe -> tensor dataloader)


In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

ids = []
labels = []
max_length = 32
batch_size = 32

for i in range(df_train.shape[0]):
  ids.append(tokenizer.encode(df_train.loc[i, 'text'],
                              max_length=max_length, pad_to_max_length=True))
  labels.append(df_train.loc[i, 'target'])

input_ids = torch.tensor(ids).to(device)
input_labels = torch.tensor(labels).to(device)

train_data = TensorDataset(input_ids, input_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


# Training

In [99]:
import time

t0 = time.time()

optimizer = optim.Adam(model.parameters())
epochs = 20

for epoch in range(epochs):
  total_loss = 0
  total_eval = 0
  correct = 0
  for step, batch in enumerate(train_dataloader):
    if step*batch_size/df_train.shape[0] < 0.7:  # train
      model.train().to(device)
      optimizer.zero_grad()
      outputs = model(batch[0], labels=batch[1])
      loss = outputs[0]
      total_loss += loss.item()
      loss.backward()
      optimizer.step()
      #if step % 100 == 0:
      #  print(step, ':%6.1f' % (time.time() - t0), ':', loss.item())  
    else:   # eval
      model.eval().to(device)
      with torch.no_grad():
        outputs = model(batch[0])
      for i, pred in enumerate(outputs[0]):
        total_eval += 1
        if pred.argmax().item() == batch[1][i].item():
          correct += 1

  print(epoch, ': %6.1f' % (time.time() - t0),
        ', loss: ', total_loss,
        ', accr: ', correct/total_eval)


0 :   10.8 , loss:  82.2471664249897 , accr:  0.8267959453503746
1 :   21.5 , loss:  71.35702998936176 , accr:  0.8289995592772146
2 :   32.1 , loss:  69.79170575737953 , accr:  0.8439841339797267
3 :   42.8 , loss:  65.35116255283356 , accr:  0.8329660643455267
4 :   53.3 , loss:  64.65896537899971 , accr:  0.8497135301895108
5 :   64.0 , loss:  64.34797659516335 , accr:  0.8497135301895108
6 :   74.6 , loss:  63.37654058635235 , accr:  0.8514764213309828
7 :   85.3 , loss:  58.50173069536686 , accr:  0.8391361833406787
8 :   95.9 , loss:  62.504255175590515 , accr:  0.8594094314676068
9 :  106.5 , loss:  60.94529316574335 , accr:  0.8501542529748788
10 :  117.2 , loss:  59.561195224523544 , accr:  0.8554429263992949
11 :  127.9 , loss:  58.41919145733118 , accr:  0.8501542529748788
12 :  138.7 , loss:  58.65693661570549 , accr:  0.8655795504627589
13 :  149.3 , loss:  57.52973356842995 , accr:  0.8682238871749669
14 :  160.0 , loss:  56.5433104634285 , accr:  0.872190392243279
15 :  