# Imports

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [9

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

import json
import numpy as np
import pandas as pd

# Data Preparation

## One Hot Encoding


In [None]:
def OneHotEncoding(label):
  """Convert raw topic labels into 6-dimensional multi-hot vectors.

  Every entry of `label` is either a plain topic value or a JSON string
  (recognised by containing '{') whose 'choices' field holds the topic(s).
  Topics are folded into six classes before encoding: exam/course topics
  become '課業', a fixed list of everyday-life topics becomes '生活', the
  remaining top-5 topics are kept and everything else becomes '其他'.

  Args:
    label: iterable of raw labels.

  Returns:
    A list with one float array of length 6 per entry, holding 1 at the
    position of each class the entry belongs to, in the order
    ['課業', '蓋樓', '生活', '問題', '食物', '其他'].
  """
  classes = ['課業', '蓋樓', '生活', '問題', '食物', '其他']
  # The five most frequent topics keep their own class.
  kept_topics = classes[:5]
  # Topics merged into an existing class.
  study_like = ['考試', '選課']
  life_like = ['遊戲', '天氣', '宿舍', '疫情', '動漫', '假期', '活動', '拔', '節日']

  def canonical(topic):
    # Map one raw topic onto the class it is counted under.
    if topic in study_like:
      return '課業'
    if topic in life_like:
      return '生活'
    return topic if topic in kept_topics else '其他'

  vectors = []
  for raw in label:
    # JSON-encoded entries carry their topic(s) under 'choices'.
    parsed = json.loads(raw)['choices'] if '{' in raw else raw

    if type(parsed) is list:
      # List labels are rewritten in place, so a list supplied directly by
      # the caller ends up holding the normalised topics as well.
      parsed[:] = [canonical(topic) for topic in parsed]
      topics = parsed
    else:
      topics = [canonical(parsed)]

    vector = np.zeros(6)
    for topic in topics:
      vector[classes.index(topic)] = 1
    vectors.append(vector)

  return vectors

# DataSet

## DataSet Define

In [None]:
class Dataset():
  """Minimal map-style dataset pairing each text with its label vector.

  Only `__getitem__` and `__len__` are provided.

  NOTE(review): this class reuses the name `Dataset`, so from this cell on it
  replaces the `Dataset` imported from `torch.utils.data` in the import cell.
  """

  def __init__(self, Content, Label, num=None):
    """Store the samples.

    Args:
      Content: indexable collection of input texts.
      Label: indexable collection of labels, aligned with `Content`.
      num: number of samples reported by `len()`. Defaults to
        `len(Content)` when omitted.
    """
    self.x = Content
    self.y = Label
    self.n_samples = len(Content) if num is None else num

  def __getitem__(self, index):
    """Return the `(text, label)` pair stored at `index`."""
    return self.x[index], self.y[index]

  def __len__(self):
    """Return the number of samples in the dataset."""
    return self.n_samples


## DataSet create

In [None]:
# Read the raw data; the 'content' column holds the text of each sample.
df = pd.read_csv('NCU_dataset.csv')
content = df['content'].tolist()

# Encode the 'sentiment' column as multi-hot label vectors.
label = OneHotEncoding(df['sentiment'])

# Hold out 30% of the samples for evaluation (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(
    content, label, test_size=0.3, random_state=1
)

# Float entries (presumably NaN from empty cells - verify) are not usable as
# text, so they are replaced by a placeholder string.
x_train = ['No Content' if type(text) is float else text for text in x_train]
x_test = ['No Content' if type(text) is float else text for text in x_test]

# Wrap both splits so they can be fed to a DataLoader.
train_set = Dataset(x_train, y_train, len(x_train))
test_set = Dataset(x_test, y_test, len(x_test))

## DataLoader

In [None]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(dataset=train_set, batch_size=16, shuffle=True)
# Evaluation only sums per-sample results, so shuffling adds nothing; a fixed
# order keeps evaluation runs directly comparable.
test_dataloader = DataLoader(dataset=test_set, batch_size=16, shuffle=False)


# Model



## import package

In [None]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup,  BertTokenizer
import torch.nn as nn


## Use GPU

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Hyperparameters

In [None]:
# Initial learning rate handed to the optimizer.
learnrate = 1e-4
# Number of passes over the training data.
epoch = 10

## Model Define

In [None]:
# Pretrained Chinese BERT with a freshly initialised 6-way classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=6)
model.to(device)

# Tokenizer matching the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Optimizer. `transformers.AdamW` is deprecated in favour of the PyTorch
# implementation; `eps` and `weight_decay` are set explicitly to the defaults of
# the deprecated class so the optimisation settings stay the same.
optim = torch.optim.AdamW(model.parameters(), lr=learnrate, eps=1e-6, weight_decay=0.0)

# Multi-label loss: an independent sigmoid + binary cross-entropy per class.
criterion = nn.BCEWithLogitsLoss()

# Learning-rate schedule: with zero warm-up steps the rate decays linearly from
# `learnrate` to 0 over all optimizer steps (one step per training batch).
total_steps = len(train_dataloader) * epoch
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps = 0, num_training_steps = total_steps)

Downloading pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Training Function

In [None]:
def train(model, iterator, optimizer, criterion, total, device):
  """Run one training epoch and return the summed batch loss.

  Uses the notebook-level `tokenizer` to encode each batch and steps the
  notebook-level `scheduler` once per batch.

  Args:
    model: sequence-classification model whose output exposes `.logits`.
    iterator: iterable yielding `(sentences, labels)` batches.
    optimizer: optimizer updating the parameters of `model`.
    criterion: loss called as `criterion(logits, labels)`.
    total: unused; kept so existing callers keep working.
    device: device the batch tensors are moved to.

  Returns:
    The sum of the per-batch losses over the epoch, as a Python float.
  """
  model.train()
  train_loss = 0.0
  for sentences, labels in iterator:
    # Tokenize the sentences.
    encoding = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)

    # Move the batch to the target device.
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    labels = labels.to(device)

    # Forward pass. Labels are not passed to the model: the loss comes from
    # `criterion` below, so a loss computed inside the model would be wasted.
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask=attention_mask)

    # BCEWithLogitsLoss applies the sigmoid itself, so it takes raw logits.
    # Targets are cast to the logits' dtype so both operands agree.
    loss = criterion(outputs.logits, labels.to(outputs.logits.dtype))

    # Back-propagate the gradients.
    loss.backward()

    # Clip the gradient norm to guard against exploding gradients.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Update the parameters and advance the learning-rate schedule.
    optimizer.step()
    scheduler.step()

    # Accumulate a plain float; adding the tensor itself would keep autograd
    # history attached to the running total for the whole epoch.
    train_loss += loss.item()

  # Report the epoch result.
  print('train_loss: %f' % (train_loss))
  print(scheduler.get_last_lr())
  return train_loss

# Testing Function

In [None]:
def test(model, iterator, optimizer, criterion, total, device, threshold=0.3):
  """Evaluate the model and print accuracy plus per-class precision/recall.

  Uses the notebook-level `tokenizer`, `acc_calculate` and `CMC`.

  Args:
    model: sequence-classification model whose output exposes `.logits`.
    iterator: iterable yielding `(sentences, labels)` batches.
    optimizer: unused; kept so existing callers keep working.
    criterion: unused; kept so existing callers keep working.
    total: number of evaluated samples; the summed score is divided by it.
    device: device the batch tensors are moved to.
    threshold: a class is predicted when its sigmoid probability is strictly
      greater than this value.

  Returns:
    The summed `acc_calculate` score divided by `total` (0.0 when `total`
    is 0).
  """
  model.eval()

  encoding_label = ['課業', '蓋樓', '生活', '問題', '食物', '其他']
  num_classes = len(encoding_label)

  # Running totals over the whole evaluation set.
  acc_batch = 0
  tp = [0] * num_classes
  fp = [0] * num_classes
  fn = [0] * num_classes
  tn = [0] * num_classes

  with torch.no_grad():
    for sentences, labels in iterator:
      # Tokenize the sentences.
      encoding = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)

      # Move the batch to the target device.
      input_ids = encoding['input_ids'].to(device)
      attention_mask = encoding['attention_mask'].to(device)
      labels = labels.to(device)

      # Per-class probabilities.
      outputs = model(input_ids, attention_mask=attention_mask)
      prob = outputs.logits.sigmoid()

      # Binarise: 1 where the probability exceeds the threshold, else 0.
      prediction = (prob > threshold).to(prob.dtype)

      # Add this batch's summed per-sample score.
      acc_batch += acc_calculate(prediction, labels)

      # Update the per-class confusion counters in place.
      CMC(tp, fp, fn, tn, prediction, labels)

  # Average the summed score over the number of evaluated samples.
  acc = acc_batch / total if total else 0.0

  # Per-class precision and recall; a class with an empty denominator gets 0.
  precision = []
  recall = []
  for i in range(num_classes):
    predicted_pos = tp[i] + fp[i]
    actual_pos = tp[i] + fn[i]
    precision.append(tp[i] / predicted_pos if predicted_pos != 0 else 0)
    recall.append(tp[i] / actual_pos if actual_pos != 0 else 0)

  # Print the result.
  print('test_acc: %f' % (acc))
  for i in range(num_classes):
    print('%s precision: %f recall: %f' % (encoding_label[i], precision[i], recall[i]))

  return acc


## Acc Calculate

In [None]:
def acc_calculate(preds, labels):
  """Sum the per-sample intersection-over-union of predicted and true labels.

  For each sample the score is
  (#classes that are 1 in both) / (#classes that are 1 in either).
  A sample where neither side has any positive class counts as a perfect
  match (1.0) instead of raising a division-by-zero error.

  Args:
    preds: batch of 0/1 prediction vectors (nested sequence, or any object
      with `tolist()` such as a tensor or array).
    labels: batch of 0/1 label vectors, aligned with `preds`.

  Returns:
    The sum of the per-sample scores; callers divide by the sample count.
  """
  # Convert tensors/arrays to nested lists once, so the loops below compare
  # plain Python numbers instead of indexing the container element by element.
  if hasattr(preds, 'tolist'):
    preds = preds.tolist()
  if hasattr(labels, 'tolist'):
    labels = labels.tolist()

  correct = 0
  for pred, label in zip(preds, labels):
    inter = sum(1 for p, t in zip(pred, label) if p == 1 and t == 1)
    union = sum(1 for p, t in zip(pred, label) if p == 1 or t == 1)
    correct += inter / union if union else 1.0
  return correct


## Confusion Matrix Calculate

In [None]:
# Per-class counter order: 課業, 蓋樓, 生活, 問題, 食物, 其他
def CMC(tp, fp, fn, tn, preds, labels):
  """Update per-class confusion-matrix counters in place.

  For every sample and class index `i`:
    prediction 1, label 1 -> true positive  (`tp[i]`)
    prediction 1, label 0 -> false positive (`fp[i]`)
    prediction 0, label 1 -> false negative (`fn[i]`)
    prediction 0, label 0 -> true negative  (`tn[i]`)

  Args:
    tp, fp, fn, tn: mutable per-class counter lists, incremented in place.
    preds: batch of 0/1 prediction vectors (nested sequence, or any object
      with `tolist()` such as a tensor or array).
    labels: batch of 0/1 label vectors, aligned with `preds`.
  """
  # Convert tensors/arrays to nested lists once so the comparisons below work
  # on plain Python numbers.
  if hasattr(preds, 'tolist'):
    preds = preds.tolist()
  if hasattr(labels, 'tolist'):
    labels = labels.tolist()

  for pred, label in zip(preds, labels):
    for i in range(len(label)):
      predicted, actual = pred[i], label[i]
      if predicted == 1 and actual == 1:
        tp[i] += 1
      elif predicted == 1 and actual == 0:
        fp[i] += 1
      elif predicted == 0 and actual == 1:
        # A missed positive is a false negative.
        fn[i] += 1
      elif predicted == 0 and actual == 0:
        # A correctly rejected class is a true negative.
        tn[i] += 1

# Start Training

In [None]:
for epoch_idx in range(epoch):
  print("===== Epoch %i =====" % epoch_idx)

  # One pass over the training split.
  print("Training started ...")
  train(
      model=model,
      iterator=train_dataloader,
      optimizer=optim,
      criterion=criterion,
      total=len(train_dataloader),
      device=device,
  )

  # Evaluate on the held-out split after every epoch.
  print("Testing started ...")
  test(
      model=model,
      iterator=test_dataloader,
      optimizer=optim,
      criterion=criterion,
      total=len(x_test),
      device=device,
  )

===== Epoch 0 =====
Training started ...
train_loss: 103.401580
[9e-05]
Testing started ...
test_acc: 0.791932
課業 precision: 0.817857 recall: 0.282716
蓋樓 precision: 0.936925 recall: 0.272866
生活 precision: 0.759442 recall: 0.612752
問題 precision: 0.839506 recall: 0.135404
食物 precision: 0.864706 recall: 0.114441
其他 precision: 0.682243 recall: 0.029142
===== Epoch 1 =====
Training started ...
train_loss: 66.815268
[8e-05]
Testing started ...
test_acc: 0.807625
課業 precision: 0.815436 recall: 0.296341
蓋樓 precision: 0.952000 recall: 0.271896
生活 precision: 0.782819 recall: 0.594943
問題 precision: 0.760314 recall: 0.154738
食物 precision: 0.868098 recall: 0.110504
其他 precision: 0.619883 recall: 0.042282
===== Epoch 2 =====
Training started ...
train_loss: 50.251184
[7e-05]
Testing started ...
test_acc: 0.823662
課業 precision: 0.871698 recall: 0.278649
蓋樓 precision: 0.959294 recall: 0.269333
生活 precision: 0.834862 recall: 0.563218
問題 precision: 0.766791 recall: 0.162966
食物 precision: 0.802993 recall

# Save Model

In [None]:
# Write the whole fine-tuned model object to the mounted Drive folder.
# NOTE(review): saving the full object (rather than a state dict) pickles the
# model class too, so loading needs a compatible library version - confirm
# this is intended.
name = 'Bahamut_NCU.pt'
path = f"/content/drive/MyDrive/專題/{name}"
torch.save(model, path)