In [9]:
# Mount Google Drive at /content/drive/ (Colab only); the dataset paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [10]:
!pip install pandas
!pip install transformers






In [11]:
import torch
import pandas as pd
from transformers import *
from IPython.display import clear_output


PRETRAINED_MODEL_NAME = "bert-base-chinese"  # pretrained BERT-base checkpoint for Chinese (traditional and simplified)

# Fetch the tokenizer that belongs to this pretrained model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Wipe whatever this cell has printed so far so only the summary below remains
clear_output()
print("PyTorch 版本：", torch.__version__)

# The tokenizer's vocabulary; its length is the number of known tokens
vocab = tokenizer.vocab
print("字典大小：", len(vocab))

PyTorch 版本： 1.5.0+cu101
字典大小： 21128


In [0]:
from torch.utils.data import Dataset



# Training text files. SuicideDataset labels rows from the "pos" file with 0
# and rows from the "neg" file with 1.
# NOTE(review): "suicdie_dataset" looks like a misspelling of "suicide", but it
# is a real path on Drive — rename the folder and these strings together, if at all.
pos_train_filename = "/content/drive/My Drive/colab/bert/suicdie_dataset/normal.txt"
neg_train_filename = "/content/drive/My Drive/colab/bert/suicdie_dataset/die.txt"

# Test text files; they are loaded without labels.
pos_test_filename="/content/drive/My Drive/colab/bert/suicdie_dataset/normal_test.txt"
neg_test_filename="/content/drive/My Drive/colab/bert/suicdie_dataset/die_test.txt"

# Every file is read as a single column with this name.
col_name=['content']


class SuicideDataset(Dataset):
    
  def __init__(self, mode, tokenizer):

    assert mode in ["train", "test"]  
    self.mode = mode
    
    if mode =="train":

      pos_train_df = pd.read_table(pos_train_filename,names=col_name).sample(50)
      neg_train_df = pd.read_table(neg_train_filename,names=col_name).sample(50)
  
      pos_train_df['label'] = 0
      neg_train_df['label'] = 1

      self.train_df = pd.concat([pos_train_df,neg_train_df], axis=0, ignore_index=True)
      self.train_df = self.train_df.sample(len(self.train_df))
      self.len = len(self.train_df)
    else:
      pos_test_df = pd.read_table(pos_test_filename,names=col_name)
      neg_test_df = pd.read_table(neg_test_filename,names=col_name)
      self.test_df = pd.concat([pos_test_df,neg_test_df], axis=0, ignore_index=True)
      self.test_df = self.test_df.sample(len(self.test_df))
      self.len = len(self.test_df)
      self.p_len=len(pos_test_df)
      self.n_len=len(neg_test_df)

    self.tokenizer = tokenizer  
    
  #@pysnooper.snoop()
  def __getitem__(self, idx):

    if self.mode == "test":
      text= self.test_df.iloc[idx, :].values
      label_tensor = None
    else:
      text,label = self.train_df.iloc[idx, :].values
      
      label_id = label
      label_tensor = torch.tensor(label_id).float()
            
    text=str(text)
    word_pieces = ["[CLS]"]
    tokens = self.tokenizer.tokenize(text)
    word_pieces += tokens + ["[SEP]"]
    len_t= len(word_pieces)
        
    
    
    ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
    tokens_tensor = torch.tensor(ids).float()
    
    
    segments_tensor = torch.tensor([0] * len_t).float()
                                        
        
    return (tokens_tensor, segments_tensor, label_tensor) #要特別注意這三個參數的資料型態
    
  def __len__(self):
      return self.len

# Build the training and test datasets

trainset = SuicideDataset("train", tokenizer=tokenizer)
testset = SuicideDataset("test", tokenizer=tokenizer)

In [15]:
# Pick the first sample
sample_idx = 0

# Pull the raw text and label out of the training frame for comparison
text,label = trainset.train_df.iloc[sample_idx].values

#print(testset.test_df)

#testset.test_df.apply(toStr)
# Fetch the same index from the test set; these three names are overwritten
# by the trainset lookup a few lines below.
tokens_tensor, segments_tensor, label_tensor = testset[sample_idx]

#print(tokens_tensor)
# Use the Dataset built above to get the converted id tensors

tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

#print(type(tokens_tensor[0].item()))
#print(type(segments_tensor[0].item()))
#print(type(label_tensor.item()))

# Show the tensor type of each returned tensor
print(tokens_tensor.type())
print(segments_tensor.type())
print(label_tensor.type())
# Map tokens_tensor back to text
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

# Show the sample before and after conversion; this is just a print, so read the cell output
print(f"""[原始文本]
句子 1：{text}

分類  ：{label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

torch.FloatTensor
torch.FloatTensor
torch.FloatTensor
[原始文本]
句子 1：#天天向上# 【今晚十点见】#周五不下班# 今天大舅妈换衣服了吗？[doge].................................换了！！！[憧憬][憧憬][憧憬]@王鸥Angel 

分類  ：0

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([  101.,   108.,  1921.,  1921.,  1403.,   677.,   108.,   523.,   791.,
         3241.,  1282.,  4157.,  6224.,   524.,   108.,  1453.,   758.,   679.,
          678.,  4408.,   108.,   791.,  1921.,  1920.,  5643.,  1968.,  2940.,
         6132.,  3302.,   749.,  1408.,  8043.,   138., 13030.,  8154.,   140.,
          119.,   119.,   119.,   119.,   119.,   119.,   119.,   119.,   119.,
          119.,   119.,   119.,   119.,   119.,   119.,   119.,   119.,   119.,
          119.,   119.,   119.,   119.,   119.,   119.,   119.,   119.,   119.,
          119.,   119.,   119.,   119.,   119.,   119.,  2940.,   749.,  8013.,
         8013.,  8013.,   138.,  2735.,  2739.,   140.,   138.,  2735.,  2739.,
          140.,   138.,  2735.,  2739.,   140.

In [0]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples, max_len=300):
  """Collate a list of dataset items into one zero-padded mini-batch.

  Args:
    samples: list of `(tokens_tensor, segments_tensor, label_tensor)` tuples;
      `label_tensor` is None for unlabelled (test) samples.
    max_len: samples whose token tensor has `max_len` or more entries are
      dropped from the batch. The original author's note says BERT accepts at
      most 512 tokens; the default of 300 is the original cut-off.

  Returns:
    `(tokens_tensors, segments_tensors, masks_tensors, label_ids)` where the
    first three are float tensors of shape (kept_samples, longest_kept_length)
    and `label_ids` is a stacked tensor of labels, or None when the samples
    carry no labels.

  Raises:
    ValueError: if no sample is left after the length filter (including an
      empty `samples` list), since there is nothing to pad or stack.
  """
  # Drop over-long samples (length counts every entry of the token tensor).
  kept = [s for s in samples if len(s[0]) < max_len]
  if not kept:
    raise ValueError(
        "create_mini_batch: no samples left in the batch after dropping "
        "those with %d or more tokens" % max_len)

  # Zero-pad every sequence to the longest one in the batch.
  tokens_tensors = pad_sequence([s[0] for s in kept], batch_first=True)
  segments_tensors = pad_sequence([s[1] for s in kept], batch_first=True)

  # Labels exist only for training samples; check a sample that was kept so
  # the check and the stack below look at the same list.
  if kept[0][2] is not None:
    label_ids = torch.stack([s[2] for s in kept])
  else:
    label_ids = None

  # Attention mask: 1 where the token id is non-zero (real token), 0 on the
  # zero padding, so BERT only attends to real tokens.
  masks_tensors = (tokens_tensors != 0).float()

  return tokens_tensors, segments_tensors, masks_tensors, label_ids


# Build a DataLoader that yields 16 training samples per batch.
# The key piece is `collate_fn`, which merges a list of samples into one mini-batch.
BATCH_SIZE = 16
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [17]:
# Pull the first mini-batch out of the training loader
data = next(iter(trainloader))

# The collate function returns these four items, in this order
tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data
# Length of the first row of the token tensor
print(len(data[0][0]))
# Print the shape and contents of every tensor in the batch
print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")

230

tokens_tensors.shape   = torch.Size([16, 230]) 
tensor([[ 101.,  108., 1921.,  ...,    0.,    0.,    0.],
        [ 101., 2769., 3297.,  ...,    0.,    0.,    0.],
        [ 101., 2207., 4432.,  ...,    0.,    0.,    0.],
        ...,
        [ 101., 3209., 3209.,  ...,    0.,    0.,    0.],
        [ 101., 2769., 6444.,  ...,    0.,    0.,    0.],
        [ 101.,  100., 4376.,  ...,    0.,    0.,    0.]])
------------------------
segments_tensors.shape = torch.Size([16, 230])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
------------------------
masks_tensors.shape    = torch.Size([16, 230])
tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1

In [18]:
# Use a single scalar value as the model output
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 1

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME,
    num_labels=NUM_LABELS,
)

clear_output()

# High-level listing of the modules inside this model
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name != "bert":
        # Everything outside the BERT body is printed in full.
        print("{:15} {}".format(name, module))
        continue
    # For the BERT body, list only the names of its direct sub-modules.
    for n, _ in module.named_children():
        print(f"{name}:{n}")


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=1, bias=True)


In [0]:
# NOTE(review): `config` is bound to an empty dict here and is not read
# anywhere in the visible notebook — confirm it can be removed.
config={}

# Show the loaded model's configuration as the cell output
model.config

In [0]:


def get_predictions(model, dataloader, compute_acc=False, threshold=0.7, verbose=True):
  """Run `model` over `dataloader` and collect thresholded 0/1 predictions.

  The first output of the model is passed through a sigmoid; a sample is
  predicted as 1 when that value is greater than `threshold`, else 0.

  Args:
    model: a torch module called as
      `model(input_ids=..., token_type_ids=..., attention_mask=...)` whose
      first output holds one logit per sample.
    dataloader: iterable of `(tokens, segments, masks, labels)` batches;
      `labels` may be None when `compute_acc` is False.
    compute_acc: when True, also compare predictions with the labels and
      return the accuracy.
    threshold: sigmoid cut-off for predicting label 1 (original value: 0.7).
    verbose: when True (default, matching the original behaviour) print the
      per-batch sigmoid outputs, thresholded predictions and 0/1 counts.

  Returns:
    `predictions` — a flat list of ints (0 or 1), one per sample in iteration
    order — or `(predictions, acc)` when `compute_acc` is True. `acc` is 0.0
    if the loader yielded no samples.

  Raises:
    ValueError: if `compute_acc` is True but a batch carries no labels.
  """
  predictions = []
  correct = 0
  total = 0

  # Remember the current mode so that calling this function in the middle of
  # training does not leave the model stuck in eval mode (dropout disabled).
  was_training = model.training
  model.eval()
  device = next(model.parameters()).device

  try:
    with torch.no_grad():
      for data in dataloader:
        # Move tensors to wherever the model lives; keep None placeholders so
        # positions in `data` stay stable.
        data = [t.to(device) if t is not None else None for t in data]
        tokens_tensors, segments_tensors, masks_tensors = data[:3]

        outputs = model(input_ids=tokens_tensors.long(),
                        token_type_ids=segments_tensors.long(),
                        attention_mask=masks_tensors.long())
        logits = outputs[0]

        probs = torch.sigmoid(logits)
        # Values above the threshold become label 1, everything else label 0.
        pred_labels = (probs > threshold).float()

        if verbose:
          print(probs)
          print(pred_labels)
          ones = int(pred_labels.sum().item())
          print("z:", pred_labels.numel() - ones, "o:", ones)

        # Flatten to one value per sample (also correct for a batch of one).
        pred_labels = pred_labels.view(-1)
        predictions.extend(int(p) for p in pred_labels.tolist())

        if compute_acc:
          labels = data[3]
          if labels is None:
            raise ValueError("compute_acc=True requires labelled batches")
          total += labels.size(0)
          correct += (pred_labels == labels.view(-1)).sum().item()
  finally:
    model.train(was_training)

  if compute_acc:
    # Accuracy over every sample seen; guard against an empty loader.
    acc = correct / total if total else 0.0
    return predictions, acc
  return predictions
    



In [20]:
# Put the model on the first GPU when CUDA is available, otherwise keep it on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)


device: cuda:0


In [0]:
# Accuracy on the training loader before any fine-tuning
_, acc = get_predictions(model, trainloader, compute_acc=True) 
print("classification acc:", acc) # sanity check that prediction runs end to end

In [0]:
# Use the Adam optimizer to update every parameter of the classification model
optimizer = torch.optim.Adam(model.parameters(), lr=1.0e-4)

EPOCHS = 6  # original author's "lucky number"
for epoch in range(EPOCHS):
    # get_predictions() switches the model to eval mode while it runs, so
    # training mode is re-enabled at the start of every epoch; otherwise only
    # the first epoch would train with dropout active.
    model.train()

    sum_t = 0            # samples seen so far in this epoch (progress display)
    running_loss = 0.0   # sum of the batch losses in this epoch
    for data in trainloader:

        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        sum_t += len(tokens_tensors)

        print("epoch:"+str(epoch)+","+str((sum_t)/len(trainloader.dataset)*100)+"%")
        # Reset the parameter gradients
        optimizer.zero_grad()

        # Forward pass; passing `labels` makes the model return the loss first
        outputs = model(input_ids=tokens_tensors.long(),
                        token_type_ids=segments_tensors.long(),
                        attention_mask=masks_tensors.long(),
                        labels=labels)

        loss = outputs[0]
        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Accumulate the current batch loss
        running_loss += loss.item()

    # Classification accuracy on the training loader after this epoch
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

In [0]:
# Rebuild the test dataset and wrap it in a loader of 300 samples per batch
testset = SuicideDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=300, 
                        collate_fn=create_mini_batch)

sample_idx=0
# Raw row of the first test sample
text= testset.test_df.iloc[sample_idx].values
#print(text)
# NOTE(review): this indexes `trainset`, not `testset` — looks like a
# copy-paste slip; the three values are not used later in the visible notebook.
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]
# Row counts of the two test files
print(testset.p_len)
print(testset.n_len)
#print(text)
#tokens = tokenizer.tokenize(text)
#testset[sample_idx]


In [0]:
# Run the trained model over the test loader (no labels, so no accuracy)
p = get_predictions(model, testloader)

# Use the trained model to predict on the test dataset
