In [1]:
#Method is use BERT classifier
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

#Firstly, use pandas to sepearte text and label
df_train = pd.read_csv('train_data.txt', sep=";", header=None)
df_val = pd.read_csv('val_data.txt', sep=";", header=None)
#rename column to text and elabel
df_train.columns=['text','elabel']
df_val.columns=['text','elabel']


In [3]:
#convert text to token by BertTokenizer
from transformers import BertTokenizer

#load berttokenizaer, 12-layer BERT model, with uncased vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids_train, attention_masks_train = [], []

#tokenize ['text'] in df_train 
for text in df_train['text']:
    encoded_sent = tokenizer.encode_plus(
            text=text,                      #text sentence 
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=150,                  #max lenght sentense input =150
            pad_to_max_length=True,         #Pad sentence to max length
            return_attention_mask=True      #Return attention mask
            )
        
    input_ids_train.append(encoded_sent.get('input_ids'))
    attention_masks_train.append(encoded_sent.get('attention_mask'))

#to tensor
input_ids_train = torch.tensor(input_ids_train)
attention_masks_train = torch.tensor(attention_masks_train)


#tokenize ['text'] in df_val 
input_ids_val, attention_masks_val = [], []

for text in df_val['text']:
    encoded_sent = tokenizer.encode_plus(
            text=text,                      #text sentence 
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=150,                  #max lenght sentense input =150
            pad_to_max_length=True,         #Pad sentence to max length
            return_attention_mask=True      #Return attention mask
            )

    input_ids_val.append(encoded_sent.get('input_ids'))
    attention_masks_val.append(encoded_sent.get('attention_mask'))
    
#to tensor
input_ids_val = torch.tensor(input_ids_val)
attention_masks_val = torch.tensor(attention_masks_val)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [4]:
#convert elabel to tensor at test and val dataset, labelencoder assign the category by number set at list below, this list will use again 
#in gui to get what number is what emotion
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df_val['elabel'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)
val_label = torch.tensor(le.fit_transform(df_val['elabel'].values))
train_label = torch.tensor(le.fit_transform(df_train['elabel'].values))
#print(le.inverse_transform(val_label))
#print(le.inverse_transform(train_label))

{'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}


In [5]:
#set up cuda if available, elase use cpu

if torch.cuda.is_available():       
    device = torch.device("cuda")
    #deafult use gpu0
    print("Use cuda device:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("use cpu")

Use cuda device: NVIDIA GeForce RTX 3060 Laptop GPU


In [6]:
batch_size=16    #can run on 6gbvram

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#create dataloader for train and val
train_data = TensorDataset(input_ids_train, attention_masks_train, train_label)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(input_ids_val, attention_masks_val, val_label)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)


In [7]:
# create classifier by bert
from transformers import BertModel

class emotionClassifier(nn.Module):
    def __init__(self):
         super(emotionClassifier, self).__init__()
         self.bert = BertModel.from_pretrained('bert-base-uncased')     #use bert-base-uncased

        #Classifier use hidden neron fc 768 ->48 relu -> 6 (6 label)
         self.classifier = nn.Sequential(
            nn.Linear(768, 48),
            nn.ReLU(),
            nn.Linear(48, 6)
        )
    
    def forward(self, input_ids, attention_mask):
        # input to BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        # get [CLS] at last hidden layer output for classifier to use
        cls_from_bert = outputs[0][:, 0, :]

        # put cls_from_bert into our classifier
        result = self.classifier(cls_from_bert)

        return result


In [8]:
#Optimizatize the model

from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs):
    #start classifier
    bert_classifier = emotionClassifier()
    bert_classifier.to(device)  #put classifier into device choosed before, gpu for my case
    #use AdamW optimization
    optimizer = AdamW(bert_classifier.parameters(),
                    lr=5e-5,    # learning rate
                    eps=1e-8    # epsilon value
                    )

    total_training_steps = len(train_dataloader) * epochs      #define training step
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_training_steps)
    
    return bert_classifier, optimizer, scheduler

In [9]:

loss_function = nn.CrossEntropyLoss()       #use crossentropyloss

def train(model, train_dataloader, epochs):
    print("training initialized")
    print("##############################")
    train_time_start = time.time()      #time for whole trining
    
    for i in range(epochs):
        epochs_no = i+1
        print(str(epochs_no)+"/"+str(epochs)+"epochs")
        train_time_epochs_start = time.time()   #time for each epochs
        total_train_loss = 0
        
        #train mode
        model.train()
        batch_no=0

        accuracy_list_train = []

        for batch in train_dataloader:
            batch_no+=1
            #batch to device
            batch_input, batch_mask, batch_label = tuple(v.to(device) for v in batch)
            model.zero_grad()       #zero gradident, reset it

            #forward pass, return result
            result = model(batch_input, batch_mask)

            #calculate loss from result and label
            batch_label = batch_label.to(torch.int64)     #to int64 or it crash
            loss = loss_function(result, batch_label)
            total_train_loss += loss.item()

            #backward pass
            loss.backward()
            #clip norm to 1.0, prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            #update parameter
            optimizer.step()
            #update learning rate
            scheduler.step()

            classified_result = torch.argmax(result, dim=1).flatten()
            #calculate the accuracy on batch train set
            accuracy = (classified_result == batch_label).cpu().numpy().mean() * 100
            accuracy_list_train.append(accuracy)

            if (batch_no % 250 == 0 and batch_no != 0):     #each 250 batch report once time
                print("batch number at:"+str(batch_no))
                time_used_in_batch = time.time()-train_time_start
                print("@"+str(time_used_in_batch)+"s from start training")
        
            
        average_loss = total_train_loss/len(train_dataloader)
        time_used = time.time()-train_time_epochs_start
        print("//////////////////")
        print("Average loss (train_set) = "+str(average_loss))
        print("Performance on training dataset: Accuracy = "+str(np.mean(accuracy_list_train)))
        print("Time (this epochs) = "+str(time_used))
        print("//////////////////")
    
    time_used = time.time()-train_time_start
    print("Training complete, time used = "+str(time_used))


        


In [10]:

def evaluate(model, val_dataloader):        #evalute the model
    model.eval()        #evaluate mode

    loss_list = []
    accuracy_list = []
    
    for i in val_dataloader:
        batch_input, batch_mask, batch_label = tuple(v.to(device) for v in i)

        #get result
        with torch.no_grad():
            result = model(batch_input, batch_mask)

        batch_label = batch_label.to(torch.int64)     #to int64 or it crash        

        #get loss
        loss  = loss_function(result, batch_label)
        loss_list.append(loss.item())

        #get classified label in number
        classified_result = torch.argmax(result, dim=1).flatten()
        #calculate the accuracy on batch val set
        accuracy = (classified_result == batch_label).cpu().numpy().mean() * 100
        accuracy_list.append(accuracy)

    #calculate and print loss and accuracy on valid set
    print("Performance on validation dataset: Accuracy = "+str(np.mean(accuracy_list))+", Loss = "+str(np.mean(loss_list)))



In [11]:
bert_classifier, optimizer, scheduler = initialize_model(epochs=4)     #initializa model
train(bert_classifier, train_dataloader, epochs=4)                      #train model for 4 epochs



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


training initialized
##############################
1/4epochs
batch number at:250
@77.84821248054504s from start training
batch number at:500
@154.73544096946716s from start training
batch number at:750
@232.0023000240326s from start training
batch number at:1000
@309.53027153015137s from start training
//////////////////
Average loss (train_set) = 0.4546272170562297
Performance on training dataset: Accuracy = 86.0125
Time (this epochs) = 309.53027153015137
//////////////////
2/4epochs
batch number at:250
@387.10116839408875s from start training
batch number at:500
@465.68584275245667s from start training
batch number at:750
@543.1851859092712s from start training
batch number at:1000
@620.6781048774719s from start training
//////////////////
Average loss (train_set) = 0.14608690163097343
Performance on training dataset: Accuracy = 94.23125
Time (this epochs) = 311.1488342285156
//////////////////
3/4epochs
batch number at:250
@698.2098064422607s from start training
batch number at:500

In [12]:

#creat checkpoint
checkpoint_state = {'state_dict': bert_classifier.state_dict(), 'optimizer': optimizer.state_dict()}
torch.save(checkpoint_state, 'checkpoint.pt')

In [13]:
#if need load the checkpoint, uncomment below three lines
#loaded_checkpoint = torch.load('checkpoint.pt')
#bert_classifier.load_state_dict(loaded_checkpoint['state_dict'])
#optimizer.load_state_dict(loaded_checkpoint['optimizer'])

In [14]:
#evaluate by val_data
evaluate(bert_classifier, val_dataloader)

Performance on validation dataset: Accuracy = 93.95, Loss = 0.211594626782462
