# Transformers Performance Comparison

This notebook aims to pick the best-performing transformer among several popular model families by:
- Fine-tuning each model with its recommended hyperparameters and comparing the resulting accuracy

In [None]:
import warnings
# Silence the three warning categories that would otherwise flood the
# notebook output (registered in the same order as before).
for _ignored_category in (DeprecationWarning, FutureWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

In [None]:
!pip3 install torch torchvision torchaudio

In [None]:
import torch
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

RANDOM_STATE = 1


#################### split data into train,dev,test##################
def train_dev_test(dataset, random_state=RANDOM_STATE):
    """Split a labelled dataset into train / dev / test partitions.

    10% of the rows are held out as the test set; 10% of the remaining rows
    become the dev set and the rest is the training set.

    Args:
        dataset: mapping-like object (e.g. a DataFrame) exposing the
            ``"original_text"`` and ``"label"`` columns.
        random_state: seed forwarded to both ``train_test_split`` calls.

    Returns:
        A 4-tuple ``(df, texts, labels, label_info)`` where
        ``df`` is a DataFrame with ``original_text``, ``label`` and ``id``
        columns built from the train and test rows,
        ``texts`` is ``(train_texts, dev_texts, test_texts)``,
        ``labels`` is ``(train_labels, dev_labels, test_labels)`` and
        ``label_info`` is ``(target_names, label2idx)``.
    """
    texts = list(dataset["original_text"])
    labels = list(dataset["label"])

    # Sort the distinct labels so the label -> index mapping is the same in
    # every session (plain set iteration order is not guaranteed for strings).
    target_names = sorted(set(labels))
    label2idx = {label: idx for idx, label in enumerate(target_names)}
    print(label2idx)

    # Honour the caller's seed; the global constant is only the default.
    rest_texts, test_texts, rest_labels, test_labels = train_test_split(
        texts, labels, test_size=0.1, random_state=random_state)
    train_texts, dev_texts, train_labels, dev_labels = train_test_split(
        rest_texts, rest_labels, test_size=0.1, random_state=random_state)

    print("Train size:", len(train_texts))
    print("Dev size:", len(dev_texts))
    print("Test size:", len(test_texts))

    # Create dataframe for coming issue analysis.
    # NOTE(review): the dev rows are not part of this frame -- confirm that
    # leaving them out is intended.
    df = pd.DataFrame()
    df['original_text'] = train_texts + test_texts
    df['label'] = train_labels + test_labels
    df['id'] = df.index
    return (
        df,
        (train_texts, dev_texts, test_texts),
        (train_labels, dev_labels, test_labels),
        (target_names, label2idx),
    )

In [None]:
import logging

# Timestamped INFO-level logging for the whole notebook, plus a logger
# named after the current module.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
logger = logging.getLogger(__name__)

#### Prepare the data

In [None]:
import pandas as pd
# Read the full training CSV, then keep a reproducible random subset
# (PORTION of the rows) to work with.
source_train_data_path = "./01_data/WikiLarge_Train.csv"
source_train_data = pd.read_csv(source_train_data_path)

RANDOM_STATE = 1
PORTION = 0.2
size = round(PORTION * len(source_train_data))
train_data = source_train_data.sample(random_state=RANDOM_STATE, n=size)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot the distribution of sentence lengths, measured in whitespace-separated
# words. Sentences of 5000 words or more are left out of the plot.
plt.style.use("ggplot")

plt.figure(figsize=(10, 8))
train_data['length'] = train_data['original_text'].apply(lambda x: len(x.split()))
# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay is
# the replacement recommended by seaborn.
sns.histplot(train_data.loc[train_data['length'] < 5000, 'length'], kde=True, stat="density")
plt.title('Frequence of sentences of a given length', fontsize=14)
plt.xlabel('length', fontsize=14)

In [None]:
# Use the longest sentence as the model input length, capped at 512 because
# BERT inputs should not be longer than that.
# NOTE(review): lengths are counted in whitespace-separated words, which is
# presumably lower than the sub-word token count -- confirm the cap is enough.
data_describe = train_data['original_text'].apply(lambda x: len(x.split())).describe()
print(data_describe)
MAX_SEQ_LENGTH = min(512, int(data_describe['max']))

In [None]:
# Split the sampled data into train / dev / test partitions and build the
# label <-> index lookup used by every model below.
(
    df_init,
    (train_texts, dev_texts, test_texts),
    (train_labels, dev_labels, test_labels),
    (target_names, label2idx),
) = train_dev_test(train_data, random_state=RANDOM_STATE)



In [None]:
import numpy as np
#################### Both class and the following function are used to prepare for input items##################

class BertInputItem(object):
    """One example encoded as fixed-length model inputs.

    Attributes (all set by ``__init__``):
        text: the original text of the example.
        input_ids: token ids, padded to the maximum sequence length.
        input_mask: 1 for real tokens, 0 for padding positions.
        segment_ids: segment (token type) ids, one per position.
        label_id: integer index of the example's label.
    """

    def __init__(self, text, input_ids, input_mask, segment_ids, label_id):
        self.text = text
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def convert_examples_to_inputs(example_texts, example_labels, label2idx, max_seq_length, tokenizer):
    """Encode texts and labels into a list of ``BertInputItem`` objects.

    Args:
        example_texts: iterable of raw texts.
        example_labels: iterable of labels, parallel to ``example_texts``.
        label2idx: mapping from label to integer index.
        max_seq_length: exact length of every produced sequence; longer
            inputs are truncated and shorter ones are padded.
        tokenizer: object exposing ``encode(text, add_special_tokens=...,
            truncation=..., max_length=...)`` and, optionally,
            ``pad_token_id``.

    Returns:
        A list with one ``BertInputItem`` per example.

    Raises:
        KeyError: if a label is missing from ``label2idx``.
    """
    # Pad with the tokenizer's own padding id when it declares one; 0 is only
    # the fallback (it is not the padding id of every vocabulary).
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = 0

    input_items = []
    for text, label in zip(example_texts, example_labels):
        # Let the tokenizer add its own special tokens: writing "[CLS]" and
        # "[SEP]" into the text duplicated them for BERT and injected plain
        # text for tokenizers that use different markers. Truncating inside
        # encode() also keeps the closing special token in place.
        input_ids = list(tokenizer.encode(str(text),
                                          add_special_tokens=True,
                                          truncation=True,
                                          max_length=max_seq_length))
        # Safety net for tokenizers that ignore the truncation arguments.
        input_ids = input_ids[:max_seq_length]

        real_length = len(input_ids)
        pad_length = max_seq_length - real_length

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * real_length + [0] * pad_length
        # All our tokens are in the first input segment (id 0).
        segment_ids = [0] * max_seq_length
        input_ids = input_ids + [pad_token_id] * pad_length

        input_items.append(
            BertInputItem(text=text,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label2idx[label]))

    return input_items


In [None]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

#################### convert data for model input ##################

def get_data_loader(features, max_seq_length, batch_size, shuffle=True):
    """Wrap encoded examples in a ``DataLoader``.

    Args:
        features: sequence of objects exposing ``input_ids``, ``input_mask``,
            ``segment_ids`` and ``label_id``.
        max_seq_length: unused; kept so existing callers keep working.
        batch_size: number of examples per batch.
        shuffle: whether the loader reshuffles the data on every pass.

    Returns:
        A ``DataLoader`` whose batches are 5-tuples
        ``(input_ids, input_mask, segment_ids, label_ids, sample_ids)``.
        ``sample_ids`` holds each record's position in ``features``.
    """
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    # Identify each record by its position in `features`.
    sample_ids = torch.arange(len(features), dtype=torch.long)
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, sample_ids)

    # dataloader tuning in https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html
    # Pinned memory only helps host-to-GPU copies, so request it only when
    # CUDA is available.
    return DataLoader(data,
                      shuffle=shuffle,
                      batch_size=batch_size,
                      num_workers=2,
                      pin_memory=torch.cuda.is_available())

In [None]:
from tqdm import trange
from tqdm.notebook import tqdm

def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradient tracking.

    Args:
        model: classifier called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose first two outputs are the loss and the logits.
        dataloader: iterable of 5-tuples ``(input_ids, input_mask,
            segment_ids, label_ids, sample_ids)`` of tensors.

    Returns:
        ``(eval_loss, correct_labels, predicted_labels)``: the mean batch
        loss and two NumPy arrays of label indices. ``eval_loss`` is NaN when
        the dataloader yields no batch.
    """
    model.eval()

    total_loss = 0.0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluation iteration"):
        input_ids, input_mask, _segment_ids, label_ids, _sample_ids = (
            t.to(device) for t in batch
        )

        with torch.no_grad():
            # token_type_ids are not passed here, so the same call works for
            # DistilBERT as well.
            batch_loss, logits = model(input_ids, attention_mask=input_mask,
                                       labels=label_ids)[:2]

        # Take the arg-max with torch and only then move the result to the CPU.
        predicted_labels.extend(logits.argmax(dim=1).cpu().tolist())
        correct_labels.extend(label_ids.cpu().tolist())

        total_loss += batch_loss.mean().item()
        nb_eval_steps += 1

    # An empty dataloader has no meaningful loss; report NaN instead of
    # raising ZeroDivisionError.
    eval_loss = total_loss / nb_eval_steps if nb_eval_steps else float("nan")

    return eval_loss, np.array(correct_labels), np.array(predicted_labels)

In [None]:
import os
from tqdm import trange
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [None]:
!pip3 install transformers

In [None]:
from transformers import AdamW,get_linear_schedule_with_warmup
import os
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Directory that receives the fine-tuned checkpoints, and the file-name
# suffix shared by all of them.
OUTPUT_DIR, MODEL_FILE_NAME = "./tmp/", "pytorch_model.bin"

In [None]:
from transformers import DistilBertTokenizer,DistilBertForSequenceClassification,\
                         BertTokenizer, BertForSequenceClassification,\
                         RobertaTokenizer, RobertaForSequenceClassification,\
                         XLMRobertaTokenizer, XLMRobertaForSequenceClassification,\
                         AlbertTokenizer, AlbertForSequenceClassification

In [None]:
# Registry of the models to compare: one row per model, filled in by the
# cells below. The hyper-parameter column is named "PARAMETERS" because that
# is the key every row and every reader uses.
df_transformers = pd.DataFrame(columns=["MODEL", "MODEL_NAME", "TOKENIZER", "CLASSIFIER",
                                        "OUTPUT_DIR", "MODEL_FILE_NAME", "PARAMETERS"])

In [None]:
# DistilBERT
model_name = "distilbert-base-uncased"
params={
        "GRADIENT_ACCUMULATION_STEPS":1,
        "NUM_TRAIN_EPOCHS":8,
        "LEARNING_RATE":2e-5,
        "WARMUP_PROPORTION":0.1,
        "MAX_GRAD_NORM":5,
        "MAX_SEQ_LENGTH":MAX_SEQ_LENGTH,
        "BATCH_SIZE":16,
        "NUM_WARMUP_STEPS":600
}
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# registers the model the same way on old and new pandas versions.
df_transformers = pd.concat(
    [df_transformers,
     pd.DataFrame([{"MODEL": "DISTILBERT",
                    "MODEL_NAME": model_name,
                    "TOKENIZER": DistilBertTokenizer,
                    "CLASSIFIER": DistilBertForSequenceClassification,
                    "OUTPUT_DIR": OUTPUT_DIR,
                    "MODEL_FILE_NAME": model_name + "_" + MODEL_FILE_NAME,
                    "PARAMETERS": params}])],
    ignore_index=True)

In [None]:
# BERT base
# NOTE(review): a learning rate of 6e-4 looks high for fine-tuning, and 24000
# warm-up steps may exceed the total number of optimizer steps -- confirm both.
model_name = "bert-base-uncased"
params={
        "GRADIENT_ACCUMULATION_STEPS":1,
        "NUM_TRAIN_EPOCHS":3,
        "LEARNING_RATE":6e-4,
        "WARMUP_PROPORTION":0.1,
        "MAX_GRAD_NORM":5,
        "MAX_SEQ_LENGTH":MAX_SEQ_LENGTH,
        "BATCH_SIZE":16,
        "NUM_WARMUP_STEPS":24000
}
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# registers the model the same way on old and new pandas versions.
df_transformers = pd.concat(
    [df_transformers,
     pd.DataFrame([{"MODEL": "BERT",
                    "MODEL_NAME": model_name,
                    "TOKENIZER": BertTokenizer,
                    "CLASSIFIER": BertForSequenceClassification,
                    "OUTPUT_DIR": OUTPUT_DIR,
                    "MODEL_FILE_NAME": model_name + "_" + MODEL_FILE_NAME,
                    "PARAMETERS": params}])],
    ignore_index=True)

In [None]:
# BERT large
# NOTE(review): a learning rate of 6e-4 looks high for fine-tuning, and 24000
# warm-up steps may exceed the total number of optimizer steps -- confirm both.
model_name = "bert-large-uncased"
params={
        "GRADIENT_ACCUMULATION_STEPS":1,
        "NUM_TRAIN_EPOCHS":3,
        "LEARNING_RATE":6e-4,
        "WARMUP_PROPORTION":0.1,
        "MAX_GRAD_NORM":5,
        "MAX_SEQ_LENGTH":MAX_SEQ_LENGTH,
        "BATCH_SIZE":16,
        "NUM_WARMUP_STEPS":24000
}
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# registers the model the same way on old and new pandas versions.
df_transformers = pd.concat(
    [df_transformers,
     pd.DataFrame([{"MODEL": "BERT-LARGE",
                    "MODEL_NAME": model_name,
                    "TOKENIZER": BertTokenizer,
                    "CLASSIFIER": BertForSequenceClassification,
                    "OUTPUT_DIR": OUTPUT_DIR,
                    "MODEL_FILE_NAME": model_name + "_" + MODEL_FILE_NAME,
                    "PARAMETERS": params}])],
    ignore_index=True)

In [None]:
# RoBERTa
# NOTE(review): a learning rate of 6e-4 looks high for fine-tuning, and 24000
# warm-up steps may exceed the total number of optimizer steps -- confirm both.
model_name = "roberta-base"
params={
        "GRADIENT_ACCUMULATION_STEPS":1,
        "NUM_TRAIN_EPOCHS":3,
        "LEARNING_RATE":6e-4,
        "WARMUP_PROPORTION":0.1,
        "MAX_GRAD_NORM":5,
        "MAX_SEQ_LENGTH":MAX_SEQ_LENGTH,
        "BATCH_SIZE":16,
        "NUM_WARMUP_STEPS":24000
}
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# registers the model the same way on old and new pandas versions.
df_transformers = pd.concat(
    [df_transformers,
     pd.DataFrame([{"MODEL": "ROBERTA",
                    "MODEL_NAME": model_name,
                    "TOKENIZER": RobertaTokenizer,
                    "CLASSIFIER": RobertaForSequenceClassification,
                    "OUTPUT_DIR": OUTPUT_DIR,
                    "MODEL_FILE_NAME": model_name + "_" + MODEL_FILE_NAME,
                    "PARAMETERS": params}])],
    ignore_index=True)

In [None]:
# XLM-RoBERTa
# NOTE(review): a learning rate of 6e-4 looks high for fine-tuning, and 24000
# warm-up steps may exceed the total number of optimizer steps -- confirm both.
model_name = "xlm-roberta-large"
params={
        "GRADIENT_ACCUMULATION_STEPS":1,
        "NUM_TRAIN_EPOCHS":3,
        "LEARNING_RATE":6e-4,
        "WARMUP_PROPORTION":0.1,
        "MAX_GRAD_NORM":5,
        "MAX_SEQ_LENGTH":MAX_SEQ_LENGTH,
        "BATCH_SIZE":16,
        "NUM_WARMUP_STEPS":24000
}
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# registers the model the same way on old and new pandas versions.
df_transformers = pd.concat(
    [df_transformers,
     pd.DataFrame([{"MODEL": "XLMROBERTA",
                    "MODEL_NAME": model_name,
                    "TOKENIZER": XLMRobertaTokenizer,
                    "CLASSIFIER": XLMRobertaForSequenceClassification,
                    "OUTPUT_DIR": OUTPUT_DIR,
                    "MODEL_FILE_NAME": model_name + "_" + MODEL_FILE_NAME,
                    "PARAMETERS": params}])],
    ignore_index=True)

In [None]:
# ALBERT
# NOTE(review): a learning rate of 6e-4 looks high for fine-tuning, and 24000
# warm-up steps may exceed the total number of optimizer steps -- confirm both.
model_name = "albert-base-v2"
params={
        "GRADIENT_ACCUMULATION_STEPS":1,
        "NUM_TRAIN_EPOCHS":3,
        "LEARNING_RATE":6e-4,
        "WARMUP_PROPORTION":0.1,
        "MAX_GRAD_NORM":5,
        "MAX_SEQ_LENGTH":MAX_SEQ_LENGTH,
        "BATCH_SIZE":16,
        "NUM_WARMUP_STEPS":24000
}
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# registers the model the same way on old and new pandas versions.
df_transformers = pd.concat(
    [df_transformers,
     pd.DataFrame([{"MODEL": "ALBERT",
                    "MODEL_NAME": model_name,
                    "TOKENIZER": AlbertTokenizer,
                    "CLASSIFIER": AlbertForSequenceClassification,
                    "OUTPUT_DIR": OUTPUT_DIR,
                    "MODEL_FILE_NAME": model_name + "_" + MODEL_FILE_NAME,
                    "PARAMETERS": params}])],
    ignore_index=True)

In [None]:
!pip3 install sentencepiece

In [None]:
## Fine-tune one registered model
def train(ts,train_texts,train_labels,dev_texts,dev_labels,target_names,label2idx):
    """Fine-tune the model described by ``ts`` and checkpoint its best epoch.

    After every epoch the model is scored on the dev set. Its weights are
    saved to ``ts['OUTPUT_DIR']/ts['MODEL_FILE_NAME']`` whenever the dev loss
    beats every previous epoch. Training stops early after ``PATIENCE``
    consecutive epochs without improvement.

    Args:
        ts: mapping (e.g. a DataFrame row) with the keys ``MODEL``,
            ``MODEL_NAME``, ``TOKENIZER``, ``CLASSIFIER``, ``OUTPUT_DIR``,
            ``MODEL_FILE_NAME`` and ``PARAMETERS``.
        train_texts, train_labels: training examples.
        dev_texts, dev_labels: development examples, used for model selection.
        target_names: distinct labels; its length is the number of classes.
        label2idx: mapping from label to class index.

    Returns:
        None. The result of training is the checkpoint written to disk.
    """
    hparams = ts['PARAMETERS']
    max_seq_length = hparams['MAX_SEQ_LENGTH']
    batch_size = hparams['BATCH_SIZE']
    accumulation_steps = hparams['GRADIENT_ACCUMULATION_STEPS']
    # Read the epoch count from this model's own settings, not from whichever
    # global `params` dict happened to be defined last.
    num_train_epochs = int(hparams['NUM_TRAIN_EPOCHS'])

    tokenizer = ts['TOKENIZER'].from_pretrained(ts['MODEL_NAME'])
    # Start from the pre-trained weights with a fresh classification head.
    model = ts['CLASSIFIER'].from_pretrained(ts['MODEL_NAME'],
                                             num_labels=len(target_names),
                                             output_attentions=False,
                                             output_hidden_states=False)

    ## Prepare the data loaders: shuffle the training data every epoch and
    ## keep the dev data in a fixed order.
    train_features = convert_examples_to_inputs(train_texts, train_labels, label2idx, max_seq_length, tokenizer)
    train_dataloader = get_data_loader(train_features, max_seq_length, batch_size, shuffle=True)
    dev_features = convert_examples_to_inputs(dev_texts, dev_labels, label2idx, max_seq_length, tokenizer)
    dev_dataloader = get_data_loader(dev_features, max_seq_length, batch_size, shuffle=False)

    num_train_steps = int(len(train_dataloader.dataset) / batch_size / accumulation_steps * num_train_epochs)
    num_warmup_steps = hparams['NUM_WARMUP_STEPS']

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

    # NOTE(review): transformers' own AdamW is deprecated upstream in favour
    # of torch.optim.AdamW -- confirm it exists in the installed version.
    optimizer = AdamW(optimizer_grouped_parameters, lr=hparams['LEARNING_RATE'], correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,num_training_steps=num_train_steps)

    ## Enable GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # tuning guide: https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html
    torch.backends.cudnn.benchmark = True

    # The checkpoint directory must exist before torch.save is called.
    os.makedirs(ts['OUTPUT_DIR'], exist_ok=True)
    output_model_file = os.path.join(ts['OUTPUT_DIR'], ts['MODEL_FILE_NAME'])

    loss_history = []
    no_improvement = 0
    PATIENCE=2
    for _ in trange(num_train_epochs, desc="Epoch"):
        model.train()

        for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, sample_ids = batch

            if ts['MODEL'] != "DISTILBERT":
                outputs = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
            else:
                # This model is called without token_type_ids.
                outputs = model(input_ids, attention_mask=input_mask, labels=label_ids)

            loss = outputs[0]

            # Scale the loss so that accumulated gradients average, not sum.
            if accumulation_steps > 1:
                loss = loss / accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), hparams['MAX_GRAD_NORM'])

                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                scheduler.step()

        dev_loss, _, _ = evaluate(model, dev_dataloader)

        print("Loss history:", loss_history)
        print("Dev loss:", dev_loss)

        if len(loss_history) == 0 or dev_loss < min(loss_history):
            # Best epoch so far: reset the patience counter and checkpoint.
            no_improvement = 0
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), output_model_file)
        else:
            no_improvement += 1

        if no_improvement >= PATIENCE:
            print("No improvement on development set. Finish training.")
            break

        loss_history.append(dev_loss)
    

In [None]:
# Score a saved, fine-tuned checkpoint on a labelled dataset
def data_evaluation(ts,texts,labels,target_names,label2idx):
    """Load the checkpoint described by ``ts`` and measure it on ``texts``.

    Prints the micro-averaged precision/recall/F-score and a full
    classification report.

    Args:
        ts: mapping with the keys ``MODEL_NAME``, ``TOKENIZER``,
            ``CLASSIFIER``, ``OUTPUT_DIR``, ``MODEL_FILE_NAME`` and
            ``PARAMETERS``.
        texts: texts to classify.
        labels: gold labels, parallel to ``texts``.
        target_names: distinct labels; its length is the number of classes.
        label2idx: mapping from label to class index.

    Returns:
        The accuracy, rounded to two decimals.
    """
    # Enable GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = ts['TOKENIZER'].from_pretrained(ts['MODEL_NAME'])

    # Load the stored weights onto the CPU first, then build the classifier
    # around them.
    checkpoint_path = os.path.join(ts['OUTPUT_DIR'], ts['MODEL_FILE_NAME'])
    model_state_dict = torch.load(checkpoint_path,
                                  map_location=lambda storage, loc: storage)
    model = ts['CLASSIFIER'].from_pretrained(
        ts['MODEL_NAME'],
        state_dict=model_state_dict,
        num_labels=len(target_names),
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    # Encode the examples and feed them in their original order.
    hyperparams = ts['PARAMETERS']
    features = convert_examples_to_inputs(texts, labels, label2idx, hyperparams['MAX_SEQ_LENGTH'], tokenizer)
    dataloader = get_data_loader(features, hyperparams['MAX_SEQ_LENGTH'], hyperparams['BATCH_SIZE'], shuffle=False)

    # The evaluation loss is not needed here, only the label arrays.
    _, correct, predicted = evaluate(model, dataloader)
    print("Errors performance:", precision_recall_fscore_support(correct, predicted, average="micro"))

    bert_accuracy = np.mean(predicted == correct)

    print(classification_report(correct, predicted))
    return round(bert_accuracy,2)

In [None]:
# Fine-tune every registered model in turn and record its test accuracy.
df_acc = pd.DataFrame(columns=["Model", "Accuracy"])
for _, ts in df_transformers.iterrows():
    train(ts, train_texts, train_labels, dev_texts, dev_labels, target_names, label2idx)
    accuracy = data_evaluation(ts, test_texts, test_labels, target_names, label2idx)
    # DataFrame.append was removed in pandas 2.0. Adding the row in place
    # also keeps the partial results if a later model fails.
    df_acc.loc[len(df_acc)] = [ts["MODEL"], accuracy]

In [None]:
import random
# Load the unlabelled test data for the submission
sub_data_path = "./01_data/WikiLarge_Test.csv"
sub_data = pd.read_csv(sub_data_path)

sub_texts = list(sub_data["original_text"])
# The encoding helper needs a label for every text. Use a known label as a
# deterministic placeholder so the lookup in label2idx cannot fail; these
# values are never used as predictions.
sub_labels = [target_names[0]] * len(sub_texts)

print("Submission Test size:", len(sub_data))

In [None]:
# Predict the submission labels with the fine-tuned model that scored best on
# the test split. data_evaluation() only returns an accuracy, so the
# checkpoint is loaded here and evaluate() supplies the per-example
# predictions (third element of its return value).
# NOTE(review): choosing the most accurate model is an assumption -- confirm
# which model is meant to produce the submission.
best_position = int(df_acc["Accuracy"].astype(float).values.argmax())
best_ts = df_transformers.iloc[best_position]  # df_acc rows follow df_transformers order

sub_tokenizer = best_ts['TOKENIZER'].from_pretrained(best_ts['MODEL_NAME'])
sub_state_dict = torch.load(os.path.join(best_ts['OUTPUT_DIR'], best_ts['MODEL_FILE_NAME']),
                            map_location="cpu")
sub_model = best_ts['CLASSIFIER'].from_pretrained(best_ts['MODEL_NAME'],
                                                  state_dict=sub_state_dict,
                                                  num_labels=len(target_names),
                                                  output_attentions=False,
                                                  output_hidden_states=False)
sub_model.to(device)

sub_features = convert_examples_to_inputs(sub_texts, sub_labels, label2idx,
                                          best_ts['PARAMETERS']['MAX_SEQ_LENGTH'], sub_tokenizer)
# Keep the original order so predictions line up with the submission rows.
sub_dataloader = get_data_loader(sub_features,
                                 best_ts['PARAMETERS']['MAX_SEQ_LENGTH'],
                                 best_ts['PARAMETERS']['BATCH_SIZE'],
                                 shuffle=False)
_, _, sub_predicted_idx = evaluate(sub_model, sub_dataloader)
# Map class indices back to the original label values.
sub_predicted = [target_names[idx] for idx in sub_predicted_idx]

In [None]:
# Write the submission file: one row per prediction, numbered from 0.
df_sub = pd.DataFrame({
    "id": list(range(len(sub_predicted))),
    "label": sub_predicted,
})
df_sub.to_csv("./tmp/submission.csv", index=False)