In [1]:
# Libraries we need to install - If it is already installed you can skip this cell
!pip install sentencepiece
!pip install transformers



In [2]:
# Libraries 
#pretrained model
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import EarlyStoppingCallback

# base model
from torchtext.legacy.data import Field,LabelField,BucketIterator,TabularDataset
from torchtext import vocab
from tqdm import tqdm
import torch.nn.functional as F

#preprocessing and evaluation
import pandas as pd
import numpy as np
import os
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

2021-10-11 22:38:55.829091: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-11 22:38:55.829165: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [8]:
# Parameters

# --- Reproducibility: seed every random number generator used below ---
seed = 1234
# Only Python processes started after this point pick up the hash seed.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True  # restrict cuDNN to deterministic algorithms

# --- Model selection ---
# Maximum number of tokens per input; longer inputs are truncated by the tokenizer.
max_length = 512
# Model key: 'bert', 'alberta', 'distilbert' or 'gpt2' for pretrained models
# ('lstm', 'rnn', 'bilestm' for the base models).
model_name = 'bert'
# Directory of the saved checkpoint to load.
model_path = "output-bert/checkpoint-500"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
# Load saved model based on your pretrained model
def test_bert_model(model_path, dataset):
    """Run a saved BERT sequence classifier over a whole dataset.

    Input:
        model_path - path of the saved pretrained model
        dataset - dataset handed unchanged to ``Trainer.predict``
    Returns:
        the first element of the ``Trainer.predict`` result (the raw predictions)
    """
    classifier = BertForSequenceClassification.from_pretrained(model_path, num_labels=8)
    prediction_output = Trainer(classifier).predict(dataset)

    # Only the first of the three returned elements is needed by callers.
    return prediction_output[0]

def test_bert_model_one_sentence(model_path, sentence, classes):
    """Classify a single sentence with a saved BERT sequence classifier.

    Input:
        model_path - path of the saved pretrained model
        sentence - text to classify
        classes - array of class names indexed by encoded label
                  (e.g. ``LabelEncoder.classes_``)
    Returns:
        array holding the predicted class name (one entry per input sentence)
    """
    # The tokenizer comes from the hub, not from model_path.
    # NOTE(review): assumes the checkpoint was fine-tuned from bert-base-uncased - confirm.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=8)

    # Deliberately no Trainer here: it would move the model to the GPU (when one
    # is available) while the tokenized inputs stay on the CPU.
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs)[0]

    # Softmax is monotonic, so the arg-max of the probabilities is the arg-max of the logits.
    pred = logits.softmax(1).argmax(1).cpu().numpy()

    return classes[pred]
    

def test_distilbert_model(model_path, dataset):
    """Run a saved DistilBERT sequence classifier over a whole dataset.

    Input:
        model_path - path of the saved pretrained model
        dataset - dataset handed unchanged to ``Trainer.predict``
    Returns:
        the first element of the ``Trainer.predict`` result (the raw predictions)
    """
    classifier = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=8)
    runner = Trainer(classifier)
    results = runner.predict(dataset)

    # The remaining two elements of the result are not used.
    return results[0]

def test_distilbert_model_one_sentence(model_path, sentence, classes):
    """Classify a single sentence with a saved DistilBERT sequence classifier.

    Input:
        model_path - path of the saved pretrained model
        sentence - text to classify
        classes - array of class names indexed by encoded label
                  (e.g. ``LabelEncoder.classes_``)
    Returns:
        array holding the predicted class name (one entry per input sentence)
    """
    # The global model_name ('distilbert') is a dispatch key, not a hub identifier,
    # so the tokenizer is loaded by its hub name, as the BERT helper does.
    # NOTE(review): assumes the checkpoint was fine-tuned from distilbert-base-uncased - confirm.
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=8)

    # Deliberately no Trainer here: it would move the model to the GPU (when one
    # is available) while the tokenized inputs stay on the CPU.
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs)[0]

    pred = logits.softmax(1).argmax(1).cpu().numpy()

    return classes[pred]


# Backward-compatible alias for the original, differently capitalised name.
test_distilbert_model_one_Sentence = test_distilbert_model_one_sentence


def test_alberta_model(model, dataset):
    """Run a saved ALBERT sequence classifier over a whole dataset.

    Input:
        model - path of the saved pretrained model (the parameter keeps its
                original name so existing callers are unaffected)
        dataset - dataset handed unchanged to ``Trainer.predict``
    Returns:
        raw_pred list predictions of test dataset
    """
    # Load from the argument; reading the global ``model_path`` here would
    # silently ignore whatever path the caller passed in.
    classifier = AlbertForSequenceClassification.from_pretrained(model, num_labels=8)
    test_trainer = Trainer(classifier)
    raw_pred, _, _ = test_trainer.predict(dataset)

    return raw_pred

def test_alberta_model_one_sentence(model_path, sentence, classes):
    """Classify a single sentence with a saved ALBERT sequence classifier.

    Input:
        model_path - path of the saved pretrained model
        sentence - text to classify
        classes - array of class names indexed by encoded label
                  (e.g. ``LabelEncoder.classes_``)
    Returns:
        array holding the predicted class name (one entry per input sentence)
    """
    # The global model_name ('alberta') is a dispatch key, not a hub identifier,
    # so the tokenizer is loaded by its hub name, as the BERT helper does.
    # NOTE(review): assumes the checkpoint was fine-tuned from albert-base-v2 - confirm.
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    model = AlbertForSequenceClassification.from_pretrained(model_path, num_labels=8)

    # Deliberately no Trainer here: it would move the model to the GPU (when one
    # is available) while the tokenized inputs stay on the CPU.
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs)[0]

    pred = logits.softmax(1).argmax(1).cpu().numpy()

    return classes[pred]


# Backward-compatible alias for the original, differently capitalised name.
test_alberta_model_one_Sentence = test_alberta_model_one_sentence


def test_gpt2_model(model_path, dataset):
    """Run a saved GPT-2 sequence classifier over a whole dataset.

    Input:
        model_path - path of the saved pretrained model
        dataset - dataset handed unchanged to ``Trainer.predict``
    Returns:
        the first element of the ``Trainer.predict`` result (the raw predictions)
    """
    classifier = GPT2ForSequenceClassification.from_pretrained(model_path, num_labels=8)
    outcome = Trainer(classifier).predict(dataset)

    # Keep the predictions; drop the other two elements of the result.
    return outcome[0]

def test_gpt2_model_one_sentence(model_path, sentence, classes):
    """Classify a single sentence with a saved GPT-2 sequence classifier.

    Input:
        model_path - path of the saved pretrained model
        sentence - text to classify
        classes - array of class names indexed by encoded label
                  (e.g. ``LabelEncoder.classes_``)
    Returns:
        array holding the predicted class name (one entry per input sentence)
    """
    # NOTE(review): assumes the checkpoint was fine-tuned from the base "gpt2" model - confirm.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # GPT-2 has no padding token out of the box and padding=True fails without
    # one, so reuse the end-of-sequence token for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = GPT2ForSequenceClassification.from_pretrained(model_path, num_labels=8)
    # The classification head needs the pad id to find the last real token.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Deliberately no Trainer here: it would move the model to the GPU (when one
    # is available) while the tokenized inputs stay on the CPU.
    inputs = tokenizer(sentence, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs)[0]

    pred = logits.softmax(1).argmax(1).cpu().numpy()

    return classes[pred]


# Backward-compatible alias for the original, differently capitalised name.
test_gpt2_model_one_Sentence = test_gpt2_model_one_sentence

In [9]:
# Test on a single sentence only

# Restore the class names that read_dataset saved to classes.npy.
# SECURITY: allow_pickle=True can execute arbitrary code when loading a
# malicious file - only load a classes.npy that you created yourself.
le = LabelEncoder()
le.classes_ = np.load('classes.npy', allow_pickle=True)

sentence = "it is a school"

if model_name == 'bert':
    label_sentence = test_bert_model_one_sentence(model_path, sentence, le.classes_)
elif model_name == 'alberta':
    label_sentence = test_alberta_model_one_sentence(model_path, sentence, le.classes_)
elif model_name == 'distilbert':
    label_sentence = test_distilbert_model_one_sentence(model_path, sentence, le.classes_)
elif model_name == 'gpt2':
    label_sentence = test_gpt2_model_one_sentence(model_path, sentence, le.classes_)
else:
    # Fail loudly: continuing would leave label_sentence undefined (or stale
    # from an earlier run) for the cells that follow.
    raise ValueError(f'model is not defined: {model_name!r}')


loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /home/necva/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /home/necva/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /home/necva/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c

In [11]:
# Predicted class name for `sentence`: the helper returns an array, so take its first entry.
label_sentence[0] # predicted class

'Labeling'

In [None]:
# Read dataset
def read_dataset(file_path):
    """ Read dataset, normalise its labels and add an encoded label column
    Input:
        file_path - string the path of the dataset (Excel workbook; sheet 'Sheet1' is read)
    Returns:
        train dataframe with a cleaned 'label' column and an added integer
        'label_encoded' column
    Side effects:
        writes the fitted label classes to 'classes.npy' in the working directory
    """
    train_data = pd.read_excel(file_path, sheet_name='Sheet1')

    # Merge spelling variants of the same label:
    #   'should/must statement', 'Should/must statement' -> 'Should/Must statement'
    #   'personalizing'                                  -> 'Personalizing'
    # The column selector matters: without it .loc overwrites every column of
    # the matching rows (including the text), not just the label.
    should_must_rows = train_data['label'].isin(['should/must statement', 'Should/must statement'])
    train_data.loc[should_must_rows, 'label'] = 'Should/Must statement'
    train_data.loc[train_data['label'] == 'personalizing', 'label'] = 'Personalizing'

    # Label encoding
    le = LabelEncoder()
    train_data["label_encoded"] = le.fit_transform(train_data["label"])

    # NOTE(review): classes.npy is rewritten on every call, so reading a file
    # with a different label set replaces the stored classes - confirm this is intended.
    np.save('classes.npy', le.classes_)
    return train_data

In [None]:
# Only test your data on trained model without training phase - pretrained models
model_path = "path where you stored your model"
test_data_file = "path of the file"
test_dataset = read_dataset(test_data_file)

# NOTE(review): read_dataset returns a pandas DataFrame, and the test_* helpers
# pass their argument straight to Trainer.predict - confirm whether it must be
# tokenized and wrapped in a torch Dataset first.
if model_name == 'bert':
    predictions = test_bert_model(model_path, test_dataset)
elif model_name == 'alberta':
    predictions = test_alberta_model(model_path, test_dataset)
elif model_name == 'distilbert':
    predictions = test_distilbert_model(model_path, test_dataset)
elif model_name == 'gpt2':
    predictions = test_gpt2_model(model_path, test_dataset)
else:
    # Fail loudly: continuing would leave `predictions` undefined below.
    raise ValueError(f'model is not defined: {model_name!r}')

# True labels come from the frame that was just loaded.
y_true = list(test_dataset["label_encoded"].values)
y = (predictions, y_true)
# NOTE(review): compute_metrics is not defined in the visible part of this
# notebook - it has to be defined or imported before this cell is run.
compute_metrics(y)