In [1]:
import torch, io, pickle, time, warnings, os
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from transformers import *
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
from collections import Counter
from joblib import Parallel, delayed

warnings.filterwarnings("ignore")

2021-10-30 21:26:35.562682: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
print(torch.cuda.get_device_name(0))

Quadro RTX 6000


### Develop model using UCF-Training

In [3]:
train_file_path='../../dataset/UCF/train/'
df_UCF_train=pd.concat([pd.read_pickle(train_file_path+file, compression='gzip') for file in os.listdir(train_file_path)], ignore_index=True)
df_UCF_train['input']= df_UCF_train['TAXPAYER_NAME']+' '+df_UCF_train['mission_spellchk']+' '+df_UCF_train['prgrm_dsc_spellchk']

In [4]:
df_UCF_train.sample(3)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,...,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before,NTEE1,mission,prgrm_dsc,mission_spellchk,prgrm_dsc_spellchk,input
50006,93492280000000.0,562519347,EFILE,FUNDS RAISED FOR LOCAL YOUTH AND SPORTS ACTIVI...,PROJECTS TO ENHANCE COMMUNITY LIVING,,,,,,...,PARKROSE COMMUNITY FOUNDATION,201412.0,2016.0,0.0,O,PROJECTS TO ENHANCE COMMUNITY LIVING,FUNDS RAISED FOR YOUTH ACTIVITIES; FUNDS RAISE...,PROJECTS TO ENHANCE COMMUNITY LIVING,FUNDS RAISED FOR YOUTH ACTIVITIES ; FUNDS RAIS...,PARKROSE COMMUNITY FOUNDATION PROJECTS TO ENHA...
17535,93493320000000.0,900360866,EFILE,,,,,,"PROMOTION OF CONVENTIONS, EVENTS & TOURISM FOR...",THE BUREAU HAS CONTINUED AND ENHANCED ITS EFFO...,...,GREAT FALLS CONVENTION AND VISITORS B,201406.0,2014.0,0.0,S,"PROMOTION OF CONVENTIONS, EVENTS & TOURISM FOR...",THE BUREAU HAS CONTINUED AND ENHANCED ITS EFFO...,"PROMOTION OF CONVENTIONS , EVENTS & TOURISM FO...",THE BUREAU HAS CONTINUED AND ENHANCED ITS EFFO...,GREAT FALLS CONVENTION AND VISITORS B PROMOTIO...
86211,93493320000000.0,310981847,EFILE,,,,,CHARITABLE HOSPITAL,CHARITABLE ALUMNI ASSOCIATION,SUPPORTED THE EFFORTS OF THE AAPI CHARITABLE F...,...,AMRITSAR MEDICAL AND DENTAL ALUMNI ASSOC OF NO...,201312.0,2014.0,1.0,B,CHARITABLE ALUMNI ASSOCIATION,CHARITABLE HOSPITAL; SUPPORTED THE EFFORTS OF ...,CHARITABLE ALUMNI ASSOCIATION,CHARITABLE HOSPITAL ; SUPPORTED THE EFFORTS OF...,AMRITSAR MEDICAL AND DENTAL ALUMNI ASSOC OF NO...


In [5]:
# Code as 10 broad categories.
broad_cat_dict={'I': ['A'],
                'II': ['B'],
                'III': ['C', 'D'],
                'IV': ['E', 'F', 'G', 'H'],
                'V': ['I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'],
                'VI': ['Q'],
                'VII': ['R', 'S', 'T', 'U', 'V', 'W'],
                'VIII': ['X'],
                'IX': ['Y'],
                'X': ['Z'],
               }
def ntee2cat(string):
    global broad_cat_dict
    return [s for s in broad_cat_dict.keys() if string in broad_cat_dict[s]][0]

df_UCF_train['broad_cat']=df_UCF_train['NTEE1'].apply(ntee2cat)

In [6]:
# Create sentence and encoded label lists
sentences = df_UCF_train.input.values
# labels should be numbers AND continious.
# "input for criterion should satisfy t >= 0 && t < n_classes"
# https://github.com/pytorch/pytorch/issues/1204#issuecomment-326958795
labels = preprocessing.LabelEncoder().fit_transform(df_UCF_train.broad_cat.values)
labels = torch.tensor(labels)

In [44]:
# Save LabelEncoder class for developing package.
import pickle
with open('../../output/le_broad_cat.pkl', 'wb') as output:
    pickle.dump(preprocessing.LabelEncoder().fit(df_UCF_train.broad_cat.values), output, pickle.HIGHEST_PROTOCOL)

In [17]:
# Import BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

def func_encode_string(sent):
    encoded_dict = tokenizer.encode_plus(sent,                      # Sentence to encode.
                                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                        truncation='longest_first', padding='max_length', # Max length accepted by model.
                                        return_attention_mask = True,   # Construct attn. masks.
                                        return_tensors = 'pt',     # Return pytorch tensors.
                                               )
    return encoded_dict

encoded_outputs=Parallel(n_jobs=-1, backend="multiprocessing", batch_size='auto', verbose=1)(delayed(func_encode_string)(sent) for sent in sentences)
for encoded_output in encoded_outputs:
    # Add the encoded sentence to the list.
    input_ids.append(encoded_output['input_ids'])
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_output['attention_mask'])
    
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
print('Length of encoding:', len(input_ids[0]))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 912 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 5888 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 13088 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 21888 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 32288 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 44288 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 57888 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 73088 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 89888 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done 108288 tasks      | elapsed:   50.2s
[Parallel(n_jobs=-1)]: Done 128288 tasks      | elapsed:   58.7s
[Parallel(n_jobs=-1)]: Done 149888 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 154424 out of 154424 | elapsed:  1.2min finished


Original:  SINGING RIVER EDUCATION ASSOCIATION PROVIDE CHILD CARE SERVICES TO DISADVANTAGED CHILDREN . THE ORGANIZATION PROVIDES COMPLETE CHILD CARE SERVICES EMPHASIZING A QUALITY EDUCATION , HEALTH AND NUTRITION PROGRAM FOR DISADVANTAGED CHILDREN AGES THREE THROUGH FIVE IN GEORGE CO. , MISSISSIPPI .
Token IDs: tensor([  101,  4823,  2314,  2495,  2523,  3073,  2775,  2729,  2578,  2000,
        27322,  2336,  1012,  1996,  3029,  3640,  3143,  2775,  2729,  2578,
        22671,  1037,  3737,  2495,  1010,  2740,  1998, 14266,  2565,  2005,
        27322,  2336,  5535,  2093,  2083,  2274,  1999,  2577,  2522,  1012,
         1010,  5900,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,

In [18]:
# Use train_test_split to split our data into train and validation sets for
# training
from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.1)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1)

In [19]:
len(validation_inputs), len(validation_labels), len(validation_masks), len(train_inputs)

(15443, 15443, 15443, 138981)

In [20]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 16 # Smaller use less GPU RAM.

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory
# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [21]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=len(df_UCF_train.broad_cat.unique()), # The number of output labels--2 for binary classification.
                                                 # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### Create optimizer

In [22]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [23]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [24]:
import numpy as np
import time
import datetime

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [25]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 1000 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch 1,000  of  8,687.    Elapsed: 0:08:50.
  Batch 2,000  of  8,687.    Elapsed: 0:17:44.
  Batch 3,000  of  8,687.    Elapsed: 0:26:38.
  Batch 4,000  of  8,687.    Elapsed: 0:35:39.
  Batch 5,000  of  8,687.    Elapsed: 0:44:37.
  Batch 6,000  of  8,687.    Elapsed: 0:53:31.
  Batch 7,000  of  8,687.    Elapsed: 1:02:25.
  Batch 8,000  of  8,687.    Elapsed: 1:11:18.

  Average training loss: 0.43
  Training epcoh took: 1:17:24

Running Validation...
  Accuracy: 0.89
  Validation took: 0:02:57

Training...
  Batch 1,000  of  8,687.    Elapsed: 0:08:53.
  Batch 2,000  of  8,687.    Elapsed: 0:17:47.
  Batch 3,000  of  8,687.    Elapsed: 0:26:41.
  Batch 4,000  of  8,687.    Elapsed: 0:35:35.
  Batch 5,000  of  8,687.    Elapsed: 0:44:29.
  Batch 6,000  of  8,687.    Elapsed: 0:53:23.
  Batch 7,000  of  8,687.    Elapsed: 1:02:19.
  Batch 8,000  of  8,687.    Elapsed: 1:11:13.

  Average training loss: 0.28
  Training epcoh took: 1:17:19

Running Validation...
  Accura

### Evaluate on UCF-Test

In [26]:
eval_file_path='../../dataset/UCF/test/'
df_UCF_eval=pd.concat([pd.read_pickle(eval_file_path+file, compression='gzip') for file in os.listdir(eval_file_path)], ignore_index=True)
df_UCF_eval['input']= df_UCF_eval['TAXPAYER_NAME']+' '+df_UCF_eval['mission_spellchk']+' '+df_UCF_eval['prgrm_dsc_spellchk']

In [27]:
# Code as 10 broad categories.
broad_cat_dict={'I': ['A'],
                'II': ['B'],
                'III': ['C', 'D'],
                'IV': ['E', 'F', 'G', 'H'],
                'V': ['I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'],
                'VI': ['Q'],
                'VII': ['R', 'S', 'T', 'U', 'V', 'W'],
                'VIII': ['X'],
                'IX': ['Y'],
                'X': ['Z'],
               }
def ntee2cat(string):
    global broad_cat_dict
    return [s for s in broad_cat_dict.keys() if string in broad_cat_dict[s]][0]

df_UCF_eval['broad_cat']=df_UCF_eval['NTEE1'].apply(ntee2cat)

In [21]:
# Create sentence and encoded label lists
sentences = df_UCF_eval.input.values
# labels should be numbers AND continious.
# "input for criterion should satisfy t >= 0 && t < n_classes"
# https://github.com/pytorch/pytorch/issues/1204#issuecomment-326958795
labels = preprocessing.LabelEncoder().fit_transform(df_UCF_eval.broad_cat.values)
labels = torch.tensor(labels)

In [38]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

def func_encode_string(sent):
    encoded_dict = tokenizer.encode_plus(sent,                      # Sentence to encode.
                                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                        truncation='longest_first',
                                        padding='max_length',
                #                         max_length = 256,           # Pad & truncate all sentences.
                                        return_attention_mask = True,   # Construct attn. masks.
                                        return_tensors = 'pt',     # Return pytorch tensors.
                                               )
    return encoded_dict

encoded_outputs=Parallel(n_jobs=-1, backend="multiprocessing", batch_size='auto', verbose=1)(delayed(func_encode_string)(sent) for sent in sentences)
for encoded_output in encoded_outputs:
    # Add the encoded sentence to the list.
    input_ids.append(encoded_output['input_ids'])
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_output['attention_mask'])
    
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
print('Length of encoding:', len(input_ids[0]))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 115 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 6560 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 13760 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 22560 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 32960 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 38607 out of 38607 | elapsed:   15.7s finished


Original:  GREENCROFT GOSHEN INC GREENCROFT GOSDEN PROVIDES ACTIVE , AFFORDABLE RETIREMENT LIVING WITH SUPPORTIVE SERVICES , ASSISTED LIVING , AND SKILLED NURSING CARE . CHARITY CARE FOR EYE JUNE 30 , 2014 AMOUNTED TO $ 4E2793922 . NURSING SERVICES - GREENCROFT GOSDEN IS HEALTH FACILITIES PROVIDE STATE-OF-THE-ART HEALTH CARE TECHNIQUES AND TECHNOLOGY IN AN INVITING AND SUPPORTIVE RESIDENTIAL ENVIRONMENT . OUR SETTING IS APPROPRIATE FOR INDIVIDUALS WHO ARE RECOVERING FROM SURGERY OR ILLNESS , NEED REHABILITATION , OR REQUIRE ONGOING LONG-TERM NURSING CARE . OUR RESTORATIVE CARE PHILOSOPHY REINFORCES THE HEALING AND REHABILITATION PROCESS AND PROMOTES INDEPENDENCE . A FULL RANGE OF ACTIVITIES , CHAPLAINCY , AND SOCIAL SERVICES ARE PROVIDED . WE PROVIDE OUTPATIENT PHYSICAL , OCCUPATIONAL , AND SPEECH/LANGUAGE THERAPIES FOR ALL AGES . WE EMPHASIZE EDUCATION AND COMMUNICATION WITH YOU , YOUR FAMILY , AND REFERRING PHYSICIANS . RESIDENTS WHO DO NOT MISUSE THEIR ASSETS MAY BE ELIGIBLE FOR SUP

In [37]:
# Create sentence and encoded label lists
sentences = df_UCF_eval.input.values
# labels should be numbers AND continious.
# "input for criterion should satisfy t >= 0 && t < n_classes"
# https://github.com/pytorch/pytorch/issues/1204#issuecomment-326958795
labels = preprocessing.LabelEncoder().fit_transform(df_UCF_eval.broad_cat.values)
labels = torch.tensor(labels)

In [39]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 20 # Smaller use less GPU RAM.

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_data = TensorDataset(input_ids, attention_masks, labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [40]:
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()

# Tracking variables 
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
logits_all=[]
label_ids_all=[]

# Evaluate data for one epoch
for batch in tqdm(validation_dataloader):

    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up validation
    with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # Get the "logits" output by the model. The "logits" are the output
    # values prior to applying an activation function like the softmax.
    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    logits_all+=list(np.argmax(logits, axis=1))
    label_ids = b_labels.to('cpu').numpy()
    label_ids_all+=list(label_ids)

    # Calculate the accuracy for this batch of test sentences.
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    # Accumulate the total accuracy.
    eval_accuracy += tmp_eval_accuracy

    # Track the number of batches
    nb_eval_steps += 1

# Report the final accuracy for this validation run.
print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("  Validation took: {:}".format(format_time(time.time() - t0)))

100%|██████████| 2413/2413 [07:23<00:00,  5.45it/s]

  Accuracy: 0.90
  Validation took: 2:18:52





In [41]:
logits_all_letter=preprocessing.LabelEncoder().fit(df_UCF_train.broad_cat.values).inverse_transform(logits_all)
label_ids_all_letter=preprocessing.LabelEncoder().fit(df_UCF_train.broad_cat.values).inverse_transform(label_ids_all)

In [30]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_true=label_ids_all_letter, y_pred=logits_all_letter, digits=4))

                   pre       rec       spe        f1       geo       iba       sup

          I     0.9220    0.9170    0.9903    0.9195    0.9530    0.9015      4291
         II     0.9145    0.9084    0.9831    0.9114    0.9450    0.8863      6419
        III     0.8968    0.9151    0.9947    0.9059    0.9541    0.9030      1861
         IV     0.8989    0.8847    0.9874    0.8917    0.9347    0.8646      4329
         IX     0.9091    0.9353    0.9957    0.9221    0.9650    0.9257      1701
          V     0.9034    0.9176    0.9572    0.9105    0.9372    0.8749     11723
         VI     0.6742    0.6835    0.9962    0.6788    0.8252    0.6596       436
        VII     0.9047    0.8822    0.9803    0.8933    0.9300    0.8564      6749
       VIII     0.8166    0.8352    0.9945    0.8258    0.9114    0.8173      1098

avg / total     0.9019    0.9018    0.9776    0.9018    0.9387    0.8749     38607



In [42]:
from imblearn.metrics import classification_report_imbalanced
print('========Correct label# error results, 2021-10-30========')
print(classification_report_imbalanced(y_true=label_ids_all_letter, y_pred=logits_all_letter, digits=4))
print(Counter(logits_all_letter))

                   pre       rec       spe        f1       geo       iba       sup

          I     0.9164    0.9173    0.9895    0.9168    0.9527    0.9011      4291
         II     0.9179    0.9056    0.9838    0.9117    0.9439    0.8840      6419
        III     0.8866    0.9199    0.9940    0.9030    0.9563    0.9077      1861
         IV     0.8962    0.8880    0.9870    0.8921    0.9362    0.8678      4329
         IX     0.9129    0.9300    0.9959    0.9214    0.9624    0.9201      1701
          V     0.9050    0.9143    0.9582    0.9096    0.9360    0.8722     11723
         VI     0.6744    0.6651    0.9963    0.6697    0.8141    0.6407       436
        VII     0.8994    0.8866    0.9790    0.8930    0.9317    0.8600      6749
       VIII     0.8145    0.8160    0.9946    0.8153    0.9009    0.7971      1098

avg / total     0.9007    0.9007    0.9777    0.9007    0.9382    0.8738     38607

Counter({'V': 11843, 'VII': 6653, 'II': 6333, 'I': 4295, 'IV': 4289, 'III': 1931, 'I

---
**Results below are from previous label# error session**

In [31]:
Counter(logits_all_letter)

Counter({'V': 11907,
         'I': 4268,
         'IX': 1750,
         'IV': 4261,
         'III': 1899,
         'VII': 6581,
         'VIII': 1123,
         'II': 6376,
         'VI': 442})

In [33]:
t=pd.DataFrame([logits_all_letter, label_ids_all_letter]).T.rename(columns={0:'pred', 1:'true'})
len(t[t.pred==t.true])/len(t)

0.9018053720827829

## Save model

In [43]:
dir_path='../../../bert_model/npo_classifier_bc/'
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(dir_path)  # save
tokenizer.save_pretrained(dir_path)  # save

('../../../bert_model/npo_classifier_bc/tokenizer_config.json',
 '../../../bert_model/npo_classifier_bc/special_tokens_map.json',
 '../../../bert_model/npo_classifier_bc/vocab.txt',
 '../../../bert_model/npo_classifier_bc/added_tokens.json')

## Test saved model

### Load model

In [3]:
dir_path='../../../bert_model/npo_classifier_bc/'
model_loaded = BertForSequenceClassification.from_pretrained(dir_path)  # re-load
tokenizer_loaded = BertTokenizer.from_pretrained(dir_path)  # re-load
# Read label encoder.
with open('../../output/le_broad_cat.pkl', 'rb') as le_broad_cat_pkl:
    le_broad_cat = pickle.load(le_broad_cat_pkl)

### Test on `nteeConf==A`

#### Load file

In [4]:
df_UCF_eval=pd.read_pickle('../../dataset/UCF/test/df_ucf_test.pkl.gz')
df_UCF_eval['input']= df_UCF_eval['TAXPAYER_NAME']+' '+df_UCF_eval['mission_spellchk']+' '+df_UCF_eval['prgrm_dsc_spellchk']

# Code as 10 broad categories.
broad_cat_dict={'I': ['A'],
                'II': ['B'],
                'III': ['C', 'D'],
                'IV': ['E', 'F', 'G', 'H'],
                'V': ['I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'],
                'VI': ['Q'],
                'VII': ['R', 'S', 'T', 'U', 'V', 'W'],
                'VIII': ['X'],
                'IX': ['Y'],
                'X': ['Z'],
               }
def ntee2cat(string):
    global broad_cat_dict
    return [s for s in broad_cat_dict.keys() if string in broad_cat_dict[s]][0]

df_UCF_eval['broad_cat']=df_UCF_eval['NTEE1'].apply(ntee2cat)

# Create sentence and encoded label lists
sentences = df_UCF_eval.input.values
# labels should be numbers AND continious.
# "input for criterion should satisfy t >= 0 && t < n_classes"
# https://github.com/pytorch/pytorch/issues/1204#issuecomment-326958795
labels = preprocessing.LabelEncoder().fit_transform(df_UCF_eval.broad_cat.values)
labels = torch.tensor(labels)

In [11]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

def func_encode_string(sent):
    encoded_dict = tokenizer_loaded.encode_plus(sent,                      # Sentence to encode.
                                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                        truncation='longest_first',
                                        padding='max_length',
                #                         max_length = 256,           # Pad & truncate all sentences.
                                        return_attention_mask = True,   # Construct attn. masks.
                                        return_tensors = 'pt',     # Return pytorch tensors.
                                               )
    return encoded_dict

encoded_outputs=Parallel(n_jobs=-1, backend="multiprocessing", batch_size='auto', verbose=1)(delayed(func_encode_string)(sent) for sent in sentences)
for encoded_output in encoded_outputs:
    # Add the encoded sentence to the list.
    input_ids.append(encoded_output['input_ids'])
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_output['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
print('Length of encoding:', len(input_ids[0]))

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1200 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 6560 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 13760 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 22560 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 32960 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 38607 out of 38607 | elapsed:   17.2s finished


Original:  GREENCROFT GOSHEN INC GREENCROFT GOSDEN PROVIDES ACTIVE , AFFORDABLE RETIREMENT LIVING WITH SUPPORTIVE SERVICES , ASSISTED LIVING , AND SKILLED NURSING CARE . CHARITY CARE FOR EYE JUNE 30 , 2014 AMOUNTED TO $ 4E2793922 . NURSING SERVICES - GREENCROFT GOSDEN IS HEALTH FACILITIES PROVIDE STATE-OF-THE-ART HEALTH CARE TECHNIQUES AND TECHNOLOGY IN AN INVITING AND SUPPORTIVE RESIDENTIAL ENVIRONMENT . OUR SETTING IS APPROPRIATE FOR INDIVIDUALS WHO ARE RECOVERING FROM SURGERY OR ILLNESS , NEED REHABILITATION , OR REQUIRE ONGOING LONG-TERM NURSING CARE . OUR RESTORATIVE CARE PHILOSOPHY REINFORCES THE HEALING AND REHABILITATION PROCESS AND PROMOTES INDEPENDENCE . A FULL RANGE OF ACTIVITIES , CHAPLAINCY , AND SOCIAL SERVICES ARE PROVIDED . WE PROVIDE OUTPATIENT PHYSICAL , OCCUPATIONAL , AND SPEECH/LANGUAGE THERAPIES FOR ALL AGES . WE EMPHASIZE EDUCATION AND COMMUNICATION WITH YOU , YOUR FAMILY , AND REFERRING PHYSICIANS . RESIDENTS WHO DO NOT MISUSE THEIR ASSETS MAY BE ELIGIBLE FOR SUP

In [18]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 320

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_data = TensorDataset(input_ids, attention_masks, labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

#### Start to predict

In [19]:
### Set up environment.

import numpy as np
import time
import datetime
import random

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [20]:
# Put the model_loaded in evaluation mode--the dropout layers behave differently
# during evaluation.
model_loaded.cuda()
model_loaded.eval()

# Tracking variables 
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
logits_all=[]
label_ids_all=[]

# Measure how long the training epoch takes.
t0 = time.time()

# Evaluate data for one epoch
for batch in tqdm(validation_dataloader):

    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Telling the model_loaded not to compute or store gradients, saving memory and
    # speeding up validation
    with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model_loaded` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_loaded_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model_loaded(b_input_ids, 
                               token_type_ids=None, 
                               attention_mask=b_input_mask)

    # Get the "logits" output by the model_loaded. The "logits" are the output
    # values prior to applying an activation function like the softmax.
    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    logits_all+=list(np.argmax(logits, axis=1))
    label_ids = b_labels.to('cpu').numpy()
    label_ids_all+=list(label_ids)

    # Calculate the accuracy for this batch of test sentences.
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    # Accumulate the total accuracy.
    eval_accuracy += tmp_eval_accuracy

    # Track the number of batches
    nb_eval_steps += 1

# Report the final accuracy for this validation run.
print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("  Validation took: {:}".format(format_time(time.time() - t0)))

100%|██████████| 121/121 [07:04<00:00,  3.51s/it]

  Accuracy: 0.90
  Validation took: 0:07:04





In [21]:
logits_all_letter=le_broad_cat.inverse_transform(logits_all)
label_ids_all_letter=le_broad_cat.inverse_transform(label_ids_all)

In [22]:
from imblearn.metrics import classification_report_imbalanced
print('========Correct label# error results, 2021-10-30========')
print(classification_report_imbalanced(y_true=label_ids_all_letter, y_pred=logits_all_letter, digits=4))
print(Counter(logits_all_letter))

                   pre       rec       spe        f1       geo       iba       sup

          I     0.9164    0.9173    0.9895    0.9168    0.9527    0.9011      4291
         II     0.9179    0.9056    0.9838    0.9117    0.9439    0.8840      6419
        III     0.8866    0.9199    0.9940    0.9030    0.9563    0.9077      1861
         IV     0.8962    0.8880    0.9870    0.8921    0.9362    0.8678      4329
         IX     0.9129    0.9300    0.9959    0.9214    0.9624    0.9201      1701
          V     0.9050    0.9143    0.9582    0.9096    0.9360    0.8722     11723
         VI     0.6744    0.6651    0.9963    0.6697    0.8141    0.6407       436
        VII     0.8994    0.8866    0.9790    0.8930    0.9317    0.8600      6749
       VIII     0.8145    0.8160    0.9946    0.8153    0.9009    0.7971      1098

avg / total     0.9007    0.9007    0.9777    0.9007    0.9382    0.8738     38607

Counter({'V': 11843, 'VII': 6653, 'II': 6333, 'I': 4295, 'IV': 4289, 'III': 1931, 'I

In [17]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_true=label_ids_all_letter, y_pred=logits_all_letter, digits=4))

                   pre       rec       spe        f1       geo       iba       sup

          I     0.9220    0.9170    0.9903    0.9195    0.9530    0.9015      4291
         II     0.9145    0.9084    0.9831    0.9114    0.9450    0.8863      6419
        III     0.8968    0.9151    0.9947    0.9059    0.9541    0.9030      1861
         IV     0.8989    0.8847    0.9874    0.8917    0.9347    0.8646      4329
         IX     0.9091    0.9353    0.9957    0.9221    0.9650    0.9257      1701
          V     0.9034    0.9176    0.9572    0.9105    0.9372    0.8749     11723
         VI     0.6742    0.6835    0.9962    0.6788    0.8252    0.6596       436
        VII     0.9047    0.8822    0.9803    0.8933    0.9300    0.8564      6749
       VIII     0.8166    0.8352    0.9945    0.8258    0.9114    0.8173      1098

avg / total     0.9019    0.9018    0.9776    0.9018    0.9387    0.8749     38607



In [18]:
df_eval=pd.DataFrame([logits_all_letter, label_ids_all_letter]).T.rename(columns={0:'pred', 1:'true'})
len(df_eval[df_eval.pred==df_eval.true])/len(df_eval)

0.9018053720827829

In [19]:
df_eval.sample(10)

Unnamed: 0,pred,true
34288,I,I
16221,IV,IV
26597,V,V
217,V,VIII
13326,I,I
19788,II,II
505,VII,VII
9538,IX,IX
15469,I,I
12212,VII,VII


In [17]:
df_eval.to_excel('../../output/df_kap_broad_cat_BERT.xlsx')

### Test on nteeConf random

#### Load file

In [24]:
df_UCF_eval=pd.read_pickle('../../dataset/UCF/test/df_ucf_test_nteeConf_B.pkl.bz2')
df_UCF_eval['input']= df_UCF_eval['TAXPAYER_NAME']+' '+df_UCF_eval['mission_spellchk']+' '+df_UCF_eval['prgrm_dsc_spellchk']

# Code as 10 broad categories.
broad_cat_dict={'I': ['A'],
                'II': ['B'],
                'III': ['C', 'D'],
                'IV': ['E', 'F', 'G', 'H'],
                'V': ['I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'],
                'VI': ['Q'],
                'VII': ['R', 'S', 'T', 'U', 'V', 'W'],
                'VIII': ['X'],
                'IX': ['Y'],
                'X': ['Z'],
               }
def ntee2cat(string):
    global broad_cat_dict
    return [s for s in broad_cat_dict.keys() if string in broad_cat_dict[s]][0]

df_UCF_eval['broad_cat']=df_UCF_eval['NTEE1'].apply(ntee2cat)

# Create sentence and encoded label lists
sentences = df_UCF_eval.input.values
# labels should be numbers AND continious.
# "input for criterion should satisfy t >= 0 && t < n_classes"
# https://github.com/pytorch/pytorch/issues/1204#issuecomment-326958795
labels = preprocessing.LabelEncoder().fit_transform(df_UCF_eval.broad_cat.values)
labels = torch.tensor(labels)

In [25]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in tqdm(sentences):
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer_loaded.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

100%|██████████| 5000/5000 [00:11<00:00, 451.93it/s] 

Original:  HARMONY FOUNDATION OF MILWAUKEE INC THE MISSION OF THE HARMONY FOUNDATION OF MILWAUKEE IS TO ESTABLISH AND OPERATE A SUSTAINABLE COMMUNITY GATHERING PLACE FOR DANCE PERFORMANCE , EDUCATION , OUTREACH , WELLNESS , AND RESEARCH PROGRAMMING . OUR VISION IS TO BURNISH MILWAUKEE IS NATIONAL REPUTATION IN THE ARTS THROUGH A WORLD-CLASS DESTINATION FOR DANCE , EDUCATION , AND WELLNESS . OUR FACILITY WILL BECOME A NATIONAL MODEL AND INSPIRED SPACE FOR COMMUNITY PROGRAMS PROVIDED BY THE MILWAUKEE BALLET COMPANY , THE UNIVERSITY OF WISCONSIN-MILWAUKEE PECK SCHOOL OF THE ARTS , AND THE MEDICAL COLLEGE OF WISCONSIN . NONE
Token IDs: tensor([  101,  9396,  3192,  1997,  9184,  4297,  1996,  3260,  1997,  1996,
         9396,  3192,  1997,  9184,  2003,  2000,  5323,  1998,  5452,  1037,
         9084,  2451,  7215,  2173,  2005,  3153,  2836,  1010,  2495,  1010,
        15641,  1010, 25860,  1010,  1998,  2470,  4730,  1012,  2256,  4432,
         2003,  2000,  6402,  4509,  9184,  2003




In [26]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_data = TensorDataset(input_ids, attention_masks, labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

#### Start to predict

In [27]:
### Set up environment.

import numpy as np
import time
import datetime
import random

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [28]:
# Put the model_loaded in evaluation mode--the dropout layers behave differently
# during evaluation.
model_loaded.cuda()
model_loaded.eval()

# Tracking variables 
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
logits_all=[]
label_ids_all=[]

# Measure how long the training epoch takes.
t0 = time.time()

# Evaluate data for one epoch
for batch in validation_dataloader:

    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Telling the model_loaded not to compute or store gradients, saving memory and
    # speeding up validation
    with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model_loaded` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_loaded_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model_loaded(b_input_ids, 
                               token_type_ids=None, 
                               attention_mask=b_input_mask)

    # Get the "logits" output by the model_loaded. The "logits" are the output
    # values prior to applying an activation function like the softmax.
    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    logits_all+=list(np.argmax(logits, axis=1))
    label_ids = b_labels.to('cpu').numpy()
    label_ids_all+=list(label_ids)

    # Calculate the accuracy for this batch of test sentences.
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    # Accumulate the total accuracy.
    eval_accuracy += tmp_eval_accuracy

    # Track the number of batches
    nb_eval_steps += 1

# Report the final accuracy for this validation run.
print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("  Validation took: {:}".format(format_time(time.time() - t0)))

  Accuracy: 0.68
  Validation took: 0:00:24


In [29]:
logits_all_letter=preprocessing.LabelEncoder().fit(df_UCF_eval.broad_cat.values).inverse_transform(logits_all)
label_ids_all_letter=preprocessing.LabelEncoder().fit(df_UCF_eval.broad_cat.values).inverse_transform(label_ids_all)

In [30]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_true=label_ids_all_letter, y_pred=logits_all_letter, digits=4))

                   pre       rec       spe        f1       geo       iba       sup

          I     0.5912    0.8459    0.9603    0.6960    0.9013    0.8030       318
         II     0.6310    0.6287    0.9551    0.6298    0.7749    0.5809       544
        III     0.6476    0.7158    0.9846    0.6800    0.8395    0.6858       190
         IV     0.7108    0.7154    0.9592    0.7131    0.8284    0.6695       615
         IX     0.4478    0.6122    0.9925    0.5172    0.7795    0.5846        49
          V     0.7542    0.7481    0.8528    0.7511    0.7988    0.6313      1882
         VI     0.4451    0.5448    0.9813    0.4899    0.7312    0.5113       134
        VII     0.6361    0.5469    0.9347    0.5882    0.7150    0.4914       863
       VIII     0.6976    0.5753    0.9780    0.6306    0.7501    0.5400       405

avg / total     0.6848    0.6806    0.9180    0.6799    0.7875    0.6083      5000



In [31]:
df_eval=pd.DataFrame([logits_all_letter, label_ids_all_letter]).T.rename(columns={0:'pred', 1:'true'})
len(df_eval[df_eval.pred==df_eval.true])/len(df_eval)

0.6806

In [21]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_true=label_ids_all_letter, y_pred=logits_all_letter, digits=4))

                   pre       rec       spe        f1       geo       iba       sup

          I     0.8139    0.8880    0.9806    0.8493    0.9331    0.8627      3812
         II     0.8318    0.8265    0.9731    0.8291    0.8968    0.7925      6047
        III     0.8136    0.8462    0.9908    0.8296    0.9157    0.8263      1970
         IV     0.8114    0.8323    0.9747    0.8218    0.9007    0.7998      5046
         IX     0.8245    0.9042    0.9948    0.8626    0.9484    0.8913      1159
          V     0.8293    0.8478    0.9124    0.8384    0.8795    0.7685     14595
         VI     0.4933    0.6324    0.9867    0.5543    0.7899    0.6019       876
        VII     0.8046    0.7220    0.9613    0.7611    0.8331    0.6774      7906
       VIII     0.7563    0.6260    0.9890    0.6850    0.7869    0.5967      2265

avg / total     0.8104    0.8094    0.9540    0.8087    0.8775    0.7608     43676



In [22]:
df_eval=pd.DataFrame([logits_all_letter, label_ids_all_letter]).T.rename(columns={0:'pred', 1:'true'})
len(df_eval[df_eval.pred==df_eval.true])/len(df_eval)

0.809391885703819

In [23]:
df_eval.sample(10)

Unnamed: 0,pred,true
24838,V,V
38764,III,V
36186,I,VII
14734,VII,VII
7613,II,II
27437,VII,VII
22718,VI,VI
39176,VII,VII
4850,V,V
30641,VIII,VIII
