In [0]:
#set up environment
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import *
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import time
import datetime
import random
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
from sklearn import preprocessing

In [0]:
################################### Define functions ##########################
def npoclass(string_input):
    
    #define local functions
    #this function will check whether there is GPU acceleration avaliable. 
    #If there is, the function will let the user choose whether the program should use the GPU or CPU.
    #If not, the program will automatically use CPU
    
    def device_selection():
        if torch.cuda.is_available():    

            print('There are %d GPU(s) available.' % torch.cuda.device_count())

            print('Do you want to the GPU accelaration?')

            selection = input("please enter 'yes' or 'no.'")

            if selection == 'yes':

                device = torch.device('cuda')
                print('Using GPU:',torch.cuda.get_device_name(0))

            elif selection =='no':

                print('Using the CPU instead.')
                device = torch.device("cpu")

            elif selection != 'no' or selection != 'yes':
                print("Wrong selection.\n\n")

                device_selection()

        else:
            print('No GPU available, using the CPU instead.')
            device = torch.device("cpu")
        return device

    #this local function will load a pretrained model and make prediction. 
    def load_model(dir_path, labels):
        #load a pretrained model
        model_loaded = BertForSequenceClassification.from_pretrained(dir_path) 
        tokenizer_loaded = BertTokenizer.from_pretrained(dir_path)  

        # Tokenize all of the sentences and map the tokens to thier word IDs.
        input_ids = []
        attention_masks = []

        # For every sentence...
        for sent in tqdm(string_input):

            encoded_dict = tokenizer_loaded.encode_plus(
                                sent,                      
                                add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                max_length = 256,           # Pad & truncate all sentences.
                                pad_to_max_length = True,
                                return_attention_mask = True,   # Construct attn. masks.
                                return_tensors = 'pt',     # Return pytorch tensors.
                           )

            # Add the encoded sentence to the list.    
            input_ids.append(encoded_dict['input_ids'])

            # And its attention mask (simply differentiates padding from non-padding).
            attention_masks.append(encoded_dict['attention_mask'])

        # Convert the lists into tensors.
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)

        #prepare dataloader
        batch_size = 32
        test_data = TensorDataset(input_ids, attention_masks)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

        #this local function will setup timer for recording prediction time.
        def format_time(elapsed):
                '''
                Takes a time in seconds and returns a string hh:mm:ss
                '''
                # Round to the nearest second.
                elapsed_rounded = int(round((elapsed)))

                # Format as hh:mm:ss
                return str(datetime.timedelta(seconds=elapsed_rounded))

        #if the use choose to use GPU, the model will be load to GPU.
        if device == 'cuda':
            # Set the seed value all over the place to make this reproducible.
            seed_val = 42
            random.seed(seed_val)
            np.random.seed(seed_val)
            torch.manual_seed(seed_val)
            torch.cuda.manual_seed_all(seed_val)
            #load model to GPU
            model_loaded.cuda()

        logits_all=[]
        label_ids_all=[]
        logits_argmax=[]
        logits_amax=[]
        t0 = time.time()

        for batch in test_dataloader:

            # Add batch to the pre-chosen device
            batch = tuple(t.to(device) for t in batch)

            b_input_ids, b_input_mask = batch

            with torch.no_grad():        

                outputs = model_loaded(b_input_ids, 
                                       token_type_ids=None, 
                                       attention_mask=b_input_mask)


            logits = outputs[0]
            
            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            
            #find the max value of logits
            logits_amax += list(np.amax(logits, axis=1))
            logits_argmax += list(np.argmax(logits, axis=1))
        
        print("  Prediction took: {:}".format(format_time(time.time() - t0)))
        #use softmax to calculate the probability
        propability=tf.nn.softmax(logits_amax)
        
        #convert propability to a numpy array
        propability= tf.constant(propability).numpy()
        #convert logits to labels
        logits_all_letter=preprocessing.LabelEncoder().fit(labels).inverse_transform(logits_argmax)

        return logits_all_letter,propability

    #setup classification labels
    broad_cat = ['I','II','III','IV','V','VI',
                    'VII','VIII','IX','X']

    major_group= ['A','B','C', 'D','E', 'F', 'G', 'H','I', 'J', 'K', 'L', 'M', 'N', 
                  'O', 'P','Q','R', 'S', 'T', 'U', 'V', 'W','X','Y', 'Z']
    
    device = device_selection()
    
    #run the loading model function with major group classification
    major_group_label, major_group_prob = load_model(dir_path='new_classifier_mg/', labels = major_group)
    #run the loading model function with broad category classification
    broad_category_label, broad_category_prob = load_model(dir_path='new_classifier_bc/', labels = broad_cat)
    
    return major_group_label, major_group_prob, broad_category_label, broad_category_prob

# Test the API

In [0]:
# preparing test dataset
#we will test 10 samples
df_UCF_test=pd.concat([pd.read_pickle('test/df_ucf_test.pkl.gz', compression='gzip')],ignore_index=True)
df_UCF_test = df_UCF_test.sample(10)
df_UCF_test['input']= df_UCF_test['TAXPAYER_NAME']+' '+df_UCF_test['mission_spellchk']+' '+df_UCF_test['prgrm_dsc_spellchk']
string_input = df_UCF_test.input.values

#prepare labels
broad_cat_dict={'I': ['A'],
                'II': ['B'],
                'III': ['C', 'D'],
                'IV': ['E', 'F', 'G', 'H'],
                'V': ['I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'],
                'VI': ['Q'],
                'VII': ['R', 'S', 'T', 'U', 'V', 'W'],
                'VIII': ['X'],
                'IX': ['Y'],
                'X': ['Z'],
               }
def ntee2cat(string):
    global broad_cat_dict
    return [s for s in broad_cat_dict.keys() if string in broad_cat_dict[s]][0]
#add true broad_cat labels to the test dataset
df_UCF_test['broad_cat']=df_UCF_test['NTEE1'].apply(ntee2cat)

In [0]:
major_group_label, major_group_prob, broad_category_label, broad_category_prob = npoclass(string_input)

No GPU available, using the CPU instead.


100%|██████████| 10/10 [00:00<00:00, 445.29it/s]


  Prediction took: 0:00:01


100%|██████████| 10/10 [00:00<00:00, 443.59it/s]


  Prediction took: 0:00:01


In [0]:
print('broad category_label:', broad_category_label)
print('broad category probability:', broad_category_prob)
print('major_group_label:', major_group_label)
print('major_group_probability:', major_group_prob)

broad category_label: ['V' 'II' 'V' 'I' 'V' 'VII' 'V' 'VII' 'II' 'I']
broad category probability: [0.12579422 0.01758762 0.09651593 0.03289149 0.00776044 0.17845915
 0.11803251 0.16834667 0.11491693 0.13969506]
major_group_label: ['K' 'B' 'M' 'A' 'A' 'S' 'J' 'W' 'B' 'A']
major_group_probability: [0.05267935 0.00246418 0.03420285 0.00445164 0.00111741 0.20181774
 0.03183997 0.25103763 0.1722936  0.24809568]


In [0]:
#see true broad category labels
df_UCF_test['broad_cat'].values
#got one prediction wrong

array(['V', 'III', 'V', 'I', 'V', 'VII', 'V', 'VII', 'II', 'I'],
      dtype=object)

In [0]:
t_major=pd.DataFrame([major_group_label, major_group_prob]).T.rename(columns={0:'pred', 1:'prob'})

In [0]:
t_major

Unnamed: 0,pred,prob
0,K,0.0526793
1,B,0.00246418
2,M,0.0342029
3,A,0.00445164
4,A,0.00111741
5,S,0.201818
6,J,0.03184
7,W,0.251038
8,B,0.172294
9,A,0.248096


Unnamed: 0,pred,pro
0,S,0.0681582
1,S,0.162789
2,E,0.095232
3,E,0.0291983
4,M,0.164544
5,E,0.0182485
6,B,0.18929
7,C,0.0552726
8,A,0.211201
9,X,0.00606566
