In [1]:
#set up environment
import os, torch, pickle, warnings, random, joblib, math, itertools
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, BertTokenizer
from tqdm import tqdm
import tensorflow as tf
warnings.filterwarnings("ignore")
from time import sleep
from joblib import Parallel, delayed
from tlz import partition_all
from multiprocessing import Pool

################################### Define functions ##########################
def npoclass(inputs, gpu_core=True, model_path=None, ntee_type='bc', n_jobs=4, backend='multiprocessing'):
    """Predict NTEE categories for one or more texts with a fine-tuned BERT classifier.

    Parameters
    ----------
    inputs : str or list/tuple of str
        A single text, or a sequence of texts, to classify.
    gpu_core : bool, default True
        Run the model on CUDA when it is available; otherwise the CPU is used.
    model_path : str
        Directory containing the model, the tokenizer and the label-encoder
        pickle. Required (the default ``None`` raises ``ValueError``).
    ntee_type : {'bc', 'mg'}, default 'bc'
        'bc' reads ``le_broad_cat.pkl`` (broad category), 'mg' reads
        ``le_major_group.pkl`` (major group).
    n_jobs : int, default 4
        Number of workers for ``backend='multiprocessing'``; unused otherwise.
    backend : {'multiprocessing', 'sequential', 'dask'}
        How a sequence of inputs is tokenized. Ignored when ``inputs`` is a
        single string. 'dask' needs a connected ``dask.distributed.Client``
        bound to the global name ``client``.

    Returns
    -------
    list of dict
        One dict per input with keys 'recommended' (predicted label),
        'confidence' ('high (>=.99)', 'medium (<.99|>=.95)' or 'low (<.95)')
        and 'probabilities' (label -> sigmoid of that label's logit; the
        values are computed per label and need not sum to 1).
        An empty sequence returns ``[]`` without loading the model.

    Raises
    ------
    ValueError
        ``model_path`` is None, ``ntee_type`` is unknown, or ``backend`` is
        unknown for a sequence input.
    TypeError
        ``inputs`` is neither a string nor a list/tuple.
    RuntimeError
        ``backend='dask'`` without a usable global ``client``.

    Notes
    -----
    The model, tokenizer and label encoder are cached in the module globals
    ``model_loaded``, ``tokenizer_loaded`` and ``label_encoder`` and are
    reloaded whenever ``model_path`` or ``ntee_type`` differs from the
    previous call.
    """
    # Check model files.
    if ntee_type=='bc' and model_path is None:
        raise ValueError("Make sure model files/path are correct. Please download from https://jima.me/open/npoclass_model_bc.zip, unzip, and specifiy model_path (default set to None).")
    if ntee_type=='mg' and model_path is None:
        raise ValueError("Make sure model files/path are correct. Please download from https://jima.me/open/npoclass_model_mg.zip, unzip, and specifiy model_path (default set to None).")

    # Check ntee type.
    if ntee_type=='bc':
        le_file_name='le_broad_cat.pkl'
    elif ntee_type=='mg':
        le_file_name='le_major_group.pkl'
    else:
        raise ValueError("ntee_type must be 'bc' (broad category) or 'mg' (major group)")

    # Validate inputs up front so mistakes fail fast, before any model is loaded.
    single_string = isinstance(inputs, str)
    if single_string:
        text_strings = [inputs]
    elif isinstance(inputs, (list, tuple)):
        text_strings = list(inputs)
    else:
        raise TypeError("inputs must be a string or a list of strings, got %s" % type(inputs).__name__)
    # The backend only matters for sequences; a single string is always encoded directly.
    if not single_string and backend not in ('multiprocessing', 'sequential', 'dask'):
        raise ValueError("backend must be 'multiprocessing', 'sequential' or 'dask', got %r" % (backend,))
    if not text_strings:
        return []

    # Set the seed value all over the place to make this reproducible.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)

    # Read model and label encoder unless the requested ones are already cached.
    global model_loaded, tokenizer_loaded, label_encoder, _npoclass_cache_key
    cache_key = (model_path, le_file_name)
    module_state = globals()
    cache_is_valid = module_state.get('_npoclass_cache_key') == cache_key and all(
        module_state.get(cached_name) is not None
        for cached_name in ('model_loaded', 'tokenizer_loaded', 'label_encoder')
    )
    if not cache_is_valid:
        # Load a pretrained model and tokenizer.
        model_loaded = BertForSequenceClassification.from_pretrained(model_path)
        tokenizer_loaded = BertTokenizer.from_pretrained(model_path)
        # Read label encoder. os.path.join works with or without a trailing slash.
        # SECURITY: pickle.load can execute arbitrary code; only use model
        # directories obtained from a trusted source.
        with open(os.path.join(model_path, le_file_name), 'rb') as label_encoder_pkl:
            label_encoder = pickle.load(label_encoder_pkl)
        _npoclass_cache_key = cache_key

    # Select acceleration method.
    if gpu_core==True and torch.cuda.is_available():
        print('There are %d GPU(s) available.' % torch.cuda.device_count(), 'Using GPU:',torch.cuda.get_device_name(0))
        torch.cuda.manual_seed_all(seed_val)
        device = torch.device('cuda')
        model_loaded.cuda()
    else:
        print('No GPU acceleration available or gpu_core=False, using CPU.')
        device = torch.device('cpu')
        model_loaded.cpu()
    print('Encoding inputs ...')
    sleep(.5) # Pause briefly for better printing results.

    # Encode inputs.
    global func_encode_string, func_encode_string_batch # Define as global, otherwise cannot pickle or very slow.
    def func_encode_string(text_string):
        """Tokenize one text into fixed-length (256) input ids and attention mask."""
        encoded_dict = tokenizer_loaded.encode_plus(text_string,
                                                    add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                                    max_length = 256,           # Pad & truncate all sentences.
                                                    truncation=True,
                                                    padding = 'max_length',    # Replaces deprecated pad_to_max_length=True.
                                                    return_attention_mask = True,   # Construct attn. masks.
                                                    return_tensors = 'pt',     # Return pytorch tensors.
                                                   )
        return encoded_dict
    def func_encode_string_batch(text_strings):
        """Tokenize a chunk of texts; used so each dask task handles many texts."""
        return [func_encode_string(text_string) for text_string in text_strings]

    # Tokenize all of the sentences and map the tokens to their word IDs.
    if single_string:
        encoded_outputs = [func_encode_string(inputs)]
    elif backend=='multiprocessing': # Multiprocessing is faster than loky in processing large objects.
        encoded_outputs = Parallel(n_jobs=n_jobs, backend="multiprocessing", batch_size='auto', verbose=1)(
            delayed(func_encode_string)(text_string) for text_string in text_strings)
    elif backend=='sequential':
        encoded_outputs = [func_encode_string(text_string) for text_string in tqdm(text_strings)]
    else: # backend=='dask'
        dask_client = globals().get('client')
        if dask_client is None:
            raise RuntimeError("backend='dask' requires a connected dask.distributed Client bound to the global name 'client'.")
        n_workers = len(dask_client.scheduler_info()['workers']) # Number of dask workers.
        if n_workers < 1:
            raise RuntimeError("backend='dask' found no workers attached to the scheduler.")
        # Split the inputs into roughly one chunk per worker.
        string_chunks = partition_all(math.ceil(len(text_strings)/n_workers), text_strings)
        with joblib.parallel_backend('dask'):
            chunked_outputs = Parallel(n_jobs=-1, batch_size='auto', verbose=1)(
                delayed(func_encode_string_batch)(chunk) for chunk in string_chunks)
        encoded_outputs = list(itertools.chain.from_iterable(chunked_outputs))

    # Convert the per-text encodings into two stacked tensors.
    # The attention mask simply differentiates padding from non-padding.
    input_ids = torch.cat([encoded_output['input_ids'] for encoded_output in encoded_outputs], dim=0)
    attention_masks = torch.cat([encoded_output['attention_mask'] for encoded_output in encoded_outputs], dim=0)

    # Prepare dataloader for efficient calculation.
    batch_size = 64
    pred_data = TensorDataset(input_ids, attention_masks)
    pred_sampler = SequentialSampler(pred_data)
    pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=batch_size)

    # Start prediction.
    model_loaded.eval()
    logits_all=[]
    print('Predicting categories ...')
    sleep(.5) # Pause briefly for better printing results.
    for batch in tqdm(pred_dataloader, mininterval=10):
        # Add batch to the pre-chosen device
        b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model_loaded(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits_all+=outputs[0].tolist()

    # Calculate probabilities of logits (element-wise sigmoid).
    logits_prob=tf.nn.sigmoid(logits_all).numpy().tolist()
    # Find the positions of max values in logits.
    logits_max=np.argmax(logits_prob, axis=1)
    # Transfer to labels.
    logits_labels=label_encoder.inverse_transform(logits_max)

    # Compile results to be returned.
    class_labels = label_encoder.classes_
    result_list=[]
    for recommended_label, best_index, probabilities in zip(logits_labels, logits_max, logits_prob):
        conf_prob = probabilities[best_index]
        if conf_prob>=.99:
            confidence = 'high (>=.99)'
        elif conf_prob>=.95:
            confidence = 'medium (<.99|>=.95)'
        else:
            confidence = 'low (<.95)'
        result_list.append({'recommended': recommended_label,
                            'confidence': confidence,
                            'probabilities': dict(zip(class_labels, probabilities))})

    return result_list

In [4]:
import requests
# Download the project page and keep at most its first 100,000 characters
# as one long input string for the runs below.
repo_page = requests.get('https://github.com/ma-ji/npo_classifier')
string = repo_page.text[:100000]

In [None]:
# Classify 2,000 copies of the long test string, tokenizing them one by one
# in the current process (backend='sequential').
# Original note on this cell: ">1h" (presumably the observed run time).
t = npoclass(
    [string] * 2000,
    gpu_core=True,
    model_path='../../npoclass_model_bc/',
    backend='sequential',
)

In [3]:
# Classify 2,000 copies of the long test string with the default backend
# and 40 parallel tokenization jobs.
t = npoclass(
    [string] * 2000,
    gpu_core=True,
    model_path='../../npoclass_model_bc/',
    n_jobs=40,
)

There are 1 GPU(s) available. Using GPU: Quadro RTX 6000
Encoding inputs ...


[Parallel(n_jobs=40)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done  10 tasks      | elapsed:    5.7s
[Parallel(n_jobs=40)]: Done 160 tasks      | elapsed:   17.9s
[Parallel(n_jobs=40)]: Done 410 tasks      | elapsed:   39.3s
[Parallel(n_jobs=40)]: Done 760 tasks      | elapsed:  1.2min
[Parallel(n_jobs=40)]: Done 1210 tasks      | elapsed:  1.8min
[Parallel(n_jobs=40)]: Done 1760 tasks      | elapsed:  2.6min
[Parallel(n_jobs=40)]: Done 2000 out of 2000 | elapsed:  2.9min finished


Predicting categories ...


100%|██████████| 32/32 [00:09<00:00,  3.41it/s]


In [8]:
# Same call as the previous 40-job run: 2,000 copies of the long test string,
# default backend, 40 parallel tokenization jobs.
t = npoclass(
    [string] * 2000,
    gpu_core=True,
    model_path='../../npoclass_model_bc/',
    n_jobs=40,
)

There are 1 GPU(s) available. Using GPU: Quadro RTX 6000
Encoding inputs ...


[Parallel(n_jobs=40)]: Using backend MultiprocessingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done  10 tasks      | elapsed:    3.9s
[Parallel(n_jobs=40)]: Done 160 tasks      | elapsed:   18.1s
[Parallel(n_jobs=40)]: Done 410 tasks      | elapsed:   44.2s
[Parallel(n_jobs=40)]: Done 760 tasks      | elapsed:  1.3min
[Parallel(n_jobs=40)]: Done 1210 tasks      | elapsed:  2.1min
[Parallel(n_jobs=40)]: Done 1760 tasks      | elapsed:  3.1min
[Parallel(n_jobs=40)]: Done 2000 out of 2000 | elapsed:  3.5min finished


Predicting categories ...


100%|██████████| 32/32 [00:09<00:00,  3.40it/s]


In [7]:
# Classify 2,000 copies of the long test string, tokenizing on a dask cluster.
# The dask branch of `npoclass` reads a global named `client`, so the cell that
# creates the dask Client must have been run before this one.
t = npoclass(
    [string] * 2000,
    gpu_core=True,
    model_path='../../npoclass_model_bc/',
    backend='dask',
)

There are 1 GPU(s) available. Using GPU: Quadro RTX 6000
Encoding inputs ...


[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 120 concurrent workers.
[Parallel(n_jobs=-1)]: Done 118 out of 118 | elapsed:  1.6min finished


Predicting categories ...


100%|██████████| 32/32 [00:09<00:00,  3.40it/s]


In [4]:
from dask.distributed import Client
# Connect to the dask scheduler; `npoclass(..., backend='dask')` looks this
# object up under the global name `client`.
# NOTE(review): the scheduler address is hard-coded to one specific host.
client = Client(address="10.140.82.44:8786")
# Bare last expression: display the client summary.
client

0,1
Client  Scheduler: tcp://10.140.82.44:8786  Dashboard: http://10.140.82.44:8787/status,Cluster  Workers: 120  Cores: 120  Memory: 240.00 GB


In [10]:
# Show the first three result dicts.
t[:3]

[{'recommended': 'VII',
  'confidence': 'high (>=.99)',
  'probabilities': {'I': 0.7859950661659241,
   'II': 0.6022496223449707,
   'III': 0.6815308928489685,
   'IV': 0.7988898158073425,
   'IX': 0.46323516964912415,
   'V': 0.7328329086303711,
   'VI': 0.42189300060272217,
   'VII': 0.9975772500038147,
   'VIII': 0.2866130769252777}},
 {'recommended': 'VII',
  'confidence': 'high (>=.99)',
  'probabilities': {'I': 0.7859950661659241,
   'II': 0.6022496223449707,
   'III': 0.6815308928489685,
   'IV': 0.7988898158073425,
   'IX': 0.46323516964912415,
   'V': 0.7328329086303711,
   'VI': 0.42189300060272217,
   'VII': 0.9975772500038147,
   'VIII': 0.2866130769252777}},
 {'recommended': 'VII',
  'confidence': 'high (>=.99)',
  'probabilities': {'I': 0.7859950661659241,
   'II': 0.6022496223449707,
   'III': 0.6815308928489685,
   'IV': 0.7988898158073425,
   'IX': 0.46323516964912415,
   'V': 0.7328329086303711,
   'VI': 0.42189300060272217,
   'VII': 0.9975772500038147,
   'VIII': 0.

In [5]:
# Classify a single short description; a plain string skips the parallel backends.
t = npoclass(
    'educators service, environment tree protection',
    gpu_core=True,
    model_path='../../npoclass_model_bc/',
)

There are 1 GPU(s) available. Using GPU: Quadro RTX 6000
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:00<00:00, 23.73it/s]


In [6]:
# Display the current contents of `t` (the most recent npoclass result).
t

[{'recommended': 'II',
  'confidence': 'high (>=.99)',
  'probabilities': {'I': 0.5053212642669678,
   'II': 0.9996891021728516,
   'III': 0.7522097826004028,
   'IV': 0.605323076248169,
   'IX': 0.20629839599132538,
   'V': 0.9766567945480347,
   'VI': 0.2705982029438019,
   'VII': 0.8041078448295593,
   'VIII': 0.3203430771827698}}]

In [6]:
# Classify a single description with the broad-category ('bc') label set.
# model_path is passed explicitly: the `npoclass` defined in this notebook's
# first cell raises ValueError when model_path is left at its default of None.
t = npoclass(
    'educators service, environment tree protection',
    gpu_core=True,
    model_path='../../npoclass_model_bc/',
    ntee_type='bc',
)

There are 1 GPU(s) available. Using GPU: Quadro RTX 6000
Encoding inputs ...
Predicting categories ...


100%|██████████| 1/1 [00:00<00:00, 31.94it/s]


## Test API

In [1]:
import requests
# Fetch the published API script and execute it in this notebook's namespace.
# SECURITY: exec() runs whatever the URL returns at the time of the request;
# review the downloaded source (or pin it to a specific commit) before running.
api_source = requests.get('https://raw.githubusercontent.com/ma-ji/npo_classifier/master/API/npoclass.py').text
exec(api_source)

In [2]:
# Download the published API script again and display its source text.
api_response = requests.get('https://raw.githubusercontent.com/ma-ji/npo_classifier/master/API/npoclass.py')
api_response.text



In [3]:
# Load the UCF test split.
# NOTE: read_pickle can execute arbitrary code; only load trusted files.
df_UCF_eval = pd.read_pickle('../dataset/UCF/test/df_ucf_test.pkl.gz')

# Build the model input: name, mission and program description joined by single spaces.
taxpayer_name = df_UCF_eval['TAXPAYER_NAME']
mission_text = df_UCF_eval['mission_spellchk']
program_text = df_UCF_eval['prgrm_dsc_spellchk']
df_UCF_eval['input'] = taxpayer_name + ' ' + mission_text + ' ' + program_text

# Mapping from each of the 10 broad categories to the NTEE letter codes it contains.
broad_cat_dict = {
    'I': list('A'),
    'II': list('B'),
    'III': list('CD'),
    'IV': list('EFGH'),
    'V': list('IJKLMNOP'),
    'VI': list('Q'),
    'VII': list('RSTUVW'),
    'VIII': list('X'),
    'IX': list('Y'),
    'X': list('Z'),
}


def ntee2cat(string):
    """Return the broad category whose letter list contains `string`.

    Reads the module-level `broad_cat_dict`. Raises IndexError when no
    category lists the given code.
    """
    matching_categories = []
    for category, ntee_letters in broad_cat_dict.items():
        if string in ntee_letters:
            matching_categories.append(category)
    # First match wins; an empty list means the code is unknown (IndexError).
    return matching_categories[0]

# Translate each record's NTEE1 letter into its broad category.
ntee_letters = df_UCF_eval['NTEE1']
df_UCF_eval['broad_cat'] = ntee_letters.apply(ntee2cat)

# Plain Python list of the input texts to pass to the classifier.
sentences = df_UCF_eval['input'].values.tolist()

In [4]:
# Classify every test sentence, tokenizing with the multiprocessing backend.
eval_results = npoclass(
    sentences,
    model_path='../../npoclass_model_bc/',
    backend='multiprocessing',
)

There are 1 GPU(s) available. Using GPU: Quadro RTX 6000
Encoding inputs ...


[Parallel(n_jobs=4)]: Using backend MultiprocessingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 312 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 9720 tasks      | elapsed:    6.1s
[Parallel(n_jobs=4)]: Done 25720 tasks      | elapsed:   16.9s
[Parallel(n_jobs=4)]: Done 38607 out of 38607 | elapsed:   24.8s finished


Predicting categories ...


100%|██████████| 604/604 [03:05<00:00,  3.26it/s]


In [5]:
from imblearn.metrics import classification_report_imbalanced
# Compare the recommended labels with the true broad categories.
predicted_labels = [result['recommended'] for result in eval_results]
report_text = classification_report_imbalanced(
    y_true=df_UCF_eval.broad_cat,
    y_pred=predicted_labels,
    digits=4,
)
print(report_text)

                   pre       rec       spe        f1       geo       iba       sup

          I     0.9220    0.9170    0.9903    0.9195    0.9530    0.9015      4291
         II     0.9145    0.9084    0.9831    0.9114    0.9450    0.8863      6419
        III     0.8968    0.9151    0.9947    0.9059    0.9541    0.9030      1861
         IV     0.8989    0.8847    0.9874    0.8917    0.9347    0.8646      4329
         IX     0.9091    0.9353    0.9957    0.9221    0.9650    0.9257      1701
          V     0.9034    0.9176    0.9572    0.9105    0.9372    0.8749     11723
         VI     0.6742    0.6835    0.9962    0.6788    0.8252    0.6596       436
        VII     0.9047    0.8822    0.9803    0.8933    0.9300    0.8564      6749
       VIII     0.8166    0.8352    0.9945    0.8258    0.9114    0.8173      1098

avg / total     0.9019    0.9018    0.9776    0.9018    0.9387    0.8749     38607



#### Original test scores (READ ONLY).

In [4]:
# Historical call kept as a read-only record of the original test run.
# NOTE(review): the `npoclass` defined in this notebook's first cell raises
# ValueError when model_path is left at its default of None, so this cell
# cannot be re-run as-is against that definition.
eval_results=npoclass(sentences)

There are 1 GPU(s) available. Using GPU: Quadro RTX 6000
Encoding inputs ...


100%|██████████| 38607/38607 [01:31<00:00, 419.77it/s]


Predicting categories ...


100%|██████████| 1207/1207 [03:15<00:00,  6.17it/s]


In [7]:
from imblearn.metrics import classification_report_imbalanced
# Compare the recommended labels with the true broad categories.
predicted_labels = [result['recommended'] for result in eval_results]
report_text = classification_report_imbalanced(
    y_true=df_UCF_eval.broad_cat,
    y_pred=predicted_labels,
    digits=4,
)
print(report_text)

                   pre       rec       spe        f1       geo       iba       sup

          I     0.9220    0.9170    0.9903    0.9195    0.9530    0.9015      4291
         II     0.9145    0.9084    0.9831    0.9114    0.9450    0.8863      6419
        III     0.8968    0.9151    0.9947    0.9059    0.9541    0.9030      1861
         IV     0.8989    0.8847    0.9874    0.8917    0.9347    0.8646      4329
         IX     0.9091    0.9353    0.9957    0.9221    0.9650    0.9257      1701
          V     0.9034    0.9176    0.9572    0.9105    0.9372    0.8749     11723
         VI     0.6742    0.6835    0.9962    0.6788    0.8252    0.6596       436
        VII     0.9047    0.8822    0.9803    0.8933    0.9300    0.8564      6749
       VIII     0.8166    0.8352    0.9945    0.8258    0.9114    0.8173      1098

avg / total     0.9019    0.9018    0.9776    0.9018    0.9387    0.8749     38607

