# Script purpose
- Notebook for developing `npoclass`

In [1]:
# # Force using CPU.
# # https://stackoverflow.com/questions/40690598/can-keras-with-tensorflow-backend-be-forced-to-use-cpu-or-gpu-at-will
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Obtain reproducible results: seed every random number generator used by the
# stack (NumPy, core Python, TensorFlow) before any model code runs.

import numpy as np
import tensorflow as tf
import random as rn

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(42)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

rn.seed(12345)

# Forcing TensorFlow to use a single thread removes a potential source of
# non-reproducible results, but that configuration is disabled here:
# For further details, see: https://stackoverflow.com/questions/42022950/
# session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
#                               inter_op_parallelism_threads=1)
# Use all threads: a default ConfigProto sets no thread limits, so results may
# differ slightly between runs despite the seeds above.
session_conf = tf.ConfigProto()

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(1234)

# Create the session on the default graph with the configuration above and
# register it as the session Keras will use.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Rest of code follows ...

# Check GPU device.
# NOTE(review): `_get_available_gpus` is a private Keras backend helper
# (leading underscore) and may not exist in other Keras versions.
print(K.tensorflow_backend._get_available_gpus())



[]


Using TensorFlow backend.


In [2]:
# Dependencies for loading the trained models and preparing text input.
from spellchecker import SpellChecker
from keras.preprocessing.sequence import pad_sequences
import pickle
from keras.models import load_model
import string
import nltk
# Fetch the NLTK stopword corpus (a no-op when it is already up to date).
nltk.download('stopwords')
from nltk.corpus import stopwords
# Upper-cased English stopwords plus every ASCII punctuation character.
# NOTE(review): `stop_list` is not referenced by any cell visible in this notebook.
stop_list=[str.upper(s) for s in stopwords.words('english')+list(string.punctuation)]
# from multiprocessing import Pool # Consider multiprocessing later.
import warnings
# Silence all warnings for the rest of the session (this also hides deprecation notices).
warnings.simplefilter('ignore')

import numpy as np
from keras.utils import np_utils
import os
import pandas as pd
import tensorflow as tf

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Test the loaded models and tokenizers.

In [3]:
# Load the two trained Keras classifiers (broad category and major group).
model_broad_cat=load_model('../output/broad_category_model.h5')
model_major_group=load_model('../output/major_group_model.h5')
# Load the pickled tokenizer and the label binarizers for both label sets.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only load
# artefacts produced by this project.
with open('../output/tokenizer.pkl', 'rb') as tokenizer_pkl:
    tokenizer = pickle.load(tokenizer_pkl)
with open('../output/lb_broad_cat.pkl', 'rb') as lb_broad_cat_pkl:
    lb_broad_cat = pickle.load(lb_broad_cat_pkl)
with open('../output/lb_major_group.pkl', 'rb') as lb_major_group_pkl:
    lb_major_group = pickle.load(lb_major_group_pkl)





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


#### Load UCF files

In [4]:
# Read every gzip-compressed pickle in the UCF test directory into one DataFrame.
test_file_path='../dataset/UCF/test/'
# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of df_test (and any positional slice taken from it later) reproducible.
file_list=sorted(os.listdir(test_file_path))
test_frames=[pd.read_pickle(os.path.join(test_file_path, file), compression='gzip')
             for file in file_list]
# Concatenate once instead of growing the frame inside the loop (which re-copies
# all accumulated rows on every iteration). pd.concat raises on an empty list,
# so keep the original "empty frame" result when the directory has no files.
df_test=pd.concat(test_frames) if test_frames else pd.DataFrame()

# Code as 10 broad categories.
# Broad category (Roman numeral) -> NTEE major-group letters it contains.
broad_cat_dict = {
    'I': ['A'],
    'II': ['B'],
    'III': ['C', 'D'],
    'IV': ['E', 'F', 'G', 'H'],
    'V': ['I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'],
    'VI': ['Q'],
    'VII': ['R', 'S', 'T', 'U', 'V', 'W'],
    'VIII': ['X'],
    'IX': ['Y'],
    'X': ['Z'],
}


def ntee2cat(string):
    """Return the broad category whose letter list contains `string`.

    `string` is looked up in the value lists of the module-level
    `broad_cat_dict`; the first matching key is returned. An IndexError is
    raised when no category lists the given value.
    """
    matching_categories = []
    for category, major_groups in broad_cat_dict.items():
        if string in major_groups:
            matching_categories.append(category)
    # Indexing (rather than a default) keeps the IndexError for unknown codes.
    return matching_categories[0]

# Build the model input text by joining organisation name, mission statement and
# programme description with single spaces; a row with a missing value in any
# of the three columns yields a missing combined text.
df_test['mission_prgrm_spellchk']=df_test['TAXPAYER_NAME']+' '+df_test['mission_spellchk']+' '+df_test['prgrm_dsc_spellchk'] # Using spell-checked.
# Map each NTEE major-group letter to its broad category.
df_test['broad_cat']=df_test['NTEE1'].apply(ntee2cat)
# Sanity check: number of rows, distinct major groups, distinct broad categories.
print(len(df_test['mission_prgrm_spellchk']), len(df_test['NTEE1'].drop_duplicates()), len(df_test['broad_cat'].drop_duplicates()))

text_list_test=df_test['mission_prgrm_spellchk']

# Text to sequences.
# seq_encoding_text_test=tokenizer.texts_to_sequences(spellcheck(input_string_list))
seq_encoding_text_test=tokenizer.texts_to_sequences(text_list_test)

# Pads sequences to the same length (i.e., prepare matrix).
# NOTE(review): 46612 is presumably the input length the saved models were
# trained with -- confirm before changing it.
x_test=pad_sequences(sequences=seq_encoding_text_test,
                    maxlen=46612, # Max length of the sequence.
                    dtype = "int32", padding = "post", truncating = "post", 
                    value = 0 # Zero is used for representing None or Unknown.
                     )

38607 25 9


_Load label binarizers instead of fitting._

<s>
    
```Python    
from sklearn import preprocessing
lb_major_group = preprocessing.LabelBinarizer()
lb_major_group.fit(['B', 'W', 'A', 'Y', 'L', 'K', 'P', 'N', 'F', 'I', 'E', 'D', 'M', 'S', 'X', 'T', 'C', 'J', 'G', 'U', 'H', 'O', 'Q', 'R', 'V'])

lb_broad_cat = preprocessing.LabelBinarizer()
lb_broad_cat.fit(['II', 'VII', 'I', 'IX', 'V', 'IV', 'III', 'VIII', 'VI']) # The label order complies with those in trained models.
```
    
</s>

In [None]:
# Use GPU: run both classifiers on the test matrix with ops pinned to the GPU
# at index 1 (the second GPU).
# NOTE(review): this cell has no execution count, and the GPU check in the first
# cell printed an empty list -- confirm the device exists before running.
with tf.device('/device:GPU:1'):
    y_prob_broad_cat=model_broad_cat.predict(x_test, verbose=1)
    y_prob_major_group=model_major_group.predict(x_test, verbose=1)

In [5]:
# Use CPU -- Too slow.
# The explicit CPU placement below is disabled, so both predictions run on
# whatever device TensorFlow selects by default.
# with tf.device('/device:CPU:0'):
y_prob_broad_cat=model_broad_cat.predict(x_test, verbose=1)
y_prob_major_group=model_major_group.predict(x_test, verbose=1)



In [6]:
# Turn model scores into class labels: take the index of the highest score per
# row, one-hot encode those indices, and let the fitted label binarizer map the
# one-hot rows back to the original label strings.
y_pred_broad_cat = lb_broad_cat.inverse_transform(np_utils.to_categorical(y_prob_broad_cat.argmax(axis=-1)))
y_pred_major_group = lb_major_group.inverse_transform(np_utils.to_categorical(y_prob_major_group.argmax(axis=-1)))

In [7]:
# Pair every predicted broad category with its true label, then report the
# share of rows where the two agree.
df_val_broad_cat = pd.DataFrame({'pred': y_pred_broad_cat, 'true': df_test['broad_cat']})
is_correct_broad_cat = df_val_broad_cat['pred'] == df_val_broad_cat['true']
print('Overall ACC:', int(is_correct_broad_cat.sum()) / len(df_val_broad_cat))

Overall ACC: 0.8323102028129614


In [8]:
# Pair every predicted major group with its true NTEE letter, then report the
# share of rows where the two agree.
df_val_major_group = pd.DataFrame({'pred': y_pred_major_group, 'true': df_test['NTEE1']})
is_correct_major_group = df_val_major_group['pred'] == df_val_major_group['true']
print('Overall ACC:', int(is_correct_major_group.sum()) / len(df_val_major_group))

Overall ACC: 0.7773719791747611


In [30]:
# Per-class report for the major-group model; the printed columns are precision,
# recall, specificity, F1, geometric mean, index balanced accuracy and support.
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_true=df_test['NTEE1'], y_pred=y_pred_major_group))

                   pre       rec       spe        f1       geo       iba       sup

          A       0.86      0.84      0.98      0.85      0.91      0.81      4291
          B       0.85      0.86      0.97      0.85      0.91      0.82      6419
          C       0.71      0.65      0.99      0.68      0.80      0.63       827
          D       0.86      0.89      1.00      0.87      0.94      0.87      1034
          E       0.76      0.78      0.98      0.77      0.88      0.75      2307
          F       0.48      0.60      0.99      0.53      0.77      0.57       543
          G       0.68      0.60      0.99      0.64      0.77      0.57      1353
          H       0.45      0.04      1.00      0.07      0.20      0.04       126
          I       0.62      0.72      0.99      0.66      0.85      0.70       740
          J       0.73      0.75      0.99      0.74      0.86      0.73      1132
          K       0.75      0.67      1.00      0.71      0.82      0.64       522
   

In [20]:
# Per-class report for the broad-category model; the printed columns are precision,
# recall, specificity, F1, geometric mean, index balanced accuracy and support.
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_true=df_test['broad_cat'], y_pred=y_pred_broad_cat))

                   pre       rec       spe        f1       geo       iba       sup

          I       0.85      0.87      0.98      0.86      0.93      0.85      4291
         II       0.88      0.83      0.98      0.86      0.90      0.80      6419
        III       0.88      0.79      0.99      0.83      0.89      0.77      1861
         IV       0.88      0.76      0.99      0.82      0.87      0.74      4329
         IX       0.89      0.88      1.00      0.89      0.94      0.87      1701
          V       0.86      0.85      0.94      0.85      0.89      0.79     11723
         VI       0.53      0.31      1.00      0.39      0.55      0.28       436
        VII       0.75      0.86      0.94      0.80      0.90      0.80      6749
       VIII       0.59      0.83      0.98      0.69      0.90      0.80      1098

avg / total       0.84      0.83      0.96      0.83      0.89      0.79     38607



<b><font color="green">GPU tests all passed.</font></b>

### Export for Kappa (intercoder reliability) test

In [27]:
# Two-column table (true NTEE letter, predicted letter) for the Kappa test:
# build it as two rows, flip to columns, name the columns, and write to Excel.
df_kap_major_group = (
    pd.DataFrame([df_test['NTEE1'].tolist(), y_pred_major_group.tolist()])
    .transpose()
    .rename(columns={0: 'NTEE1', 1: 'pred'})
)
df_kap_major_group.to_excel('../output/df_kap_major_group.xlsx')

In [31]:
# Two-column table (true broad category, predicted category) for the Kappa test:
# build it as two rows, flip to columns, name the columns, and write to Excel.
df_kap_broad_cat = (
    pd.DataFrame([df_test['broad_cat'].tolist(), y_pred_broad_cat.tolist()])
    .transpose()
    .rename(columns={0: 'broad_cat', 1: 'pred'})
)
df_kap_broad_cat.to_excel('../output/df_kap_broad_cat.xlsx')

### Develop API script.

In [1]:
################################### Define reproducibility ##########################
# # Force using CPU.
# # https://stackoverflow.com/questions/40690598/can-keras-with-tensorflow-backend-be-forced-to-use-cpu-or-gpu-at-will
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Obtain reproducible results: seed NumPy, core Python and TensorFlow, and run
# TensorFlow single-threaded, before any model is loaded.

import numpy as np
import tensorflow as tf
import random as rn

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(42)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

rn.seed(12345)

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(1234)

# Create the single-threaded session on the default graph and register it as
# the session Keras will use.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)


################################### Import dependencies ##########################
# Dependencies of the API script: text preparation, model loading, and a worker
# pool used for spell checking.
from spellchecker import SpellChecker
from keras.preprocessing.sequence import pad_sequences
import pickle
from keras.models import load_model
import string
import nltk
# Fetch the NLTK stopword corpus (a no-op when it is already up to date).
nltk.download('stopwords')
from nltk.corpus import stopwords
# Upper-cased English stopwords plus every ASCII punctuation character.
# NOTE(review): `stop_list` is not referenced by any code visible in this notebook.
stop_list=[str.upper(s) for s in stopwords.words('english')+list(string.punctuation)]
import warnings
# Silence all warnings for the rest of the session (this also hides deprecation notices).
warnings.simplefilter('ignore')
import numpy as np
from keras.utils import np_utils
import os
import pandas as pd
import tensorflow as tf
from multiprocessing import Pool
# Module-level worker pool with the default number of processes; `npoclass`
# uses it to spell-check tokens in parallel.
# NOTE(review): the pool is created after the TensorFlow session above and is
# never closed in the visible code -- confirm this is intended.
p=Pool()


################################### Load saved models and classes ##########################
# Load the two trained Keras classifiers (broad category and major group).
model_broad_cat=load_model('../output/broad_category_model.h5')
model_major_group=load_model('../output/major_group_model.h5')
# Load the pickled tokenizer and the label binarizers for both label sets.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only load
# artefacts produced by this project.
with open('../output/tokenizer.pkl', 'rb') as tokenizer_pkl:
    tokenizer = pickle.load(tokenizer_pkl)
with open('../output/lb_broad_cat.pkl', 'rb') as lb_broad_cat_pkl:
    lb_broad_cat = pickle.load(lb_broad_cat_pkl)
with open('../output/lb_major_group.pkl', 'rb') as lb_major_group_pkl:
    lb_major_group = pickle.load(lb_major_group_pkl)
    
# String/String list input --> a list of string token list(s) --> spellchecking (parallel) --> predict class (serial).


################################### Define functions ##########################
def _spellcheck_tokens(text, checker):
    """Tokenize one text and return its spell-corrected, upper-cased tokens."""
    word_token_list=nltk.word_tokenize(text)
    # Spell checking is the slow step, so it is spread over the module-level pool `p`.
    corrected_list=p.map(checker.correction, word_token_list)
    # Newer pyspellchecker releases return None when no correction candidate
    # exists; keep the original token in that case instead of failing on .upper().
    return [(corrected if corrected is not None else original).upper()
            for original, corrected in zip(word_token_list, corrected_list)]


def npoclass(string_input=None):
    """Predict the NTEE major group and broad category of nonprofit text(s).

    Parameters
    ----------
    string_input : str or list of str
        One text, or a list of texts. A single string is treated as one
        document (not as one document per word).

    Returns
    -------
    dict
        Keys 'major_group_label', 'major_group_prob', 'broad_category_label'
        and 'broad_category_prob'. Each value is a list with one entry per
        input text; the '*_prob' entries are the highest score the
        corresponding model produced for that text. An empty input list
        yields empty lists.

    Raises
    ------
    NameError
        If `string_input` is neither a string nor a list. (The exception type
        is kept for backward compatibility with existing callers.)
    """
    # Normalise the input to a list of documents so that a lone string yields
    # exactly one prediction.
    if isinstance(string_input, str):
        text_list=[string_input]
    elif isinstance(string_input, list):
        text_list=string_input
    else:
        raise NameError('Input must be a string or a list of strings.')

    result_dict={'major_group_label': [], 'major_group_prob': [],
                 'broad_category_label': [], 'broad_category_prob': []}
    if not text_list:
        # Nothing to classify; the one-hot decoding below cannot handle zero rows.
        return result_dict

    # Build the spell checker once per call rather than once per document.
    checker=SpellChecker()
    token_list_list=[_spellcheck_tokens(text, checker) for text in text_list]

    # Text to sequences (one token list per document).
    seq_encoding_text=tokenizer.texts_to_sequences(token_list_list)
    # Pads sequences to the same length (i.e., prepare matrix).
    x_text=pad_sequences(sequences=seq_encoding_text,
                         maxlen=46612, # Max length of the sequence.
                         dtype="int32", padding="post", truncating="post",
                         value=0 # Zero is used for representing None or Unknown.
                         )

    # Predict with both models and decode each to label strings.
    for key_prefix, model, label_binarizer in (
            ('major_group', model_major_group, lb_major_group),
            ('broad_category', model_broad_cat, lb_broad_cat)):
        y_prob=model.predict(x_text)
        # Fix the one-hot width to the model's output width so the encoding does
        # not shrink when the highest class index is absent from this batch.
        y_one_hot=np_utils.to_categorical(y_prob.argmax(axis=-1),
                                          num_classes=y_prob.shape[-1])
        result_dict[key_prefix+'_label']=label_binarizer.inverse_transform(y_one_hot).tolist()
        result_dict[key_prefix+'_prob']=[s.max() for s in y_prob]
    return result_dict

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Smoke test: classify two example texts (a repeated keyword and a real mission
# statement) and display the resulting labels and scores.
input_string_list=['environment, environment, environment, environment, environment, environment', 'Greenpeace is a global, independent campaigning organization that uses peaceful protest and creative communication to expose global environmental problems and promote solutions that are essential to a green and peaceful future.']
npoclass(input_string_list)

{'major_group_label': ['N', 'C'],
 'major_group_prob': [0.0013737775, 0.0040711625],
 'broad_category_label': ['III', 'VII'],
 'broad_category_prob': [0.012432104, 0.007314923]}

### Check consistency between function and direct prediction.

[=======> **Load UCF test files.**](#load-UCF-files)

In [4]:
# Compare broad-category labels for the first 100 test rows obtained two ways:
# directly from the pre-tokenised matrix `x_test`, and through `npoclass`, which
# re-tokenises and spell-checks the raw text. Prints the share of matching labels.
# Requires `x_test` and `text_list_test` from the "Load UCF files" cell.
y_class_direct=lb_broad_cat.inverse_transform(np_utils.to_categorical(model_broad_cat.predict(x_test[0:100], verbose=1).argmax(axis=-1))).tolist()
y_class_function=npoclass(text_list_test[0:100].tolist())
df_result_broad_cat100=pd.DataFrame({'direct':y_class_direct[0:100], 
                                     'function':y_class_function['broad_category_label']})
print(len(df_result_broad_cat100[df_result_broad_cat100.direct==df_result_broad_cat100.function])/len(df_result_broad_cat100))

0.99


In [5]:
# Compare major-group labels for the first 100 test rows obtained two ways:
# directly from the pre-tokenised matrix `x_test`, and through `npoclass`, which
# re-tokenises and spell-checks the raw text. Prints the share of matching labels.
# Requires `x_test` and `text_list_test` from the "Load UCF files" cell.
y_class_direct=lb_major_group.inverse_transform(np_utils.to_categorical(model_major_group.predict(x_test[0:100], verbose=1).argmax(axis=-1))).tolist()
y_class_function=npoclass(text_list_test[0:100].tolist())
df_result_major_group100=pd.DataFrame({'direct':y_class_direct[0:100], 
                                       'function':y_class_function['major_group_label']})
print(len(df_result_major_group100[df_result_major_group100.direct==df_result_major_group100.function])/len(df_result_major_group100))

0.96
