Draft codes saved in `NN_broad_cat.ipynb`.

In [1]:
# obtain reproducible results

import numpy as np
import tensorflow as tf
import random as rn

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(42)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

rn.seed(12345)

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(1234)

sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Rest of code follows ...

# Check GPU device.
print(K.tensorflow_backend._get_available_gpus())

Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1']


In [2]:
#https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/
#https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
#RNN

import os
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool
from spellchecker import SpellChecker
import string
import math

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from sklearn.model_selection import train_test_split
from nltk import word_tokenize

# For encoding labels.
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

## Read and compile tranining and validation dataset.

In [3]:
train_file_path='../../dataset/UCF/train/'
file_list=os.listdir(train_file_path)
df_train=pd.DataFrame()
for file in file_list:
    df_train=pd.concat([df_train, pd.read_pickle(train_file_path+file, compression='gzip')])

test_file_path='../../dataset/UCF/test/'
file_list=os.listdir(test_file_path)
df_test=pd.DataFrame()
for file in file_list:
    df_test=pd.concat([df_test, pd.read_pickle(test_file_path+file, compression='gzip')])
    
len(df_train), len(df_test)

(154424, 38607)

In [4]:
df_train['mission_prgrm_spellchk']=df_train['TAXPAYER_NAME']+' '+df_train['mission_spellchk']+' '+df_train['prgrm_dsc_spellchk'] # Using spell-checked.
print(len(df_train['mission_prgrm_spellchk']), len(df_train['NTEE1'].drop_duplicates()))

df_test['mission_prgrm_spellchk']=df_test['TAXPAYER_NAME']+' '+df_test['mission_spellchk']+' '+df_test['prgrm_dsc_spellchk'] # Using spell-checked.
print(len(df_test['mission_prgrm_spellchk']), len(df_test['NTEE1'].drop_duplicates()))

154424 25
38607 25


In [5]:
# # Build training and testing data frame.
# small_num=0
# while small_num<500: # Make sure each category has at least 500 records.
#     sampleDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(120000)
#     trainDF, valDF =train_test_split(sampleDF, test_size=.3)
#     small_num=trainDF.groupby('NTEE_M').count().sort_values('EIN').iloc[0]['EIN']

# See the composition by NTEE major groups.
print(df_train.groupby('NTEE1').count()['EIN']/len(df_train), '\n'*2, df_test.groupby('NTEE1').count()['EIN']/len(df_test))

NTEE1
A    0.110151
B    0.167247
C    0.021519
D    0.027450
E    0.058378
F    0.014901
G    0.032722
H    0.003024
I    0.019084
J    0.030902
K    0.013010
L    0.038478
M    0.030390
N    0.100114
O    0.011209
P    0.059447
Q    0.012867
R    0.006890
S    0.093632
T    0.013159
U    0.006476
V    0.002266
W    0.054117
X    0.029568
Y    0.042998
Name: EIN, dtype: float64 

 NTEE1
A    0.111146
B    0.166265
C    0.021421
D    0.026783
E    0.059756
F    0.014065
G    0.035045
H    0.003264
I    0.019168
J    0.029321
K    0.013521
L    0.039811
M    0.029528
N    0.101666
O    0.010594
P    0.060041
Q    0.011293
R    0.006657
S    0.093325
T    0.014013
U    0.005828
V    0.002202
W    0.052788
X    0.028440
Y    0.044059
Name: EIN, dtype: float64


### Prepare labels.
*One hot encoding.* Prepare after resampling; otherwise, shape of `y_train` will shrink from 25 to 3.

In [6]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
lb.fit(list(df_train.NTEE1.unique()))

y_train=lb.transform(df_train['NTEE1'])
# y_test=lb.transform(df_test['NTEE1']) # No need to transform Y for testing.

In [7]:
import pickle
# Save LabelBinarizer class for developing package.
with open('../../output/lb_major_group.pkl', 'wb') as output:
    pickle.dump(lb, output, pickle.HIGHEST_PROTOCOL)

### Prepare input text.

In [8]:
text_token_list_train=df_train['mission_prgrm_spellchk']
text_token_list_test=df_test['mission_prgrm_spellchk']

In [9]:
# Build word index for train and validation texts.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(text_token_list_train.to_list()+text_token_list_test.to_list())
print(list(tokenizer.word_index.items())[0:5])

[('and', 1), ('the', 2), ('to', 3), ('of', 4), ('in', 5)]


In [10]:
# Text to sequences.
seq_encoding_text_train=tokenizer.texts_to_sequences(text_token_list_train)
seq_encoding_text_test=tokenizer.texts_to_sequences(text_token_list_test)

# Pads sequences to the same length (i.e., prepare matrix).
x_train=pad_sequences(sequences=seq_encoding_text_train,
                      maxlen=max([len(s) for s in seq_encoding_text_train]), # Max length of the sequence.
                      dtype = "int32", padding = "post", truncating = "post", 
                      value = 0 # Zero is used for representing None or Unknown.
                     )

x_test=pad_sequences(sequences=seq_encoding_text_test,
                    maxlen=max([len(s) for s in seq_encoding_text_train]), # Max length of the sequence.
                    dtype = "int32", padding = "post", truncating = "post", 
                    value = 0 # Zero is used for representing None or Unknown.
                     )

### Resample.

In [11]:
# Convert to Compressed Sparse Row matrix; otherwise, matrix too large, result memory error.
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
from scipy.sparse import csr_matrix
x_train=csr_matrix(x_train)

# Define resample strategy.
def func_resample(method, sampling_strategy, x_train_vect, y_train):
    if method=='ADASYN':
        from imblearn.over_sampling import ADASYN
        resample = ADASYN(sampling_strategy=sampling_strategy, random_state=42)
    elif method=='RandomOverSampler':
        from imblearn.over_sampling import RandomOverSampler
        resample = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
    elif method=='SMOTE':
        from imblearn.over_sampling import SMOTE
        resample = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
    elif method=='SMOTEENN':
        from imblearn.combine import SMOTEENN
        resample = SMOTEENN(sampling_strategy=sampling_strategy, random_state=42)
    elif method=='SMOTETomek':
        from imblearn.combine import SMOTETomek
        resample = SMOTETomek(sampling_strategy=sampling_strategy, random_state=42)
    x_train_vect_res, y_train_res = resample.fit_resample(x_train_vect, y_train)
    return [x_train_vect_res, y_train_res]

x_train_res, y_train_res = func_resample(method='ADASYN', sampling_strategy='minority', 
                                         x_train_vect=x_train, y_train=y_train)

# x_train_res, y_train_res = [x_train, y_train]

### Prepare embedding layer.
Use pre-trained GloVe.

In [12]:
import gensim.downloader as api
EMBEDDING_DIM=100
glove_word_vector=api.load('glove-wiki-gigaword-'+str(EMBEDDING_DIM))

In [13]:
embedding_matrix = np.zeros((len(tokenizer.word_index)+1, EMBEDDING_DIM))

In [14]:
for word, index in tqdm(tokenizer.word_index.items()):
    try:
        embedding_matrix[index] = glove_word_vector.get_vector(word)
    except:
        pass
        # words not found in embedding index will be all-zeros.

100%|██████████| 140908/140908 [00:00<00:00, 393936.68it/s]


In [15]:
# https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
embedding_layer = Embedding(input_dim=len(tokenizer.word_index)+1, # Size of vocabulary.
                            input_length=max([len(s) for s in seq_encoding_text_train]), # Length of input, i.e., length of padded sequence.
                            output_dim=EMBEDDING_DIM, # Size of the vector space in which words will be embedded.
                            weights=[embedding_matrix],
                            trainable=False
                           )

### Stochastic tuning of training params.

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, GlobalMaxPool1D, Conv1D
from keras.layers.advanced_activations import LeakyReLU, PReLU

with tf.device('/gpu:1'): # Specify which GPU to use.
    # define the model
    model = Sequential()
    model.add(embedding_layer)
    # model.add(Flatten())
    model.add(Conv1D(128, 5, activation='softplus'))
    model.add(GlobalMaxPool1D())
    model.add(Dense(units=32, activation='sigmoid'))
    model.add(Dense(units=16, activation='softplus'))
    # model.add(PReLU()) # https://medium.com/tinymind/a-practical-guide-to-relu-b83ca804f1f7
    model.add(Dense(units=16, activation='tanh'))
    model.add(Dense(units=16, activation='softplus'))
    model.add(Dense(units=len(y_train[0]), activation='softmax'))
    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc', 
    #                                                                      precision, recall
                                                                        ])
    # summarize the model
    print(model.summary())

    # fit the model
    history=model.fit(x_train, y_train, validation_split=0.3, epochs=20, verbose=1)

### Try Grid Search.

In [17]:
# Continue previous work.
df_history=pd.read_csv('../../output/grid_search_history_major_group.tsv', sep='\t', index_col=0)

In [18]:
param_list_done=set(map(tuple, 
                        df_history[['conv_num_filters', 'conv_kernel_size', 'conv_act', 'out_act']].values.tolist()
                       )
                   )

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, GlobalMaxPool1D, Conv1D
from datetime import datetime

# df_history=pd.DataFrame()
for num_filters in [32, 64, 128]:
    for kernel_size in [3,5,7]:
        for conv_act in ['sigmoid', 'softplus', 'tanh', 'softmax']:
            for out_act in ['sigmoid', 'softplus', 'tanh', 'softmax']:
                param=tuple((num_filters, kernel_size, conv_act, out_act))
                if param not in param_list_done:
                    t1=datetime.now()
                    # Run NN on a specified GPU.
                    with tf.device('/device:GPU:1'):
                        # define the model
                        model = Sequential()
                        model.add(embedding_layer)
                        # model.add(Flatten())
                        model.add(Conv1D(num_filters, kernel_size, activation=conv_act))
                        model.add(GlobalMaxPool1D())
                        model.add(Dense(units=32, activation='sigmoid'))
                        model.add(Dense(units=32, activation='softplus'))
                        model.add(Dense(units=16, activation='tanh'))
                        model.add(Dense(units=16, activation='softplus'))
                        model.add(Dense(units=len(y_train[0]), activation=out_act))
                        # compile the model
                        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
                        # F1, precision, and recall removed. https://github.com/keras-team/keras/issues/5794
                        # fit the model
                        history=model.fit(x_train, y_train, validation_split=0.2, epochs=50, verbose=0)
                        y_prob = model.predict(x_val, verbose=0)
                    # Save history.
                    acc = history.history['acc']
                    val_acc = history.history['val_acc']
                    loss = history.history['loss']
                    val_loss = history.history['val_loss']
                    epochs = range(1, len(acc) + 1)
                    # Calculate on validation dataset.
                    y_classes = y_prob.argmax(axis=-1)
                    y_classes_prob=[s.max() for s in y_prob]
                    y_classes_val=y_val.argmax(axis=-1)
                    df_val=pd.DataFrame({'pred':y_classes, 
                                         'true':y_classes_val, 
                                         'prob':y_classes_prob})
                    val_acc_real=len(df_val[df_val.pred==df_val.true])/len(df_val)
                    # Save history to datafame.
                    df_history_temp=pd.DataFrame()
                    df_history_temp['acc']=acc
                    df_history_temp['val_acc']=val_acc
                    df_history_temp['val_acc_real']=[math.nan]*(len(epochs)-1)+[val_acc_real]
                    df_history_temp['loss']=loss
                    df_history_temp['val_loss']=val_loss
                    df_history_temp['epochs']=epochs
                    df_history_temp['conv_num_filters']=[num_filters]*len(epochs)
                    df_history_temp['conv_kernel_size']=[kernel_size]*len(epochs)
                    df_history_temp['conv_act']=[conv_act]*len(epochs)
                    df_history_temp['out_act']=[out_act]*len(epochs)
                    df_history_temp['time_stamp']=[str(t1)]+[math.nan]*(len(epochs)-2)+[str(datetime.now())]
                    df_history=df_history.append(df_history_temp, ignore_index=True)
                    df_history.to_csv('../../output/grid_search_history_major_group.tsv', sep='\t')

### Decision making: Optimizing.

In [17]:
# Continue previous work.
try:
    df_history=pd.read_csv('../../output/grid_search_history_major_group_optimizing.tsv', sep='\t', index_col=0)
except:
    df_history=pd.DataFrame(columns=pd.read_csv('../../output/grid_search_history_major_group.tsv', sep='\t', index_col=0).columns)

In [None]:
param_list_done=set(map(tuple, 
                        df_history[['conv_num_filters', 'conv_kernel_size', 'conv_act', 'out_act']].values.tolist()
                       )
                   )

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, GlobalMaxPool1D, Conv1D
from datetime import datetime

# df_history=pd.DataFrame()
for num_filters in [128, 256, 512, 1024]:
    for kernel_size in [3]:
        for conv_act in ['softplus']:
            for out_act in ['sigmoid', 'softplus', 'softmax']:
                param=tuple((num_filters, kernel_size, conv_act, out_act))
                if param not in param_list_done:
                    t1=datetime.now()
                    # Run NN on a specified GPU.
                    with tf.device('/device:GPU:1'):
                        # define the model
                        model = Sequential()
                        model.add(embedding_layer)
                        # model.add(Flatten())
                        model.add(Conv1D(num_filters, kernel_size, activation=conv_act))
                        model.add(GlobalMaxPool1D())
                        model.add(Dense(units=32, activation='sigmoid'))
                        model.add(Dense(units=32, activation='softplus'))
                        model.add(Dense(units=16, activation='tanh'))
                        model.add(Dense(units=16, activation='softplus'))
                        model.add(Dense(units=len(y_train[0]), activation=out_act))
                        # compile the model
                        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
                        # F1, precision, and recall removed. https://github.com/keras-team/keras/issues/5794
                        # fit the model
                        history=model.fit(x_train, y_train, validation_split=0.2, epochs=50, verbose=0)
                        y_prob = model.predict(x_val, verbose=0)
                    # Save history.
                    acc = history.history['acc']
                    val_acc = history.history['val_acc']
                    loss = history.history['loss']
                    val_loss = history.history['val_loss']
                    epochs = range(1, len(acc) + 1)
                    # Calculate on validation dataset.
                    y_classes = y_prob.argmax(axis=-1)
                    y_classes_prob=[s.max() for s in y_prob]
                    y_classes_val=y_val.argmax(axis=-1)
                    df_val=pd.DataFrame({'pred':y_classes, 
                                         'true':y_classes_val, 
                                         'prob':y_classes_prob})
                    val_acc_real=len(df_val[df_val.pred==df_val.true])/len(df_val)
                    # Save history to datafame.
                    df_history_temp=pd.DataFrame()
                    df_history_temp['acc']=acc
                    df_history_temp['val_acc']=val_acc
                    df_history_temp['val_acc_real']=[math.nan]*(len(epochs)-1)+[val_acc_real]
                    df_history_temp['loss']=loss
                    df_history_temp['val_loss']=val_loss
                    df_history_temp['epochs']=epochs
                    df_history_temp['conv_num_filters']=[num_filters]*len(epochs)
                    df_history_temp['conv_kernel_size']=[kernel_size]*len(epochs)
                    df_history_temp['conv_act']=[conv_act]*len(epochs)
                    df_history_temp['out_act']=[out_act]*len(epochs)
                    df_history_temp['time_stamp']=[str(t1)]+[math.nan]*(len(epochs)-2)+[str(datetime.now())]
                    df_history=df_history.append(df_history_temp, ignore_index=True)
                    df_history.to_csv('../../output/grid_search_history_major_group_optimizing.tsv', sep='\t')

### Train model finalist

**Best configuration**

_Broad Category_

|acc | val_acc | val_acc_real | loss | val_loss | epochs | conv_num_filters | conv_kernel_size | conv_act | out_act|
|--|--|--|--|--|--|--|--|--|--|
|0.820386905 | 0.776488095 | -- | 0.613613255 | 0.738004138 | 4 | 512 | 3 | softplus | softplus|

_Major Group_

|acc | val_acc | val_acc_real | loss | val_loss | epochs | conv_num_filters | conv_kernel_size | conv_act | out_act|
|--|--|--|--|--|--|--|--|--|--|
|0.764369048 | 0.710428571 | -- | 0.888776311 | 1.120441045 | 6 | 256 | 3 | softplus | softmax|
|0.779 | 0.71 | -- | 0.834419103 | 1.140895644 | 7 | 256 | 3 | softplus | softplus|

Use `softplus`+`softplus`.

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, GlobalMaxPool1D, Conv1D
from datetime import datetime

with tf.device('/device:GPU:1'):
    # define the model
    model = Sequential()
    model.add(embedding_layer)
    # model.add(Flatten())
    model.add(Conv1D(512, 3, activation='softplus'))
    model.add(GlobalMaxPool1D())
    model.add(Dense(units=32, activation='sigmoid'))
    model.add(Dense(units=32, activation='softplus'))
    model.add(Dense(units=16, activation='tanh'))
    model.add(Dense(units=16, activation='softplus'))
    model.add(Dense(units=len(y_train[0]), activation='softplus'))
    # compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    # F1, precision, and recall removed. https://github.com/keras-team/keras/issues/5794
    # fit the model
    history=model.fit(x_train_res, y_train_res, validation_split=0.2, epochs=7, verbose=1)
    print('Test on UCF-Testing')
    y_prob = model.predict(x_test, verbose=1)

Train on 143931 samples, validate on 35983 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Test on UCF-Testing


In [17]:
# Calculate on validation dataset.
# From probability --> serial coding, e.g., [4, 3, 2, 55] --> categorical, e.g., [[00010...], [001000..]...]
y_pred = lb.inverse_transform(np_utils.to_categorical(y_prob.argmax(axis=-1)))
y_pred_prob=[s.max() for s in y_prob]
df_val=pd.DataFrame({'pred':y_pred, 
                     'true':df_test['NTEE1'], 
                     'prob':y_pred_prob})
print('Overall ACC:', len(df_val[df_val.pred==df_val.true])/len(df_val))

Overall ACC: 0.7773719791747611


In [19]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_true=df_test['NTEE1'], y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          A       0.86      0.84      0.98      0.85      0.91      0.81      4291
          B       0.85      0.86      0.97      0.85      0.91      0.82      6419
          C       0.71      0.65      0.99      0.68      0.80      0.63       827
          D       0.86      0.89      1.00      0.87      0.94      0.87      1034
          E       0.76      0.78      0.98      0.77      0.88      0.75      2307
          F       0.48      0.60      0.99      0.53      0.77      0.57       543
          G       0.68      0.60      0.99      0.64      0.77      0.57      1353
          H       0.45      0.04      1.00      0.07      0.20      0.04       126
          I       0.62      0.72      0.99      0.66      0.85      0.70       740
          J       0.73      0.75      0.99      0.74      0.86      0.73      1132
          K       0.75      0.67      1.00      0.71      0.82      0.64       522
   

```Python
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_true=df_test['broad_cat'], y_pred=y_train))
```
**Chosen:** *epochs=7, resampling: method='ADASYN', sampling_strategy='minority'.*

    ***Overall ACC: 0.7821897583339809 # This is chosen.***
                       pre       rec       spe        f1       geo       iba       sup

              A       0.80      0.87      0.97      0.83      0.92      0.83      4291
              B       0.85      0.85      0.97      0.85      0.91      0.82      6419
              C       0.65      0.74      0.99      0.69      0.86      0.72       827
              D       0.80      0.90      0.99      0.85      0.94      0.88      1034
              E       0.77      0.78      0.98      0.78      0.88      0.76      2307
              F       0.51      0.60      0.99      0.55      0.77      0.57       543
              G       0.68      0.68      0.99      0.68      0.82      0.65      1353
              H       0.55      0.19      1.00      0.28      0.44      0.17       126
              I       0.71      0.71      0.99      0.71      0.84      0.68       740
              J       0.86      0.67      1.00      0.75      0.82      0.65      1132
              K       0.63      0.68      0.99      0.66      0.82      0.66       522
              L       0.70      0.76      0.99      0.73      0.87      0.73      1537
              M       0.87      0.90      1.00      0.88      0.95      0.89      1140
              N       0.83      0.93      0.98      0.88      0.95      0.90      3925
              O       0.65      0.61      1.00      0.63      0.78      0.59       409
              P       0.64      0.57      0.98      0.60      0.75      0.53      2318
              Q       0.43      0.36      0.99      0.39      0.60      0.33       436
              R       0.46      0.21      1.00      0.28      0.45      0.19       257
              S       0.84      0.79      0.98      0.81      0.88      0.76      3603
              T       0.66      0.32      1.00      0.43      0.56      0.30       541
              U       0.52      0.22      1.00      0.31      0.47      0.20       225
              V       0.00      0.00      1.00      0.00      0.00      0.00        85
              W       0.87      0.86      0.99      0.86      0.92      0.84      2038
              X       0.68      0.71      0.99      0.70      0.84      0.69      1098
              Y       0.84      0.91      0.99      0.88      0.95      0.90      1701

    avg / total       0.78      0.78      0.98      0.78      0.87      0.76     38607

*epochs=4, resampling: method='ADASYN', sampling_strategy='not majority'.*
    
    Overall ACC: 0.7046390550936359
                       pre       rec       spe        f1       geo       iba       sup

              A       0.79      0.83      0.97      0.81      0.90      0.80      4291
              B       0.86      0.81      0.97      0.83      0.89      0.77      6419
              C       0.69      0.52      0.99      0.59      0.72      0.49       827
              D       0.77      0.86      0.99      0.81      0.92      0.84      1034
              E       0.79      0.64      0.99      0.71      0.80      0.62      2307
              F       0.00      0.00      1.00      0.00      0.00      0.00       543
              G       0.36      0.74      0.95      0.49      0.84      0.69      1353
              H       0.02      0.02      1.00      0.02      0.15      0.02       126
              I       0.24      0.31      0.98      0.27      0.55      0.29       740
              J       0.49      0.71      0.98      0.58      0.83      0.67      1132
              K       0.49      0.57      0.99      0.53      0.75      0.54       522
              L       0.67      0.72      0.99      0.69      0.84      0.69      1537
              M       0.85      0.89      1.00      0.87      0.94      0.88      1140
              N       0.87      0.84      0.99      0.86      0.91      0.81      3925
              O       0.41      0.47      0.99      0.44      0.69      0.45       409
              P       0.44      0.65      0.95      0.52      0.78      0.60      2318
              Q       0.08      0.00      1.00      0.00      0.05      0.00       436
              R       0.03      0.00      1.00      0.01      0.06      0.00       257
              S       0.83      0.71      0.99      0.77      0.84      0.68      3603
              T       0.50      0.19      1.00      0.27      0.43      0.17       541
              U       0.00      0.00      1.00      0.00      0.00      0.00       225
              V       0.00      0.00      1.00      0.00      0.00      0.00        85
              W       0.90      0.75      1.00      0.82      0.86      0.73      2038
              X       0.60      0.62      0.99      0.61      0.79      0.59      1098
              Y       0.89      0.79      1.00      0.83      0.89      0.77      1701

    avg / total       0.71      0.70      0.98      0.70      0.81      0.68     38607


*epochs=7, no resampling*

    Overall ACC: 0.7776309995596653
                       pre       rec       spe        f1       geo       iba       sup

              A       0.85      0.85      0.98      0.85      0.91      0.82      4291
              B       0.85      0.86      0.97      0.85      0.91      0.82      6419
              C       0.57      0.76      0.99      0.65      0.86      0.73       827
              D       0.78      0.90      0.99      0.83      0.94      0.88      1034
              E       0.77      0.76      0.99      0.76      0.87      0.73      2307
              F       0.70      0.37      1.00      0.48      0.61      0.35       543
              G       0.56      0.73      0.98      0.64      0.85      0.70      1353
              H       0.00      0.00      1.00      0.00      0.00      0.00       126
              I       0.78      0.63      1.00      0.70      0.79      0.61       740
              J       0.78      0.73      0.99      0.75      0.85      0.71      1132
              K       0.67      0.69      1.00      0.68      0.83      0.66       522
              L       0.78      0.71      0.99      0.75      0.84      0.69      1537
              M       0.86      0.89      1.00      0.87      0.94      0.88      1140
              N       0.89      0.90      0.99      0.89      0.94      0.88      3925
              O       0.75      0.56      1.00      0.65      0.75      0.54       409
              P       0.56      0.69      0.96      0.61      0.81      0.65      2318
              Q       0.37      0.39      0.99      0.38      0.63      0.37       436
              R       0.60      0.30      1.00      0.40      0.55      0.28       257
              S       0.78      0.81      0.98      0.79      0.89      0.78      3603
              T       0.55      0.43      0.99      0.48      0.65      0.40       541
              U       0.35      0.19      1.00      0.24      0.43      0.17       225
              V       0.00      0.00      1.00      0.00      0.00      0.00        85
              W       0.89      0.82      0.99      0.85      0.90      0.81      2038
              X       0.77      0.66      0.99      0.71      0.81      0.63      1098
              Y       0.91      0.83      1.00      0.87      0.91      0.82      1701

    avg / total       0.78      0.78      0.98      0.77      0.87      0.75     38607

*epochs=4, resampling: method='ADASYN', sampling_strategy='minority'.*

    Overall ACC: 0.7663895148548191
                       pre       rec       spe        f1       geo       iba       sup

              A       0.86      0.83      0.98      0.84      0.90      0.80      4291
              B       0.86      0.85      0.97      0.86      0.91      0.82      6419
              C       0.56      0.81      0.99      0.66      0.89      0.79       827
              D       0.86      0.88      1.00      0.87      0.94      0.87      1034
              E       0.84      0.71      0.99      0.77      0.84      0.68      2307
              F       0.45      0.62      0.99      0.52      0.78      0.59       543
              G       0.60      0.64      0.98      0.62      0.79      0.61      1353
              H       0.00      0.00      1.00      0.00      0.00      0.00       126
              I       0.60      0.73      0.99      0.66      0.85      0.70       740
              J       0.79      0.71      0.99      0.75      0.84      0.69      1132
              K       0.66      0.75      0.99      0.70      0.87      0.73       522
              L       0.74      0.76      0.99      0.75      0.86      0.73      1537
              M       0.93      0.85      1.00      0.89      0.92      0.83      1140
              N       0.93      0.86      0.99      0.90      0.92      0.84      3925
              O       0.51      0.59      0.99      0.55      0.77      0.57       409
              P       0.49      0.72      0.95      0.58      0.83      0.67      2318
              Q       0.27      0.18      0.99      0.22      0.42      0.17       436
              R       0.34      0.21      1.00      0.26      0.46      0.19       257
              S       0.88      0.75      0.99      0.81      0.86      0.72      3603
              T       0.41      0.06      1.00      0.10      0.24      0.05       541
              U       0.35      0.03      1.00      0.05      0.16      0.02       225
              V       0.00      0.00      1.00      0.00      0.00      0.00        85
              W       0.85      0.84      0.99      0.85      0.91      0.82      2038
              X       0.52      0.83      0.98      0.64      0.90      0.80      1098
              Y       0.89      0.87      1.00      0.88      0.93      0.86      1701

    avg / total       0.77      0.77      0.98      0.76      0.86      0.74     38607

    ???    
                       pre       rec       spe        f1       geo       iba       sup

              A       0.91      0.79      0.99      0.85      0.89      0.77      4291
              B       0.82      0.89      0.96      0.86      0.93      0.85      6419
              C       0.63      0.72      0.99      0.67      0.84      0.69       827
              D       0.77      0.91      0.99      0.83      0.95      0.90      1034
              E       0.71      0.83      0.98      0.77      0.90      0.80      2307
              F       0.51      0.62      0.99      0.56      0.78      0.59       543
              G       0.66      0.69      0.99      0.68      0.82      0.66      1353
              H       0.38      0.06      1.00      0.11      0.25      0.06       126
              I       0.68      0.73      0.99      0.70      0.85      0.70       740
              J       0.81      0.72      0.99      0.76      0.84      0.69      1132
              K       0.73      0.62      1.00      0.67      0.79      0.60       522
              L       0.70      0.76      0.99      0.73      0.87      0.73      1537
              M       0.87      0.89      1.00      0.88      0.94      0.88      1140
              N       0.89      0.88      0.99      0.89      0.93      0.86      3925
              O       0.62      0.64      1.00      0.63      0.80      0.61       409
              P       0.59      0.62      0.97      0.61      0.78      0.58      2318
              Q       0.43      0.26      1.00      0.33      0.51      0.24       436
              R       0.49      0.29      1.00      0.36      0.54      0.27       257
              S       0.83      0.79      0.98      0.81      0.88      0.76      3603
              T       0.54      0.44      0.99      0.49      0.66      0.42       541
              U       0.34      0.12      1.00      0.18      0.35      0.11       225
              V       0.00      0.00      1.00      0.00      0.00      0.00        85
              W       0.89      0.83      0.99      0.86      0.91      0.81      2038
              X       0.68      0.71      0.99      0.70      0.84      0.69      1098
              Y       0.84      0.89      0.99      0.86      0.94      0.87      1701

    avg / total       0.78      0.78      0.98      0.78      0.87      0.75     38607
    
*epochs=4, no resampling*

    Overall ACC: 0.7731499469008211
                       pre       rec       spe        f1       geo       iba       sup

              A       0.84      0.85      0.98      0.85      0.91      0.83      4291
              B       0.87      0.85      0.97      0.86      0.91      0.82      6419
              C       0.60      0.77      0.99      0.67      0.87      0.75       827
              D       0.89      0.89      1.00      0.89      0.94      0.88      1034
              E       0.77      0.79      0.99      0.78      0.88      0.76      2307
              F       0.50      0.35      1.00      0.41      0.59      0.32       543
              G       0.66      0.66      0.99      0.66      0.81      0.63      1353
              H       0.00      0.00      1.00      0.00      0.00      0.00       126
              I       0.59      0.69      0.99      0.64      0.83      0.66       740
              J       0.78      0.71      0.99      0.74      0.84      0.68      1132
              K       0.47      0.78      0.99      0.59      0.88      0.76       522
              L       0.81      0.70      0.99      0.75      0.83      0.67      1537
              M       0.92      0.86      1.00      0.89      0.93      0.85      1140
              N       0.90      0.88      0.99      0.89      0.94      0.87      3925
              O       0.57      0.47      1.00      0.52      0.69      0.45       409
              P       0.53      0.67      0.96      0.59      0.80      0.62      2318
              Q       0.40      0.16      1.00      0.23      0.40      0.15       436
              R       0.33      0.10      1.00      0.16      0.32      0.09       257
              S       0.75      0.83      0.97      0.79      0.90      0.79      3603
              T       0.51      0.37      0.99      0.43      0.61      0.35       541
              U       0.83      0.02      1.00      0.04      0.15      0.02       225
              V       0.00      0.00      1.00      0.00      0.00      0.00        85
              W       0.90      0.80      1.00      0.85      0.89      0.78      2038
              X       0.62      0.78      0.99      0.69      0.87      0.75      1098
              Y       0.88      0.89      0.99      0.89      0.94      0.88      1701

    avg / total       0.77      0.77      0.98      0.77      0.86      0.75     38607

### Save model for developing package.

In [20]:
model.save('../../output/major_group_model.h5')

In [21]:
# Check the saved models.
from keras.models import load_model
with tf.device('/device:GPU:1'): # Specify the GPU# to run if OOM errors.
    model_major_group=load_model('../../output/major_group_model.h5')
print(model_major_group.summary(), model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 46612, 100)        14090900  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 46610, 512)        154112    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                16416     
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_4 (Dense)              (None, 16)                272       
__________