In [2]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
from multiprocessing import Pool

from imblearn.metrics import geometric_mean_score as gmean
from imblearn.metrics import make_index_balanced_accuracy as iba
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

### Read in and prepare data.

In [2]:
train_file_path='../../dataset/df_ntee_universal/train/'
# Sort the listing: os.listdir returns entries in arbitrary order, which would make the row order differ between machines/runs.
file_list=sorted(os.listdir(train_file_path))
# Read every chunk first and concatenate once. Growing the frame inside the loop
# re-copies all previously loaded rows on each iteration.
train_frames=[pd.read_pickle(train_file_path+file, compression='gzip') for file in file_list]
# pd.concat raises on an empty list, so keep the empty-frame result for an empty directory.
df_train=pd.concat(train_frames) if train_frames else pd.DataFrame()
len(df_train)

154424

In [3]:
# Two text features per record: name + mission + program description,
# once from the original columns and once from the `*_spellchk` columns.
taxpayer_name=df_train['TAXPAYER_NAME']
df_train['mission_prgrm']=taxpayer_name+' '+df_train['mission']+' '+df_train['prgrm_dsc']
df_train['mission_prgrm_spellchk']=taxpayer_name+' '+df_train['mission_spellchk']+' '+df_train['prgrm_dsc_spellchk'] # Using spell-checked.
# Show the number of rows and the number of distinct NTEE1 labels.
len(df_train['mission_prgrm_spellchk']), len(df_train['NTEE1'].drop_duplicates())

(154424, 25)

In [4]:
# # Check if the sampling criteria can be satisfied.
# small_num=0
# while small_num<500: # Make sure each category in training dataset has at least 500 records.
#     trainDF, valDF = model_selection.train_test_split(df_train, test_size=.2)
#     small_num=trainDF.groupby('broad_cat').count().sort_values('EIN').iloc[0]['EIN']

# Hold out 20% of the rows for evaluating the parameter grid.
# The fixed seed keeps the split -- and therefore every score computed from it --
# identical across kernel restarts (same seed value as the resamplers use).
trainDF, testDF = model_selection.train_test_split(df_train, test_size=.2, random_state=42)
# See the composition by broad category.
print(trainDF.groupby('NTEE1').count()['EIN'], '\n'*2, testDF.groupby('NTEE1').count()['EIN'])

NTEE1
A    13692
B    20649
C     2647
D     3361
E     7175
F     1868
G     4068
H      387
I     2355
J     3780
K     1604
L     4765
M     3726
N    12318
O     1372
P     7312
Q     1606
R      851
S    11573
T     1631
U      785
V      274
W     6684
X     3685
Y     5371
Name: EIN, dtype: int64 

 NTEE1
A    3318
B    5178
C     676
D     878
E    1840
F     433
G     985
H      80
I     592
J     992
K     405
L    1177
M     967
N    3142
O     359
P    1868
Q     381
R     213
S    2886
T     401
U     215
V      76
W    1673
X     881
Y    1269
Name: EIN, dtype: int64


### Prepare classifier function

In [9]:
# Redefine vectorizer.
def text_vectorizer(tokenizer_type=None, vectorizer_type=None):
    """Build an unfitted, word-level text vectorizer that drops English stop words.

    Parameters
    ----------
    tokenizer_type : {'lemma', 'porter', None}
        'lemma'  -- NLTK word tokenization, POS tagging, then WordNet lemmatization.
        'porter' -- NLTK word tokenization, then Porter stemming.
        None     -- no custom tokenizer; the vectorizer's own tokenization is used.
    vectorizer_type : {'count', 'tfidf'}
        'count' -- CountVectorizer (token counts).
        'tfidf' -- TfidfVectorizer (TF-IDF weights).

    Returns
    -------
    CountVectorizer or TfidfVectorizer
        A new, unfitted vectorizer.

    Raises
    ------
    ValueError
        If either argument is not one of the supported values. Previously an
        unsupported tokenizer_type failed with UnboundLocalError and an
        unsupported vectorizer_type silently returned None.
    """
    # Validate first so that a typo fails here with a clear message instead of
    # surfacing later as an unbound local or as `None.fit(...)` in the caller.
    if tokenizer_type not in (None, 'lemma', 'porter'):
        raise ValueError("tokenizer_type must be 'lemma', 'porter' or None, got %r" % (tokenizer_type,))
    if vectorizer_type not in ('count', 'tfidf'):
        raise ValueError("vectorizer_type must be 'count' or 'tfidf', got %r" % (vectorizer_type,))
    ########################################################
    ######### Define and choose tokenizers #################
    # Create the stemmer/lemmatizer once per vectorizer instead of once per token.
    stemmer=PorterStemmer()
    lemmatizer=WordNetLemmatizer()
    def porter_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
        return [stemmer.stem(token) for token in word_tokenize(str_input)]
    # Lemmatize using POS tags, assume improving accuracy.
    # Ref: 
    #   - https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    #   - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    def get_wordnet_pos(treebank_tag):
        # Map the first letter of a Penn Treebank tag to a WordNet POS constant;
        # anything that is not adjective/verb/adverb is treated as a noun.
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    def lemma_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
        tokens=word_tokenize(str_input)
        return [lemmatizer.lemmatize(word=word, pos=get_wordnet_pos(pos)) for word, pos in nltk.pos_tag(tokens)]
    # Choose tokenizer using parameter passed.
    tokenizer=None # None makes the vectorizer fall back to its own tokenization.
    if tokenizer_type=='lemma':
        tokenizer=lemma_tokenizer
    elif tokenizer_type=='porter':
        tokenizer=porter_tokenizer
    ########################################################
    ######### Define and choose vectorizer #################
    # 1. Use word level, character level does not make sense for current situation.
    # 2. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    # Both vectorizers take the same arguments, so only the class differs.
    vectorizer_class = CountVectorizer if vectorizer_type=='count' else TfidfVectorizer
    vectorizer = vectorizer_class(stop_words='english', 
                                  tokenizer=tokenizer, 
                                  analyzer='word'
                                 )
    return vectorizer

# Define resample strategy.
def func_resample(method, sampling_strategy, x_train_vect, y_train, categorical_features):
    """Resample a vectorized training set with an imbalanced-learn sampler.

    Parameters
    ----------
    method : {'ADASYN', 'RandomOverSampler', 'SMOTE', 'SMOTEENN', 'SMOTETomek'}
        Name of the sampler class to use.
    sampling_strategy
        Passed through unchanged to the sampler's `sampling_strategy` argument.
    x_train_vect, y_train
        Training features and labels handed to `fit_resample`.
    categorical_features
        Unused. It was intended for SMOTENC, which is disabled (see note below);
        the parameter is kept so existing callers keep working.

    Returns
    -------
    list
        `[x_train_vect_res, y_train_res]` as returned by `fit_resample`.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names (previously this
        surfaced as an UnboundLocalError on `resample`).
    """
    # imblearn is imported lazily, and only the sampler that was asked for.
    if method=='ADASYN':
        from imblearn.over_sampling import ADASYN as resampler_class
    elif method=='RandomOverSampler':
        from imblearn.over_sampling import RandomOverSampler as resampler_class
    elif method=='SMOTE':
        from imblearn.over_sampling import SMOTE as resampler_class
    # SMOTENC not used.
    # --> 947         X_continuous = check_array(X_continuous, accept_sparse=['csr', 'csc'])
    # ...
    # ValueError: Found array with 0 feature(s) (shape=(123539, 0)) while a minimum of 1 is required.
    # Must have continuous feature, but all features (columns) in x_train_vect are specified as categorical.
#     elif method=='SMOTENC':
#         from imblearn.over_sampling import SMOTENC
#         resample = SMOTENC(sampling_strategy=sampling_strategy, random_state=42, categorical_features=categorical_features)
    elif method=='SMOTEENN':
        from imblearn.combine import SMOTEENN as resampler_class
    elif method=='SMOTETomek':
        from imblearn.combine import SMOTETomek as resampler_class
    else:
        raise ValueError("Unsupported resample method: %r" % (method,))
    # Every sampler is configured identically; the fixed seed keeps runs repeatable.
    resample = resampler_class(sampling_strategy=sampling_strategy, random_state=42)
    x_train_vect_res, y_train_res = resample.fit_resample(x_train_vect, y_train)
    return [x_train_vect_res, y_train_res]

# Compile the workflow as a function.
def func_classifier(param_list):
    """Vectorize the texts, resample the training set, fit the classifier and score it.

    Parameters
    ----------
    param_list : sequence of 9 items, in this order
        x_train, y_train, x_test, y_test, resample_method, sampling_strategy,
        classifier, tokenizer, vect_type.
        Everything is packed into one argument so the function can be mapped
        over a list of parameter sets (e.g. with `Pool.map`).

    Returns
    -------
    dict
        Keys 'resample_method', 'sampling_strategy', 'classifier' (str of the
        estimator), 'tokenizer', 'vect_type' and 'weighted_acc'. The score is
        the geometric mean wrapped by make_index_balanced_accuracy
        (alpha=0.1, squared=True), computed with average='weighted'.

    Notes
    -----
    `classifier` is fitted in place, so the caller's object is a fitted model
    after this function returns.
    """
    # Pass parameters.
    x_train, y_train, x_test, y_test, resample_method, sampling_strategy, classifier, tokenizer, vect_type = param_list
    # Encode text input.
    vectorizer=text_vectorizer(tokenizer_type=tokenizer, vectorizer_type=vect_type)
    # NOTE(review): the vectorizer is fitted on train AND test texts, so test-set
    # vocabulary (and document frequencies for 'tfidf') leaks into the features.
    # Kept as is to stay comparable with earlier results -- confirm this is intended.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    vectorizer.fit(pd.concat([x_train, x_test])) # Fit on all texts.
    x_train_vect=vectorizer.transform(x_train)
    x_test_vect=vectorizer.transform(x_test)
    # Encode class labels -- Not necessary for NB/RF algorithms: https://stats.stackexchange.com/questions/288095/what-algorithms-require-one-hot-encoding
    # See test results below.
#     y_train_vect=le.fit_transform(y_train)
#     y_test_vect=le.fit_transform(y_test)
    # Resample imbalanced dataset.
    categorical_features=list(range(0, x_train_vect.shape[1])) # All indices are categorical.
    x_train_vect_res, y_train_res = func_resample(method=resample_method, sampling_strategy=sampling_strategy,
                                                  x_train_vect=x_train_vect, y_train=y_train, categorical_features=categorical_features)
    classifier.fit(x_train_vect_res, y_train_res)
    predictions = classifier.predict(x_test_vect)
    # Wrap the metric under a LOCAL name. The previous `global gmean` + reassignment
    # replaced the module-level function, so every later call in the same process
    # wrapped an already wrapped metric and reported a different score.
    iba_gmean = iba(alpha=0.1, squared=True)(gmean)

    return {'resample_method':resample_method,
            'sampling_strategy':sampling_strategy,
            'classifier':str(classifier), 
            'tokenizer':tokenizer, 
            'vect_type':vect_type, 
            'weighted_acc': iba_gmean(y_true=y_test, y_pred=predictions, average='weighted')
           }

In [6]:
# Inputs shared by every run: spell-checked text as features, NTEE1 as label.
x_train, y_train = trainDF['mission_prgrm_spellchk'], trainDF['NTEE1']
x_test, y_test = testDF['mission_prgrm_spellchk'], testDF['NTEE1']
# Full grid: resampler x sampling strategy x classifier x tokenizer x vectorizer.
# One parameter set per combination, in the order func_classifier unpacks it.
param_llist=[
    [x_train, y_train, x_test, y_test, resample_method, sampling_strategy, classifier, tokenizer, vect_type]
    for resample_method in ['ADASYN', 'RandomOverSampler', 'SMOTE', 'SMOTEENN', 'SMOTETomek']
    for sampling_strategy in ['minority','not minority','not majority','all']
    for classifier in [naive_bayes.MultinomialNB(), naive_bayes.ComplementNB(), ensemble.RandomForestClassifier()]
    for tokenizer in ['lemma', 'porter']
    for vect_type in ['count', 'tfidf']
]
print('total parameter combinations:', len(param_llist))

total parameter combinations: 288


In [7]:
# Cut the grid into five consecutive batches of 48 parameter sets each.
param_llist_1, param_llist_2, param_llist_3, param_llist_4, param_llist_5 = [
    param_llist[batch_start:batch_start+48] for batch_start in range(0, 48*5, 48)
]

In [8]:
# Evaluate the first batch in parallel, one worker process per parameter set.
# `map` blocks until all results are back; leaving the `with` block then shuts the
# workers down instead of leaving 48 idle processes attached to the kernel.
with Pool(48) as p:
    result_dicts=p.map(func_classifier, param_llist_1)

In [9]:
# Tabulate the batch once, persist it, and display the same table.
df_result_dicts=pd.DataFrame(result_dicts)
df_result_dicts.to_pickle('../../output/df_result_dicts_major_group_1.pkl')
df_result_dicts

Unnamed: 0,classifier,resample_method,sampling_strategy,tokenizer,vect_type,weighted_acc
0,"MultinomialNB(alpha=1.0, class_prior=None, fit...",ADASYN,minority,lemma,count,0.68865
1,"MultinomialNB(alpha=1.0, class_prior=None, fit...",ADASYN,minority,lemma,tfidf,0.500933
2,"MultinomialNB(alpha=1.0, class_prior=None, fit...",ADASYN,minority,porter,count,0.68632
3,"MultinomialNB(alpha=1.0, class_prior=None, fit...",ADASYN,minority,porter,tfidf,0.487608
4,"ComplementNB(alpha=1.0, class_prior=None, fit_...",ADASYN,minority,lemma,count,0.706334
5,"ComplementNB(alpha=1.0, class_prior=None, fit_...",ADASYN,minority,lemma,tfidf,0.69088
6,"ComplementNB(alpha=1.0, class_prior=None, fit_...",ADASYN,minority,porter,count,0.702545
7,"ComplementNB(alpha=1.0, class_prior=None, fit_...",ADASYN,minority,porter,tfidf,0.688114
8,"RandomForestClassifier(bootstrap=True, class_w...",ADASYN,minority,lemma,count,0.588108
9,"RandomForestClassifier(bootstrap=True, class_w...",ADASYN,minority,lemma,tfidf,0.61203


### Analyze best parameters.

In [5]:
# Gather every saved result table whose file name contains 'major' and rank the
# parameter sets by score; show the five best.
folder_path='../../output/result_dicts/'
result_file_list=[file_name for file_name in os.listdir(folder_path) if 'major' in file_name]

result_frames=[pd.read_pickle(folder_path+file_name) for file_name in result_file_list]
df_results=pd.concat(result_frames, ignore_index=True)
df_results.sort_values('weighted_acc', ascending=False).head(5)

Unnamed: 0,classifier,resample_method,sampling_strategy,tokenizer,vect_type,weighted_acc
156,"MultinomialNB(alpha=1.0, class_prior=None, fit...",ADASYN,not minority,lemma,count,0.717644
168,"MultinomialNB(alpha=1.0, class_prior=None, fit...",ADASYN,not majority,lemma,count,0.717624
180,"MultinomialNB(alpha=1.0, class_prior=None, fit...",ADASYN,all,lemma,count,0.717624
120,"MultinomialNB(alpha=1.0, class_prior=None, fit...",SMOTE,not majority,lemma,count,0.715368
132,"MultinomialNB(alpha=1.0, class_prior=None, fit...",SMOTE,all,lemma,count,0.715368


### Retrain model using 100% UCF-Training and test on UCF-Testing.

In [6]:
# Each split is stored as several gzip-compressed pickles. Read all chunks of a
# split first and concatenate once: growing the frame inside a loop re-copies the
# already loaded rows on every iteration. The listing is sorted because
# os.listdir returns entries in arbitrary order.
train_file_path='../../dataset/df_ntee_universal/train/'
file_list=sorted(os.listdir(train_file_path))
train_frames=[pd.read_pickle(train_file_path+file, compression='gzip') for file in file_list]
# pd.concat raises on an empty list, so keep the empty-frame result for an empty directory.
df_train=pd.concat(train_frames) if train_frames else pd.DataFrame()

test_file_path='../../dataset/df_ntee_universal/test/'
file_list=sorted(os.listdir(test_file_path))
test_frames=[pd.read_pickle(test_file_path+file, compression='gzip') for file in file_list]
df_test=pd.concat(test_frames) if test_frames else pd.DataFrame()

len(df_train), len(df_test)

(154424, 38607)

In [7]:
# Build the spell-checked text feature the same way for both splits, then print
# the row count and the number of distinct NTEE1 labels of each split.
for df_split in (df_train, df_test):
    df_split['mission_prgrm_spellchk']=df_split['TAXPAYER_NAME']+' '+df_split['mission_spellchk']+' '+df_split['prgrm_dsc_spellchk'] # Using spell-checked.
    print(len(df_split['mission_prgrm_spellchk']), len(df_split['NTEE1'].drop_duplicates()))

154424 25
38607 25


In [8]:
# Top-ranked configuration from the parameter search, applied to the full
# training set and the separate test set.
x_train, y_train = df_train['mission_prgrm_spellchk'], df_train['NTEE1']
x_test, y_test = df_test['mission_prgrm_spellchk'], df_test['NTEE1']
resample_method, sampling_strategy = 'ADASYN', 'not minority'
classifier=naive_bayes.MultinomialNB()
tokenizer, vect_type = 'lemma', 'count'
# Same item order as func_classifier unpacks.
param_list=[x_train, y_train, x_test, y_test, resample_method, sampling_strategy, classifier, tokenizer, vect_type]

In [10]:
# Train and score the chosen configuration. func_classifier calls `.fit` on the
# `classifier` object stored in param_list, so `classifier` is a fitted model
# afterwards; the returned dict holds the settings and the 'weighted_acc' score.
func_classifier(param_list)

{'resample_method': 'ADASYN',
 'sampling_strategy': 'not minority',
 'classifier': 'MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)',
 'tokenizer': 'lemma',
 'vect_type': 'count',
 'weighted_acc': 0.7223228010055919}

In [11]:
# Rebuild the vectorizer with the same settings and the same fitting corpus that
# func_classifier uses, then encode the test texts for the prediction step.
vectorizer=text_vectorizer(tokenizer_type='lemma', vectorizer_type='count')
# Series.append was removed in pandas 2.0; pd.concat builds the same combined Series.
vectorizer.fit(pd.concat([x_train, x_test])) # Fit on all texts.
x_test_vect=vectorizer.transform(x_test)

In [12]:
# Requires that func_classifier(param_list) has already been run in this kernel:
# that call is what fits `classifier`; this cell only predicts with it.
predictions = classifier.predict(x_test_vect)

In [13]:
# Per-class report for the final predictions (see the column header of the
# printed table: pre, rec, spe, f1, geo, iba, sup).
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_true=y_test, y_pred=predictions))

                   pre       rec       spe        f1       geo       iba       sup

          A       0.87      0.82      0.99      0.84      0.90      0.79      4291
          B       0.85      0.78      0.97      0.81      0.87      0.74      6419
          C       0.62      0.77      0.99      0.69      0.87      0.74       827
          D       0.87      0.89      1.00      0.88      0.94      0.87      1034
          E       0.77      0.69      0.99      0.73      0.83      0.66      2307
          F       0.59      0.55      0.99      0.57      0.74      0.53       543
          G       0.65      0.56      0.99      0.60      0.74      0.53      1353
          H       0.33      0.56      1.00      0.41      0.74      0.53       126
          I       0.63      0.64      0.99      0.63      0.80      0.61       740
          J       0.71      0.77      0.99      0.74      0.88      0.75      1132
          K       0.68      0.67      1.00      0.67      0.82      0.64       522
   

In [28]:
# Overall ACC.
# Pair each prediction with the true label by position, then take the share of exact matches.
t=pd.DataFrame({'pred': predictions, 'true': y_test.values})
correct_rows=t.loc[t['pred']==t['true']]
len(correct_rows)/len(t)

0.7517289610692361

See `NB_RF_broad_cat.ipynb` to verify these numbers.