In [1]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
from multiprocessing import Pool

from imblearn.metrics import geometric_mean_score as gmean
from imblearn.metrics import make_index_balanced_accuracy as iba
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

### Read in and prepare data.

In [2]:
train_file_path='../../dataset/df_ntee_universal/train/'
file_list=os.listdir(train_file_path)
df_train=pd.DataFrame()
for file in file_list:
    df_train=pd.concat([df_train, pd.read_pickle(train_file_path+file, compression='gzip')])
len(df_train)

154424

In [3]:
df_train['mission_prgrm']=df_train['mission']+' '+df_train['prgrm_dsc']
df_train['mission_prgrm_spellchk']=df_train['mission_spellchk']+' '+df_train['prgrm_dsc_spellchk'] # Using spell-checked.
len(df_train['mission_prgrm_spellchk']), len(df_train['NTEE1'].drop_duplicates())

(154424, 25)

In [4]:
# # Check if the sampling criteria can be satisfied.
# small_num=0
# while small_num<500: # Make sure each category in training dataset has at least 500 records.
#     trainDF, valDF = model_selection.train_test_split(df_train, test_size=.2)
#     small_num=trainDF.groupby('broad_cat').count().sort_values('EIN').iloc[0]['EIN']

trainDF, testDF = model_selection.train_test_split(df_train, test_size=.2)
# See the composition by broad category.
print(trainDF.groupby('NTEE1').count()['EIN'], '\n'*2, valDF.groupby('NTEE1').count()['EIN'])

NTEE1
A    13586
B    20618
C     2640
D     3430
E     7153
F     1828
G     4032
H      378
I     2349
J     3808
K     1606
L     4824
M     3734
N    12379
O     1379
P     7329
Q     1572
R      842
S    11640
T     1602
U      804
V      290
W     6686
X     3708
Y     5322
Name: EIN, dtype: int64 

 NTEE1
A    3424
B    5209
C     683
D     809
E    1862
F     473
G    1021
H      89
I     598
J     964
K     403
L    1118
M     959
N    3081
O     352
P    1851
Q     415
R     222
S    2819
T     430
U     196
V      60
W    1671
X     858
Y    1318
Name: EIN, dtype: int64


### Prepare classifier function

In [39]:
# Redefine vectorizer.
def text_vectorizer(tokenizer_type=None, vectorizer_type=None):
    ########################################################
    ######### Define and choose tokenizers #################
    def porter_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]
    # Lemmatize using POS tags, assume improving accuracy.
    # Ref: 
    #   - https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    #   - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    def lemma_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
        tokens=word_tokenize(str_input)
        return [WordNetLemmatizer().lemmatize(word=word, pos=get_wordnet_pos(pos)) for word, pos in nltk.pos_tag(tokens)]
    # Choose tokenizer using parameter passed.
    if tokenizer_type=='lemma':
        tokenizer=lemma_tokenizer
    elif tokenizer_type=='porter':
        tokenizer=porter_tokenizer
    ########################################################
    ######### Define and choose vectorizer #################
    # 1. Use word level, character level does not make sense for current situation.
    # 2. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    if vectorizer_type=='count':
        ##### Token counts #####
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        return vectorizer
    elif vectorizer_type=='tfidf':
        ##### TF-IDF #####
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        return vectorizer
    ########################################################

# Define resample strategy.
def func_resample(method, sampling_strategy, x_train_vect, y_train):
    if method=='ADASYN':
        from imblearn.over_sampling import ADASYN
        resample = ADASYN(sampling_strategy=sampling_strategy, random_state=42)
    elif method=='RandomOverSampler':
        from imblearn.over_sampling import RandomOverSampler
        resample = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
    elif method=='SMOTE':
        from imblearn.over_sampling import SMOTE
        resample = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
    elif method=='SMOTENC':
        from imblearn.over_sampling import SMOTENC
        resample = SMOTENC(sampling_strategy=sampling_strategy, random_state=42)
    elif method=='SMOTEENN':
        from imblearn.combine import SMOTEENN
        resample = SMOTEENN(sampling_strategy=sampling_strategy, random_state=42)
    elif method=='SMOTETomek':
        from imblearn.combine import SMOTETomek
        resample = SMOTETomek(sampling_strategy=sampling_strategy, random_state=42)
    x_train_vect_res, y_train_res = resample.fit_resample(x_train_vect, y_train)
    return [x_train_vect_res, y_train_res]

# Compile the workflow as a function.
def func_classifier(param_list):
    global gmean, iba, lb
    # Pass parameters.
    x_train, y_train, x_test, y_test, resample_method, sampling_strategy, classifier, tokenizer, vect_type = param_list
    # Encode text input.
    vectorizer=text_vectorizer(tokenizer_type=tokenizer, vectorizer_type=vect_type)
    vectorizer.fit(x_train.append(x_test)) # Fit on all texts.
    x_train_vect=vectorizer.transform(x_train)
    x_test_vect=vectorizer.transform(x_test)
    # Encode class labels -- Not necessary for NB/RF algorithms: https://stats.stackexchange.com/questions/288095/what-algorithms-require-one-hot-encoding
    # See test results below.
#     y_train_vect=le.fit_transform(y_train)
#     y_test_vect=le.fit_transform(y_test)
    # Resample imbalanced dataset.
    resample_x_y=func_resample(method=resample_method, sampling_strategy=sampling_strategy,
                               x_train_vect=x_train_vect, y_train=y_train)
    classifier.fit(resample_x_y[0], resample_x_y[1])
    predictions = classifier.predict(x_test_vect)
    gmean = iba(alpha=0.1, squared=True)(gmean)

    return {'resample_method':resample_method,
            'sampling_strategy':sampling_strategy,
            'classifier':str(classifier), 
            'tokenizer':tokenizer, 
            'vect_type':vect_type, 
            'weighted_acc': gmean(y_true=y_test, y_pred=predictions, average='weighted')
           }

In [6]:
param_llist=[]
x_train, y_train, x_test, y_test = [trainDF['mission_prgrm_spellchk'], trainDF['NTEE1'],
                                    testDF['mission_prgrm_spellchk'], testDF['NTEE1']]
for resample_method in ['ADASYN', 'RandomOverSampler', 'SMOTE', 'SMOTENC', 'SMOTEENN', 'SMOTETomek']:
    for sampling_strategy in ['minority','not minority','not majority','all']:
        for classifier in [naive_bayes.MultinomialNB(), naive_bayes.ComplementNB(), ensemble.RandomForestClassifier()]:
            for tokenizer in ['lemma', 'porter']:
                for vect_type in ['count', 'tfidf']:
                    param_llist+=[[x_train, y_train, x_test, y_test, resample_method, sampling_strategy, classifier, tokenizer, vect_type]]
print('total parameter combinations:', len(param_llist))

In [None]:
param_llist_0, param_llist_1, param_llist_2, param_llist_3, param_llist_4, param_llist_5 = [param_llist[0:48], param_llist[48:48*2], param_llist[48*2:48*3], 
                                                                                            param_llist[48*3:48*4], param_llist[48*4:48*5], param_llist[48*5:48*6]
                                                                                           ]

In [7]:
p=Pool(48)
result_dicts=p.map(func_classifier, param_llist_0)

### Random forest.

In [16]:
# Generate a list of parameters.
param_llist=[]
for input_text in ['mission', 'prgrm_dsc', 'mission_prgrm', 'mission_spellchk', 'prgrm_dsc_spellchk', 'mission_prgrm_spellchk']:
    for classifier in [ensemble.RandomForestClassifier()]:
        for tokenizer in ['lemma', 'porter']:
            for vect_type in ['count', 'tfidf']:
                for average_mtd in ['macro', 'weighted']:
                    param_llist+=[[input_text, classifier, tokenizer, vect_type, average_mtd]]

In [17]:
p=Pool(24)
df_performance_rf=pd.DataFrame(p.map(func_classifier, param_llist))

### Select model, test on Universal Testing Dataset

In [20]:
df_performance=pd.concat([df_performance_rf, df_performance_nb], ignore_index=True).sort_values(['accuracy', 'f1'], ascending=False)
df_performance[0:10]

Unnamed: 0,accuracy,average_mtd,classifier,f1,input_text,precision,recall,tokenizer,vect_type
139,0.713097,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.697706,mission_prgrm_spellchk,0.71038,0.713097,lemma_tokenizer,tfidf
91,0.713097,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.697509,mission_prgrm,0.709199,0.713097,lemma_tokenizer,tfidf
138,0.713097,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.548333,mission_prgrm_spellchk,0.664668,0.527116,lemma_tokenizer,tfidf
90,0.713097,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.548009,mission_prgrm,0.654455,0.524998,lemma_tokenizer,tfidf
95,0.71096,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.695338,mission_prgrm,0.707266,0.71096,porter_tokenizer,tfidf
94,0.71096,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.545275,mission_prgrm,0.655212,0.52268,porter_tokenizer,tfidf
143,0.710733,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.695284,mission_prgrm_spellchk,0.707968,0.710733,porter_tokenizer,tfidf
142,0.710733,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.545764,mission_prgrm_spellchk,0.664868,0.525096,porter_tokenizer,tfidf
89,0.700372,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.685512,mission_prgrm,0.705112,0.700372,lemma_tokenizer,count
88,0.700372,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.53767,mission_prgrm,0.644781,0.51605,lemma_tokenizer,count


#### Define parameters and datasets

In [5]:
# Define parameters.
input_text='mission_prgrm_spellchk'
classifier=naive_bayes.ComplementNB()
tokenizer='lemma'
vect_type='tfidf'
average_mtd='macro'

df_universal_test=pd.read_pickle('../../dataset/df_ntee_universal/test/df_ntee_universal_test.pkl.gz', compression='gzip')
df_universal_test['mission_prgrm_spellchk']=df_universal_test['mission_spellchk']+' '+df_universal_test['prgrm_dsc_spellchk'] # Using spell-checked.

#### Train best model

In [6]:
##########################################################
################ Prepare dataframe for ML ################
#### Sample ####
# Build training and testing data frame.
x_train=trainDF[input_text]
y_train=trainDF['NTEE1']
################ Prepare dataframe for ML ################
##########################################################

##########################################################
################ Define tokenizer ################

def porter_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
    tokens = word_tokenize(str_input)
    return [PorterStemmer().stem(token) for token in tokens]

# Lemmatize using POS tags, assume to improve accuracy.
# Ref: 
#   - https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
#   - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemma_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
    tokens=word_tokenize(str_input)
    return [WordNetLemmatizer().lemmatize(word=word, pos=get_wordnet_pos(pos)) for word, pos in nltk.pos_tag(tokens)]

if tokenizer=='lemma':
    tokenizer=lemma_tokenizer
elif tokenizer=='porter':
    tokenizer=porter_tokenizer
################ Define tokenizer ################
##########################################################

##########################################################
######### Text Vectorization and Transformation ##########
# 1. Use Porter Stemmer.
# 2. Use word level, character level does not make sense for current situation.
# 3. Use count (freq) and tf-idf vectorizer. see: 
# Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
# Page: 67.

if vect_type=='count':
    ##### Token counts #####
    # create the transform
    vectorizer = CountVectorizer(stop_words='english', 
                                 tokenizer=tokenizer, 
                                 analyzer='word'
                                )
    # tokenize and build vocab.
    vectorizer.fit(x_train) # Using training dataset to build vocabulary.
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect =  vectorizer.transform(x_train)
elif vect_type=='tfidf':
    ##### TF-IDF #####
    # create the transform
    vectorizer = TfidfVectorizer(stop_words='english', 
                                 tokenizer=tokenizer, 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    vectorizer.fit(x_train) # Using training dataset to build vocabulary.
    # Encode document: transform the training and validation data using tfidf vectorizer object
    x_train_vect =  vectorizer.transform(x_train)
######### Text Vectorization and Transformation ##########
##########################################################

classifier.fit(x_train_vect, y_train)

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)

#### Test trained model on Universal Testing

In [8]:
##########################################################
################ Prepare dataframe for ML ################
#### Sample ####
# Build training and testing data frame.
x_valid=df_universal_test[input_text]
y_valid=df_universal_test['NTEE1']
################ Prepare dataframe for ML ################
##########################################################

##########################################################
################ Define tokenizer ################

def porter_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
    tokens = word_tokenize(str_input)
    return [PorterStemmer().stem(token) for token in tokens]

# Lemmatize using POS tags, assume to improve accuracy.
# Ref: 
#   - https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
#   - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemma_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
    tokens=word_tokenize(str_input)
    return [WordNetLemmatizer().lemmatize(word=word, pos=get_wordnet_pos(pos)) for word, pos in nltk.pos_tag(tokens)]

if tokenizer=='lemma':
    tokenizer=lemma_tokenizer
elif tokenizer=='porter':
    tokenizer=porter_tokenizer
################ Define tokenizer ################
##########################################################

##########################################################
######### Text Vectorization and Transformation ##########
# 1. Use Porter Stemmer.
# 2. Use word level, character level does not make sense for current situation.
# 3. Use count (freq) and tf-idf vectorizer. see: 
# Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
# Page: 67.

if vect_type=='count':
    ##### Token counts #####
    # create the transform
    vectorizer = CountVectorizer(stop_words='english', 
                                 tokenizer=tokenizer, 
                                 analyzer='word'
                                )
    # tokenize and build vocab.
    vectorizer.fit(x_train)
    # Encode document: transform the training and validation data using count vectorizer object
    x_valid_vect =  vectorizer.transform(x_valid)
elif vect_type=='tfidf':
    ##### TF-IDF #####
    # create the transform
    vectorizer = TfidfVectorizer(stop_words='english', 
                                 tokenizer=tokenizer, 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    vectorizer.fit(x_train)
    # Encode document: transform the training and validation data using tfidf vectorizer object
    x_valid_vect =  vectorizer.transform(x_valid)
######### Text Vectorization and Transformation ##########
##########################################################

predictions = classifier.predict(x_valid_vect)
performance_dict= {'input_text':input_text,
                   'classifier':str(classifier), 
                   'tokenizer':tokenizer.__name__, 
                   'vect_type':vect_type, 
                   'average_mtd':average_mtd,
                   'accuracy':metrics.accuracy_score(predictions, y_valid), 
                   'precision':metrics.precision_score(y_pred=predictions, y_true=y_valid, average=average_mtd),
                   'recall':metrics.recall_score(y_pred=predictions, y_true=y_valid, average=average_mtd),
                   'f1':metrics.f1_score(y_pred=predictions, y_true=y_valid, average=average_mtd)
                  }

In [9]:
performance_dict

{'input_text': 'mission_prgrm_spellchk',
 'classifier': 'ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)',
 'tokenizer': 'lemma_tokenizer',
 'vect_type': 'tfidf',
 'average_mtd': 'macro',
 'accuracy': 0.716113658144896,
 'precision': 0.659588192865402,
 'recall': 0.5311606825339955,
 'f1': 0.5511202901330786}

In [13]:
pd.DataFrame([predictions, y_valid]).T

Unnamed: 0,0,1
0,L,P
1,A,A
2,N,N
3,A,A
4,S,Y
5,A,A
6,G,G
7,E,E
8,A,C
9,S,S


In [30]:
x_valid.iloc[85]

'PUBLIC EDUCATION AND ENTERTAINMENT PREPARING FOR AN ANNUAL OPERA PERFORMANCE AND SCHOOL EDUCATIONAL PROGRAMS THROUGHOUT THE YEAR .'

In [29]:
x_valid_vect[85]

<1x125816 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [47]:
t=x_valid_vect[32]

In [48]:
t.data

array([0.18100557, 0.22802972, 0.29788442, 0.23999539, 0.19132063,
       0.15383652, 0.05427941, 0.11782835, 0.19442201, 0.16970152,
       0.20433292, 0.34350801, 0.17996997, 0.33130379, 0.15262076,
       0.14241593, 0.13472251, 0.2148029 , 0.25939262, 0.18852346,
       0.1310445 , 0.17414271, 0.10713031, 0.19326427, 0.08294151,
       0.09700927])

In [49]:
classifier.predict(t)

array(['E'], dtype='<U1')

In [51]:
y_valid.iloc[32]

'E'

In [53]:
len(df_universal_test)

38607

In [78]:
sample_size=round(0.2*len(df_universal_test))
df_universal_test_sample=df_universal_test.sample(sample_size, weights=random.choices(range(1, 10000), k=len(df_universal_test)))

In [79]:
import random

In [80]:
len(random.choices(range(1, 100), k=sample_size))

7721

In [81]:
df_universal_test_sample.groupby('NTEE1').count()['EIN']/len(df_universal_test_sample)

NTEE1
A    0.109830
B    0.164616
C    0.020334
D    0.030307
E    0.060355
F    0.014765
G    0.033156
H    0.003108
I    0.018003
J    0.031084
K    0.010750
L    0.041186
M    0.030696
N    0.102707
O    0.011786
P    0.059060
Q    0.011009
R    0.008548
S    0.089237
T    0.015024
U    0.006994
V    0.001684
W    0.053879
X    0.028105
Y    0.043777
Name: EIN, dtype: float64

In [82]:
df_universal_test.groupby('NTEE1').count()['EIN']/len(df_universal_test)

NTEE1
A    0.111146
B    0.166265
C    0.021421
D    0.026783
E    0.059756
F    0.014065
G    0.035045
H    0.003264
I    0.019168
J    0.029321
K    0.013521
L    0.039811
M    0.029528
N    0.101666
O    0.010594
P    0.060041
Q    0.011293
R    0.006657
S    0.093325
T    0.014013
U    0.005828
V    0.002202
W    0.052788
X    0.028440
Y    0.044059
Name: EIN, dtype: float64