In [1]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import nltk
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
file_list=os.listdir('../../dataset/df_train.pkl.gz/')
df_train=pd.DataFrame()
for file in file_list:
    df_train=pd.concat([df_train, 
                        pd.read_pickle('../../dataset/df_train.pkl.gz/'+file, compression='gzip')])
len(df_train)

234027

In [3]:
df_train['input_text']=df_train['mission_spellchk']+df_train['prgrm_dsc_spellchk'] # Using spell-checked.
df_train['input_text']=[' '.join(s) for s in df_train['input_text']]
len(df_train['input_text']), len(df_train['NTEE1'].drop_duplicates())

(234027, 25)

In [4]:
# Check if the sampling criteria can be satisfied.
small_num=0
while small_num<200: # Make sure each category has at least 200 records.
    sampleDF = df_train[df_train.input_text.notna() & df_train.NTEE1.notna()].sample(120000)
    trainDF, valDF =train_test_split(sampleDF, test_size=.2)
    small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
# See the composition by broad category.
print(trainDF.groupby('NTEE1').count()['EIN'], '\n'*2, valDF.groupby('NTEE1').count()['EIN'])

NTEE1
A    10597
B    16034
C     2155
D     2689
E     6056
F     1462
G     3212
H      325
I     1765
J     2817
K     1233
L     3703
M     2753
N     9230
O     1042
P     6057
Q     1268
R      730
S     8844
T     1359
U      654
V      224
W     5108
X     2727
Y     3956
Name: EIN, dtype: int64 

 NTEE1
A    2623
B    3942
C     534
D     650
E    1543
F     395
G     870
H      87
I     437
J     752
K     325
L     933
M     687
N    2329
O     274
P    1582
Q     322
R     159
S    2084
T     366
U     146
V      59
W    1232
X     699
Y     970
Name: EIN, dtype: int64


### Prepare parrallel envionment.

In [5]:
import ipyparallel as ipp
c = ipp.Client()
print(c.ids)
dview = c[:]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


In [6]:
dview.execute('from sklearn import model_selection, preprocessing, naive_bayes, metrics')
dview.execute('from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer')
dview.execute('from sklearn import decomposition, ensemble')
dview.execute('from nltk.stem import PorterStemmer')
dview.execute('from nltk import word_tokenize')
dview.execute('from nltk.stem import WordNetLemmatizer')
dview.execute('from nltk.corpus import wordnet')
dview.execute('import pandas as pd')
dview.execute('import nltk')
dview.execute('from sklearn.model_selection import train_test_split')
dview['df_train']=df_train
dview['df_performance']=pd.DataFrame(columns=['trial', 'classifier', 'tokenizer', 'vect_type', 'average_mtd', 'accuracy', 'precision', 'recall', 'f1'])

In [7]:
@dview.parallel(block=True)
def func_naive_bayes(trial):
    global df_train, df_performance, classifier, tokenizer, vect_type, average_mtd

    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    # Build training and testing data frame.
    small_num=0
    while small_num<200: # Make sure each category has at least 500 records.
        sampleDF = df_train[df_train.input_text.notna() & df_train.NTEE1.notna()].sample(120000)
        trainDF, valDF = train_test_split(sampleDF, test_size=.3)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    x_train=trainDF['input_text']
    y_train=trainDF['NTEE1']
    x_valid=valDF['input_text']
    y_valid=valDF['NTEE1']
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################

    def porter_tokenizer(token_list):
        return [PorterStemmer().stem(token) for token in token_list]
    
    # Lemmatize using POS tags, assume to improve accuracy.
    # Ref: 
    #   - https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    #   - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def lemma_tokenizer(token_list):
        return [WordNetLemmatizer().lemmatize(word=word, pos=get_wordnet_pos(pos)) for word, pos in nltk.pos_tag(token_list)]
            
    if tokenizer=='lemma':
        tokenizer=lemma_tokenizer
    elif tokenizer=='porter':
        tokenizer=porter_tokenizer
    ################ Define tokenizer ################
    ##########################################################
    
    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    
    if vect_type=='count':
        ##### Token counts #####
        # create the transform
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab.
        vectorizer.fit(trainDF['input_text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)
    elif vect_type=='tfidf':
        ##### TF-IDF #####
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['input_text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)

    ######### Text Vectorization and Transformation ##########
    ##########################################################
    
    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    df_performance = df_performance.append({'trial':str(trial), 
                                            'classifier':str(classifier), 
                                            'tokenizer':tokenizer.__name__, 
                                            'vect_type':vect_type, 
                                            'average_mtd':average_mtd,
                                            'accuracy':metrics.accuracy_score(predictions, y_valid), 
                                            'precision':metrics.precision_score(y_pred=predictions, y_true=y_valid, average=average_mtd),
                                            'recall':metrics.recall_score(y_pred=predictions, y_true=y_valid, average=average_mtd),
                                            'f1':metrics.f1_score(y_pred=predictions, y_true=y_valid, average=average_mtd)
                                           }, ignore_index=True)

### Iterate different configurations at once.

In [None]:
for classifier in [naive_bayes.MultinomialNB(), naive_bayes.ComplementNB()]:
    for tokenizer in ['lemma', 'porter']:
        for vect_type in ['count', 'tfidf']:
            for average_mtd in ['macro', 'weighted']:
                dview['classifier']=classifier
                dview['tokenizer']=tokenizer
                dview['vect_type']=vect_type
                dview['average_mtd']=average_mtd
                t=func_naive_bayes.map(range(30))

In [9]:
df_performance=pd.concat(dview.gather('df_performance'), ignore_index=True)
df_performance.sample(10)

Unnamed: 0,trial,classifier,tokenizer,vect_type,average_mtd,accuracy,precision,recall,f1
201,12,"ComplementNB(alpha=1.0, class_prior=None, fit_...",lemma_tokenizer,count,weighted,0.240194,0.251291,0.240194,0.195009
31,1,"ComplementNB(alpha=1.0, class_prior=None, fit_...",porter_tokenizer,tfidf,weighted,0.239194,0.237461,0.239194,0.206953
59,3,"ComplementNB(alpha=1.0, class_prior=None, fit_...",lemma_tokenizer,tfidf,weighted,0.234611,0.23562,0.234611,0.20473
87,5,"MultinomialNB(alpha=1.0, class_prior=None, fit...",porter_tokenizer,tfidf,weighted,0.165278,0.052585,0.165278,0.046978
64,4,"MultinomialNB(alpha=1.0, class_prior=None, fit...",lemma_tokenizer,count,macro,0.260389,0.195677,0.167313,0.162198
202,12,"ComplementNB(alpha=1.0, class_prior=None, fit_...",lemma_tokenizer,tfidf,macro,0.235528,0.149415,0.117563,0.105657
30,1,"ComplementNB(alpha=1.0, class_prior=None, fit_...",porter_tokenizer,tfidf,macro,0.238333,0.144291,0.119297,0.107599
218,13,"ComplementNB(alpha=1.0, class_prior=None, fit_...",lemma_tokenizer,tfidf,macro,0.238972,0.168982,0.119402,0.109173
96,6,"MultinomialNB(alpha=1.0, class_prior=None, fit...",lemma_tokenizer,count,macro,0.268694,0.202216,0.172679,0.167414
289,18,"MultinomialNB(alpha=1.0, class_prior=None, fit...",lemma_tokenizer,count,weighted,0.262333,0.26238,0.262333,0.244441


In [10]:
df_performance[df_performance.average_mtd=='macro'].groupby(['classifier', 'tokenizer', 'vect_type']).describe()[['accuracy','f1']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,f1,f1,f1,f1,f1,f1,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
classifier,tokenizer,vect_type,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",lemma_tokenizer,count,30.0,0.241269,0.002266,0.235833,0.240069,0.241569,0.242604,0.244778,30.0,0.093399,0.002126,0.089127,0.092017,0.093258,0.095179,0.097384
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",lemma_tokenizer,tfidf,30.0,0.238564,0.002922,0.232667,0.236396,0.237958,0.240625,0.245556,30.0,0.108314,0.001699,0.104967,0.107377,0.10808,0.109486,0.112052
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",porter_tokenizer,count,30.0,0.240762,0.002503,0.235306,0.239028,0.240431,0.242806,0.245556,30.0,0.092774,0.0023,0.087243,0.091331,0.093029,0.094016,0.098436
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",porter_tokenizer,tfidf,30.0,0.238947,0.002754,0.233333,0.236813,0.238861,0.240194,0.245972,30.0,0.108256,0.001617,0.105023,0.107222,0.108431,0.108919,0.112142
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",lemma_tokenizer,count,30.0,0.263319,0.002883,0.257361,0.261604,0.263056,0.265132,0.269222,30.0,0.16316,0.002632,0.157605,0.161699,0.163116,0.164495,0.167805
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",lemma_tokenizer,tfidf,30.0,0.165811,0.001591,0.163,0.164382,0.165625,0.167153,0.169306,30.0,0.011404,0.000103,0.011213,0.011322,0.0114,0.011481,0.01162
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",porter_tokenizer,count,30.0,0.263534,0.002993,0.25725,0.261354,0.26375,0.265222,0.26925,30.0,0.163105,0.002602,0.158064,0.161268,0.163216,0.164914,0.168329
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",porter_tokenizer,tfidf,30.0,0.166251,0.001728,0.161611,0.165111,0.166361,0.167458,0.168722,30.0,0.011448,0.00012,0.011165,0.011385,0.011459,0.011527,0.011642


### Random forest.

In [None]:
for classifier in [ensemble.RandomForestClassifier()]:
    for tokenizer in ['lemma', 'porter']:
        for vect_type in ['count', 'tfidf']:
            for average_mtd in ['macro', 'weighted']:
                dview['classifier']=classifier
                dview['tokenizer']=tokenizer
                dview['vect_type']=vect_type
                dview['average_mtd']=average_mtd
                t=func_naive_bayes.map(range(30))

In [12]:
df_performance=pd.concat(dview.gather('df_performance'), ignore_index=True)
df_performance.sample(10)

Unnamed: 0,trial,classifier,tokenizer,vect_type,average_mtd,accuracy,precision,recall,f1
235,9,"RandomForestClassifier(bootstrap=True, class_w...",lemma_tokenizer,tfidf,weighted,0.278,0.279592,0.278,0.255527
344,14,"ComplementNB(alpha=1.0, class_prior=None, fit_...",lemma_tokenizer,count,macro,0.235833,0.190699,0.105259,0.092207
95,3,"RandomForestClassifier(bootstrap=True, class_w...",porter_tokenizer,tfidf,weighted,0.278028,0.280232,0.278028,0.254919
403,16,"RandomForestClassifier(bootstrap=True, class_w...",lemma_tokenizer,tfidf,weighted,0.277972,0.278541,0.277972,0.254734
94,3,"RandomForestClassifier(bootstrap=True, class_w...",porter_tokenizer,tfidf,macro,0.274389,0.266772,0.147704,0.160976
132,5,"ComplementNB(alpha=1.0, class_prior=None, fit_...",porter_tokenizer,count,macro,0.237528,0.182862,0.103287,0.090467
567,23,"ComplementNB(alpha=1.0, class_prior=None, fit_...",porter_tokenizer,tfidf,weighted,0.235889,0.237583,0.235889,0.205232
531,22,"MultinomialNB(alpha=1.0, class_prior=None, fit...",lemma_tokenizer,tfidf,weighted,0.167333,0.043979,0.167333,0.04807
246,10,"MultinomialNB(alpha=1.0, class_prior=None, fit...",porter_tokenizer,tfidf,macro,0.165833,0.006634,0.04,0.01138
215,8,"RandomForestClassifier(bootstrap=True, class_w...",porter_tokenizer,tfidf,weighted,0.281472,0.285078,0.281472,0.259149


In [15]:
df_performance[df_performance.average_mtd=='macro'].groupby(['classifier', 'tokenizer', 'vect_type']).describe()[['accuracy','f1']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,f1,f1,f1,f1,f1,f1,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
classifier,tokenizer,vect_type,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",lemma_tokenizer,count,30.0,0.241269,0.002266,0.235833,0.240069,0.241569,0.242604,0.244778,30.0,0.093399,0.002126,0.089127,0.092017,0.093258,0.095179,0.097384
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",lemma_tokenizer,tfidf,30.0,0.238564,0.002922,0.232667,0.236396,0.237958,0.240625,0.245556,30.0,0.108314,0.001699,0.104967,0.107377,0.10808,0.109486,0.112052
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",porter_tokenizer,count,30.0,0.240762,0.002503,0.235306,0.239028,0.240431,0.242806,0.245556,30.0,0.092774,0.0023,0.087243,0.091331,0.093029,0.094016,0.098436
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",porter_tokenizer,tfidf,30.0,0.238947,0.002754,0.233333,0.236813,0.238861,0.240194,0.245972,30.0,0.108256,0.001617,0.105023,0.107222,0.108431,0.108919,0.112142
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",lemma_tokenizer,count,30.0,0.263319,0.002883,0.257361,0.261604,0.263056,0.265132,0.269222,30.0,0.16316,0.002632,0.157605,0.161699,0.163116,0.164495,0.167805
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",lemma_tokenizer,tfidf,30.0,0.165811,0.001591,0.163,0.164382,0.165625,0.167153,0.169306,30.0,0.011404,0.000103,0.011213,0.011322,0.0114,0.011481,0.01162
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",porter_tokenizer,count,30.0,0.263534,0.002993,0.25725,0.261354,0.26375,0.265222,0.26925,30.0,0.163105,0.002602,0.158064,0.161268,0.163216,0.164914,0.168329
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",porter_tokenizer,tfidf,30.0,0.166251,0.001728,0.161611,0.165111,0.166361,0.167458,0.168722,30.0,0.011448,0.00012,0.011165,0.011385,0.011459,0.011527,0.011642
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n max_depth=None, max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,\n oob_score=False, random_state=None, verbose=0,\n warm_start=False)",lemma_tokenizer,count,30.0,0.257443,0.002952,0.251972,0.255514,0.257264,0.258389,0.26525,30.0,0.154982,0.003609,0.14557,0.153302,0.154772,0.158298,0.160671
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n max_depth=None, max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,\n oob_score=False, random_state=None, verbose=0,\n warm_start=False)",lemma_tokenizer,tfidf,30.0,0.279081,0.002217,0.274444,0.277569,0.279111,0.280854,0.282556,30.0,0.16684,0.00243,0.16255,0.165637,0.166418,0.168261,0.172533


## Draft.

### Try Mission Statements - MultinomialNB - LemmaTokenizer - TFIDF.

In [29]:
@dview.parallel(block=True)
def func_mission_MNB_lemma_tfidf(trial):
    global df_train, df_performance, txt_field, classifier, tokenizer, vect_type, average_mtd
    
    ##########################################################
    ####### Set environments for different functions #########
    txt_field='mission' # 'mission', 'prgrm_dsc', 'mission_prgrm'
    classifier=naive_bayes.MultinomialNB()
    tokenizer='lemma' # 'lemma', 'stemming'
    vect_type='tfidf' # 'count', 'tfidf'
    average_mtd='macro' # Use unweighted mean.
    ####### Set environments for different functions #########
    ##########################################################
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF[txt_field].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
            
    if tokenizer=='lemma':
        tokenizer=LemmaTokenizer()
    elif tokenizer=='stemming':
        tokenizer=stemming_tokenizer()
    ################ Define tokenizer ################
    ##########################################################
    
    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    
    if vect_type=='count':
        ##### Token counts #####
        # create the transform
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)
    elif vect_type=='tfidf':
        ##### TF-IDF #####
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)

    ######### Text Vectorization and Transformation ##########
    ##########################################################
    
    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    df_performance = df_performance.append({'trial':str(trial), 
                                            'txt_field':txt_field, 
                                            'classifier':str(classifier), 
                                            'tokenizer':type(tokenizer).__name__, 
                                            'vect_type':vect_type, 
                                            'average_mtd':average_mtd,
                                            'accuracy':metrics.accuracy_score(predictions, y_valid), 
                                            'precision':metrics.precision_score(predictions, y_valid, average=average_mtd),
                                            'recall':metrics.recall_score(predictions, y_valid, average=average_mtd),
                                           }, ignore_index=True)

In [30]:
t=func_mission_MNB_lemma_tfidf.map(range(100))
df_performance=pd.concat(dview.gather('df_performance'), ignore_index=True)
df_performance.sample(10)

Unnamed: 0,trial,txt_field,classifier,tokenizer,vect_type,average_mtd,accuracy,precision,recall
681,97,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,tfidf,macro,0.673533,0.479451,0.639056
353,51,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.5953,0.353232,0.595305
595,85,mission,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.703933,0.533654,0.635387
561,81,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.662233,0.426434,0.689126
240,34,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.5987,0.355358,0.670974
347,49,mission,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,tfidf,macro,0.700633,0.540676,0.63322
662,94,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.656733,0.475081,0.608428
79,10,mission,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,tfidf,macro,0.699167,0.529788,0.612299
335,47,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,tfidf,macro,0.5617,0.30022,0.537231
269,39,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6099,0.363073,0.620524


### Try Mission Statements - MultinomialNB - LemmaTokenizer - Count.

In [5]:
@dview.parallel(block=True)
def func_mission_MNB_lemma_count(trial):
    global df_train, df_performance
    
    ##########################################################
    ####### Set environments for different functions #########
    txt_field='mission' # 'mission', 'prgrm_dsc', 'mission_prgrm'
    classifier=naive_bayes.MultinomialNB()
    tokenizer='lemma' # 'lemma', 'stemming'
    vect_type='count' # 'count', 'tfidf'
    average_mtd='macro' # Use unweighted mean.
    ####### Set environments for different functions #########
    ##########################################################
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF[txt_field].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
            
    if tokenizer=='lemma':
        tokenizer=LemmaTokenizer()
    elif tokenizer=='stemming':
        tokenizer=stemming_tokenizer()
    ################ Define tokenizer ################
    ##########################################################
    
    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    
    if vect_type=='count':
        ##### Token counts #####
        # create the transform
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)
    elif vect_type=='tfidf':
        ##### TF-IDF #####
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)

    ######### Text Vectorization and Transformation ##########
    ##########################################################
    
    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    df_performance = df_performance.append({'trial':str(trial), 
                                            'txt_field':txt_field, 
                                            'classifier':str(classifier), 
                                            'tokenizer':type(tokenizer).__name__, 
                                            'vect_type':vect_type, 
                                            'average_mtd':average_mtd,
                                            'accuracy':metrics.accuracy_score(predictions, y_valid), 
                                            'precision':metrics.precision_score(predictions, y_valid, average=average_mtd),
                                            'recall':metrics.recall_score(predictions, y_valid, average=average_mtd),
                                           }, ignore_index=True)

In [6]:
t=func_mission_MNB_lemma_count.map(range(100))
df_performance=pd.concat(dview.gather('df_performance'), ignore_index=True)
df_performance.sample(10)

### Try Program Description - MultinomialNB - LemmaTokenizer - Count.

In [12]:
@dview.parallel(block=True)
def func_prgrm_MNB_lemma_count(trial):
    global df_train, df_performance
    
    ##########################################################
    ####### Set environments for different functions #########
    txt_field='prgrm_dsc' # 'mission', 'prgrm_dsc', 'mission_prgrm'
    classifier=naive_bayes.MultinomialNB()
    tokenizer='lemma' # 'lemma', 'stemming'
    vect_type='count' # 'count', 'tfidf'
    average_mtd='macro' # Use unweighted mean.
    ####### Set environments for different functions #########
    ##########################################################
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF[txt_field].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
            
    if tokenizer=='lemma':
        tokenizer=LemmaTokenizer()
    elif tokenizer=='stemming':
        tokenizer=stemming_tokenizer()
    ################ Define tokenizer ################
    ##########################################################
    
    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    
    if vect_type=='count':
        ##### Token counts #####
        # create the transform
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)
    elif vect_type=='tfidf':
        ##### TF-IDF #####
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)

    ######### Text Vectorization and Transformation ##########
    ##########################################################
    
    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    df_performance = df_performance.append({'trial':str(trial), 
                                            'txt_field':txt_field, 
                                            'classifier':str(classifier), 
                                            'tokenizer':type(tokenizer).__name__, 
                                            'vect_type':vect_type, 
                                            'average_mtd':average_mtd,
                                            'accuracy':metrics.accuracy_score(predictions, y_valid), 
                                            'precision':metrics.precision_score(predictions, y_valid, average=average_mtd),
                                            'recall':metrics.recall_score(predictions, y_valid, average=average_mtd),
                                           }, ignore_index=True)

In [14]:
t=func_prgrm_MNB_lemma_count.map(range(100))
df_performance=pd.concat(dview.gather('df_performance'), ignore_index=True)
df_performance.sample(10)

Unnamed: 0,trial,txt_field,classifier,tokenizer,vect_type,average_mtd,accuracy,precision,recall
4,1,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.598833,0.349254,0.579754
13,7,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.658333,0.422397,0.644755
52,26,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.656933,0.420701,0.683195
176,88,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6626,0.426009,0.652042
56,28,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.667333,0.431431,0.671606
167,83,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.600867,0.357212,0.680094
11,5,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6046,0.35552,0.627683
194,96,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.603167,0.353313,0.623195
28,14,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.660667,0.42678,0.647043
188,94,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6623,0.423771,0.637518


### Try Mission Statements - ComplementNB - LemmaTokenizer - Count.

In [18]:
@dview.parallel(block=True)
def func_mission_CNB_lemma_count(trial):
    global df_train, df_performance
    
    ##########################################################
    ####### Set environments for different functions #########
    txt_field='mission' # 'mission', 'prgrm_dsc', 'mission_prgrm'
    classifier=naive_bayes.ComplementNB()
    tokenizer='lemma' # 'lemma', 'stemming'
    vect_type='count' # 'count', 'tfidf'
    average_mtd='macro' # Use unweighted mean.
    ####### Set environments for different functions #########
    ##########################################################
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF[txt_field].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
            
    if tokenizer=='lemma':
        tokenizer=LemmaTokenizer()
    elif tokenizer=='stemming':
        tokenizer=stemming_tokenizer()
    ################ Define tokenizer ################
    ##########################################################
    
    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    
    if vect_type=='count':
        ##### Token counts #####
        # create the transform
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)
    elif vect_type=='tfidf':
        ##### TF-IDF #####
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)

    ######### Text Vectorization and Transformation ##########
    ##########################################################
    
    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    df_performance = df_performance.append({'trial':str(trial), 
                                            'txt_field':txt_field, 
                                            'classifier':str(classifier), 
                                            'tokenizer':type(tokenizer).__name__, 
                                            'vect_type':vect_type, 
                                            'average_mtd':average_mtd,
                                            'accuracy':metrics.accuracy_score(predictions, y_valid), 
                                            'precision':metrics.precision_score(predictions, y_valid, average=average_mtd),
                                            'recall':metrics.recall_score(predictions, y_valid, average=average_mtd),
                                           }, ignore_index=True)

In [19]:
t=func_mission_CNB_lemma_count.map(range(100))
df_performance=pd.concat(dview.gather('df_performance'), ignore_index=True)
df_performance.sample(10)

Unnamed: 0,trial,txt_field,classifier,tokenizer,vect_type,average_mtd,accuracy,precision,recall
115,29,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.606267,0.356488,0.618608
209,53,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6636,0.426699,0.640428
146,36,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.606667,0.353021,0.649588
126,30,mission,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.699667,0.534671,0.630701
155,39,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6099,0.363073,0.620524
203,51,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.5953,0.353232,0.595305
215,53,mission,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.7004,0.534318,0.625981
15,3,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6013,0.353904,0.650824
17,5,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6046,0.35552,0.627683
240,60,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6632,0.428999,0.67882


### Try Program Description - ComplementNB - LemmaTokenizer - Count.

In [16]:
@dview.parallel(block=True)
def func_prgrm_CNB_lemma_count(trial):
    global df_train, df_performance
    
    ##########################################################
    ####### Set environments for different functions #########
    txt_field='prgrm_dsc' # 'mission', 'prgrm_dsc', 'mission_prgrm'
    classifier=naive_bayes.ComplementNB()
    tokenizer='lemma' # 'lemma', 'stemming'
    vect_type='count' # 'count', 'tfidf'
    average_mtd='macro' # Use unweighted mean.
    ####### Set environments for different functions #########
    ##########################################################
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF[txt_field].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
            
    if tokenizer=='lemma':
        tokenizer=LemmaTokenizer()
    elif tokenizer=='stemming':
        tokenizer=stemming_tokenizer()
    ################ Define tokenizer ################
    ##########################################################
    
    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    
    if vect_type=='count':
        ##### Token counts #####
        # create the transform
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)
    elif vect_type=='tfidf':
        ##### TF-IDF #####
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)

    ######### Text Vectorization and Transformation ##########
    ##########################################################
    
    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    df_performance = df_performance.append({'trial':str(trial), 
                                            'txt_field':txt_field, 
                                            'classifier':str(classifier), 
                                            'tokenizer':type(tokenizer).__name__, 
                                            'vect_type':vect_type, 
                                            'average_mtd':average_mtd,
                                            'accuracy':metrics.accuracy_score(predictions, y_valid), 
                                            'precision':metrics.precision_score(predictions, y_valid, average=average_mtd),
                                            'recall':metrics.recall_score(predictions, y_valid, average=average_mtd),
                                           }, ignore_index=True)

In [17]:
t=func_prgrm_CNB_lemma_count.map(range(100))
df_performance=pd.concat(dview.gather('df_performance'), ignore_index=True)
df_performance.sample(10)

Unnamed: 0,trial,txt_field,classifier,tokenizer,vect_type,average_mtd,accuracy,precision,recall
3,0,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.601267,0.351739,0.625842
201,67,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.598733,0.353515,0.62518
289,97,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.657133,0.422356,0.644037
284,94,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.598733,0.352534,0.630569
109,37,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.661867,0.428198,0.653606
248,82,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.600867,0.351543,0.663272
240,80,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.659667,0.426818,0.636164
190,62,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.6574,0.47827,0.614129
106,34,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.655733,0.479513,0.613932
204,68,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.659033,0.424482,0.680102


### Try Program Description - ComplementNB - LemmaTokenizer - TFIDF.

In [23]:
@dview.parallel(block=True)
def func_prgrm_CNB_lemma_tfidf(trial):
    global df_train, df_performance
    
    ##########################################################
    ####### Set environments for different functions #########
    txt_field='prgrm_dsc' # 'mission', 'prgrm_dsc', 'mission_prgrm'
    classifier=naive_bayes.ComplementNB()
    tokenizer='lemma' # 'lemma', 'stemming'
    vect_type='tfidf' # 'count', 'tfidf'
    average_mtd='macro' # Use unweighted mean.
    ####### Set environments for different functions #########
    ##########################################################
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF[txt_field].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
            
    if tokenizer=='lemma':
        tokenizer=LemmaTokenizer()
    elif tokenizer=='stemming':
        tokenizer=stemming_tokenizer()
    ################ Define tokenizer ################
    ##########################################################
    
    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    
    if vect_type=='count':
        ##### Token counts #####
        # create the transform
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)
    elif vect_type=='tfidf':
        ##### TF-IDF #####
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)

    ######### Text Vectorization and Transformation ##########
    ##########################################################
    
    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    df_performance = df_performance.append({'trial':str(trial), 
                                            'txt_field':txt_field, 
                                            'classifier':str(classifier), 
                                            'tokenizer':type(tokenizer).__name__, 
                                            'vect_type':vect_type, 
                                            'average_mtd':average_mtd,
                                            'accuracy':metrics.accuracy_score(predictions, y_valid), 
                                            'precision':metrics.precision_score(predictions, y_valid, average=average_mtd),
                                            'recall':metrics.recall_score(predictions, y_valid, average=average_mtd),
                                           }, ignore_index=True)

In [24]:
t=func_prgrm_CNB_lemma_tfidf.map(range(100))
df_performance=pd.concat(dview.gather('df_performance'), ignore_index=True)
df_performance.sample(10)

Unnamed: 0,trial,txt_field,classifier,tokenizer,vect_type,average_mtd,accuracy,precision,recall
96,18,mission,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.7004,0.528067,0.616052
251,51,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.661267,0.426321,0.683731
236,46,mission,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.7012,0.531807,0.626519
284,56,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.653367,0.477555,0.613562
234,46,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.655467,0.474524,0.621911
405,81,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.6609,0.484696,0.616098
365,73,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.656233,0.477075,0.610757
461,93,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6576,0.423429,0.641899
47,11,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.661233,0.424397,0.657256
18,3,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6013,0.353904,0.650824


### Try Mission Statements - ComplementNB - LemmaTokenizer - TFIDF.

In [25]:
@dview.parallel(block=True)
def func_mission_CNB_lemma_tfidf(trial):
    global df_train, df_performance
    
    ##########################################################
    ####### Set environments for different functions #########
    txt_field='mission' # 'mission', 'prgrm_dsc', 'mission_prgrm'
    classifier=naive_bayes.ComplementNB()
    tokenizer='lemma' # 'lemma', 'stemming'
    vect_type='tfidf' # 'count', 'tfidf'
    average_mtd='macro' # Use unweighted mean.
    ####### Set environments for different functions #########
    ##########################################################
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF[txt_field].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
            
    if tokenizer=='lemma':
        tokenizer=LemmaTokenizer()
    elif tokenizer=='stemming':
        tokenizer=stemming_tokenizer()
    ################ Define tokenizer ################
    ##########################################################
    
    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    
    if vect_type=='count':
        ##### Token counts #####
        # create the transform
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)
    elif vect_type=='tfidf':
        ##### TF-IDF #####
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)

    ######### Text Vectorization and Transformation ##########
    ##########################################################
    
    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    df_performance = df_performance.append({'trial':str(trial), 
                                            'txt_field':txt_field, 
                                            'classifier':str(classifier), 
                                            'tokenizer':type(tokenizer).__name__, 
                                            'vect_type':vect_type, 
                                            'average_mtd':average_mtd,
                                            'accuracy':metrics.accuracy_score(predictions, y_valid), 
                                            'precision':metrics.precision_score(predictions, y_valid, average=average_mtd),
                                            'recall':metrics.recall_score(predictions, y_valid, average=average_mtd),
                                           }, ignore_index=True)

In [26]:
t=func_mission_CNB_lemma_tfidf.map(range(100))
df_performance=pd.concat(dview.gather('df_performance'), ignore_index=True)
df_performance.sample(10)

Unnamed: 0,trial,txt_field,classifier,tokenizer,vect_type,average_mtd,accuracy,precision,recall
248,40,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,tfidf,macro,0.673467,0.481593,0.663747
411,69,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.603033,0.353719,0.629368
51,6,mission,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,tfidf,macro,0.693267,0.533007,0.63113
219,37,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.608833,0.358827,0.599976
373,63,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6651,0.430047,0.652673
530,88,prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.6048,0.353762,0.596581
96,16,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.659567,0.425299,0.648161
377,63,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.655667,0.47479,0.606161
193,33,mission,"MultinomialNB(alpha=1.0, class_prior=None, fit...",LemmaTokenizer,count,macro,0.657933,0.423631,0.641981
88,14,prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_...",LemmaTokenizer,count,macro,0.662267,0.481588,0.611395


In [28]:
df_performance.groupby(['txt_field', 'classifier', 'vect_type']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,precision,precision,precision,precision,precision,recall,recall,recall,recall,recall,recall,recall,recall
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
txt_field,classifier,vect_type,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
mission,"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",count,100.0,0.699907,0.002527,0.6939,0.698317,0.7001,0.701692,0.706467,100.0,0.531594,...,0.533821,0.54132,100.0,0.627921,0.008283,0.608945,0.622043,0.627389,0.632986,0.647726
mission,"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",tfidf,100.0,0.697841,0.0025,0.690767,0.6959,0.698233,0.699642,0.703133,100.0,0.533794,...,0.536262,0.541629,100.0,0.622604,0.007315,0.60093,0.617881,0.622611,0.626997,0.646784
mission,"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",count,100.0,0.661342,0.002978,0.6517,0.659417,0.66155,0.663217,0.668867,100.0,0.42575,...,0.428148,0.433099,100.0,0.659809,0.021351,0.622731,0.643071,0.650996,0.682431,0.703958
prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",count,100.0,0.657213,0.002682,0.650667,0.655233,0.657233,0.659,0.663833,100.0,0.476934,...,0.479399,0.485138,100.0,0.614657,0.00815,0.5911,0.609157,0.614943,0.619995,0.641197
prgrm_dsc,"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",tfidf,100.0,0.673393,0.002592,0.667533,0.671317,0.673467,0.675242,0.680067,100.0,0.479188,...,0.481655,0.485097,100.0,0.64429,0.012344,0.617138,0.63586,0.644252,0.650899,0.68413
prgrm_dsc,"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",count,100.0,0.602443,0.002772,0.5953,0.600658,0.60235,0.60415,0.6099,100.0,0.355167,...,0.356803,0.364457,100.0,0.627163,0.028436,0.577607,0.609106,0.625511,0.647579,0.706715


In [5]:
df_mission_MNB_lemma_count.describe()

Unnamed: 0,accuracy,precision,recall
count,100.0,100.0,100.0
mean,0.66106,0.425503,0.658984
std,0.002628,0.002539,0.020294
min,0.655533,0.419339,0.627483
25%,0.659017,0.423844,0.64408
50%,0.66135,0.42536,0.650408
75%,0.662775,0.427471,0.677598
max,0.668267,0.432884,0.705346


### Try Mission Statements - ComplementNB - LemmaTokenizer - Count.

In [None]:
df_mission_CNB_lemma_count=pd.DataFrame(columns=['accuracy', 'precision', 'recall'])
for trial in tqdm(range(0, 100)):
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF['mission'].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    ################ Define tokenizer ################
    ##########################################################

    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.

    ##### Token counts #####
    # create the transform
    count_vect = CountVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    count_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_count =  count_vect.transform(x_train)
    x_valid_vect_count =  count_vect.transform(x_valid)
    ##### Token counts #####

    ##### TF-IDF #####
    # create the transform
    tfidf_vect = TfidfVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    tfidf_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_tfidf =  tfidf_vect.transform(x_train)
    x_valid_vect_tfidf =  tfidf_vect.transform(x_valid)
    ##### TF-IDF #####

    ######### Text Vectorization and Transformation ##########
    ##########################################################

    def func_performance(classifier, x_train, y_train, x_valid, y_valid):
        # fit the training dataset on the classifier
        classifier.fit(x_train, y_train)
        # predict the labels on validation dataset
        predictions = classifier.predict(x_valid)
        return [metrics.accuracy_score(predictions, y_valid), 
                metrics.precision_score(predictions, y_valid, 
                                        average='macro', # Use unweighted mean.
                                       ),
                metrics.recall_score(predictions, y_valid, 
                                     average='macro',  # Use unweighted mean.
                                    )]

    performance_result=func_performance(classifier=naive_bayes.ComplementNB(), 
                                        x_train=x_train_vect_count,
                                        y_train= y_train, 
                                        x_valid=x_valid_vect_count,
                                        y_valid=y_valid
                                       )
    df_mission_CNB_lemma_count.loc[trial]=performance_result

 85%|████████▌ | 85/100 [3:10:23<33:50, 135.39s/it]

In [None]:
df_mission_CNB_lemma_count.describe()

### Try Program Description - MultinomialNB - LemmaTokenizer - Count.

In [7]:
df_prgrm_MNB_lemma_count=pd.DataFrame(columns=['accuracy', 'precision', 'recall'])
for trial in tqdm(range(0, 100)):
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF['prgrm_dsc'].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    ################ Define tokenizer ################
    ##########################################################

    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.

    ##### Token counts #####
    # create the transform
    count_vect = CountVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    count_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_count =  count_vect.transform(x_train)
    x_valid_vect_count =  count_vect.transform(x_valid)
    ##### Token counts #####

    ##### TF-IDF #####
    # create the transform
    tfidf_vect = TfidfVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    tfidf_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_tfidf =  tfidf_vect.transform(x_train)
    x_valid_vect_tfidf =  tfidf_vect.transform(x_valid)
    ##### TF-IDF #####

    ######### Text Vectorization and Transformation ##########
    ##########################################################

    def func_performance(classifier, x_train, y_train, x_valid, y_valid):
        # fit the training dataset on the classifier
        classifier.fit(x_train, y_train)
        # predict the labels on validation dataset
        predictions = classifier.predict(x_valid)
        return [metrics.accuracy_score(predictions, y_valid), 
                metrics.precision_score(predictions, y_valid, 
                                        average='macro', # Use unweighted mean.
                                       ),
                metrics.recall_score(predictions, y_valid, 
                                     average='macro',  # Use unweighted mean.
                                    )]

    performance_result=func_performance(classifier=naive_bayes.MultinomialNB(), 
                                        x_train=x_train_vect_count,
                                        y_train= y_train, 
                                        x_valid=x_valid_vect_count,
                                        y_valid=y_valid
                                       )
    df_prgrm_MNB_lemma_count.loc[trial]=performance_result

100%|██████████| 100/100 [11:35:52<00:00, 419.77s/it]


In [None]:
df_prgrm_MNB_lemma_count.describe()

### Try Program Description - ComplementNB - LemmaTokenizer - Count.

In [None]:
df_prgrm_CNB_lemma_count=pd.DataFrame(columns=['accuracy', 'precision', 'recall'])
for trial in tqdm(range(0, 100)):
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF['prgrm_dsc'].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    ################ Define tokenizer ################
    ##########################################################

    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.

    ##### Token counts #####
    # create the transform
    count_vect = CountVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    count_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_count =  count_vect.transform(x_train)
    x_valid_vect_count =  count_vect.transform(x_valid)
    ##### Token counts #####

    ##### TF-IDF #####
    # create the transform
    tfidf_vect = TfidfVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    tfidf_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_tfidf =  tfidf_vect.transform(x_train)
    x_valid_vect_tfidf =  tfidf_vect.transform(x_valid)
    ##### TF-IDF #####

    ######### Text Vectorization and Transformation ##########
    ##########################################################

    def func_performance(classifier, x_train, y_train, x_valid, y_valid):
        # fit the training dataset on the classifier
        classifier.fit(x_train, y_train)
        # predict the labels on validation dataset
        predictions = classifier.predict(x_valid)
        return [metrics.accuracy_score(predictions, y_valid), 
                metrics.precision_score(predictions, y_valid, 
                                        average='macro', # Use unweighted mean.
                                       ),
                metrics.recall_score(predictions, y_valid, 
                                     average='macro',  # Use unweighted mean.
                                    )]

    performance_result=func_performance(classifier=naive_bayes.ComplementNB(), 
                                        x_train=x_train_vect_count,
                                        y_train= y_train, 
                                        x_valid=x_valid_vect_count,
                                        y_valid=y_valid
                                       )
    df_prgrm_CNB_lemma_count.loc[trial]=performance_result

In [None]:
df_prgrm_CNB_lemma_count.describe()

In [27]:
# Naive Bayes on Word Level TF IDF Vectors
# Naive Bayes on Count Vectors
accuracy = train_model(classifier=naive_bayes.MultinomialNB(), 
                       x_train=x_train_vect_tfidf,
                       y_train= y_train, 
                       x_valid=x_valid_vect_tfidf,
                       y_valid=y_valid
                      )
results.loc[len(results)] = ["NB, Count Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NB, TF IDF Vectors: ", accuracy)


# Raw: MNB, Count Vectors:  [0.6732666666666667, 0.4503852200520778, 0.6661678975327818, datetime.timedelta(microseconds=489443)]
# Raw: CNB, Count Vectors:  [0.7013333333333334, 0.5275971009306734, 0.6170202787685387, datetime.timedelta(microseconds=521657)]
# Stemmed: MNB, Count Vectors:  [0.6601, 0.4243013182900721, 0.639930088219624, datetime.timedelta(microseconds=443571)]
# Stemmed: CNB, Count Vectors:  [0.7, 0.5320034132229355, 0.6383184621220114, datetime.timedelta(microseconds=504788)]
# Lemma: MNB, Count Vectors:  [0.6615, 0.42366552445612, 0.6783346552845068, datetime.timedelta(microseconds=572416)]
# Lemma: CNB, Count Vectors:  [0.7021333333333334, 0.5341465196920103, 0.6318440519847688, datetime.timedelta(microseconds=586597)]

# Raw: MNB, TF IDF Vectors:  [0.5859, 0.32327746269021723, 0.5485416630089535, datetime.timedelta(microseconds=536424)]
# Raw: CNB, TF IDF Vectors:  [0.6992333333333334, 0.5306246321804787, 0.6136847132057796, datetime.timedelta(microseconds=580513)]
# Stemmed: MNB, TF IDF Vectors:  [0.553, 0.2949552239378839, 0.536824161513825, datetime.timedelta(microseconds=453431)]
# Stemmed: CNB, TF IDF Vectors:  [0.6967666666666666, 0.5349583083797411, 0.6223488506468897, datetime.timedelta(microseconds=520307)]
# Lemma: MNB, TF IDF Vectors:  [0.5589666666666666, 0.297005157029138, 0.5293106710625075, datetime.timedelta(microseconds=529497)]
# Lemma: CNB, TF IDF Vectors:  [0.7012333333333334, 0.536867845262306, 0.6273414839488708, datetime.timedelta(microseconds=546401)]

NB, TF IDF Vectors:  [0.5583666666666667, 0.29982304712090974, 0.5961215483457072, datetime.timedelta(microseconds=498992)]


  'recall', 'true', average, warn_for)


**Looks like `Lemma-CNB-Count` produces best results.**