In [1]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Test reading file.
import os
file_list=os.listdir('../../dataset/df_train.pkl.gz/')
df_train=pd.DataFrame()
for file in file_list:
    df_train=pd.concat([df_train, 
                        pd.read_pickle('../../dataset/df_train.pkl.gz/'+file, compression='gzip')])
len(df_train)

229472

### Prepare parrallel envionment.

In [None]:
import ipyparallel as ipp
c = ipp.Client()
print(c.ids)
dview = c[:]

In [None]:
dview.execute('from sklearn import model_selection, preprocessing, naive_bayes, metrics')
dview.execute('from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer')
dview.execute('from sklearn import decomposition, ensemble')
dview.execute('from nltk.stem import PorterStemmer')
dview.execute('from nltk import word_tokenize')
dview.execute('from nltk.stem import WordNetLemmatizer')
dview.execute('import pandas as pd')
dview['df_train']=df_train
dview['df_performance']=pd.DataFrame(columns=['trial', 'txt_field', 'classifier', 'tokenizer', 'vect_type', 'average_mtd', 'accuracy', 'precision', 'recall'])

### Try Mission Statements - MultinomialNB - LemmaTokenizer - Count.

In [80]:
@dview.parallel(block=True)
def func_mission_MNB_lemma_count(trial):
    global df_train, df_performance
    
    ##########################################################
    ####### Set environments for different functions #########
    txt_field='mission' # 'mission', 'prgrm_dsc', 'mission_prgrm'
    classifier=naive_bayes.MultinomialNB()
    tokenizer='lemma' # 'lemma', 'stemming'
    vect_type='count' # 'count', 'tfidf'
    average_mtd='macro' # Use unweighted mean.
    ####### Set environments for different functions #########
    ##########################################################
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF[txt_field].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
            
    if tokenizer=='lemma':
        tokenizer=LemmaTokenizer()
    elif tokenizer=='stemming':
        tokenizer=stemming_tokenizer()
    ################ Define tokenizer ################
    ##########################################################
    
    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    
    if vect_type=='count':
        ##### Token counts #####
        # create the transform
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)
    elif vect_type=='tfidf':
        ##### TF-IDF #####
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(trainDF['text'])
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)

    ######### Text Vectorization and Transformation ##########
    ##########################################################
    
    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    df_performance = df_performance.append({'trial':str(trial), 
                                            'txt_field':txt_field, 
                                            'classifier':str(classifier), 
                                            'tokenizer':type(tokenizer).__name__, 
                                            'vect_type':vect_type, 
                                            'average_mtd':average_mtd,
                                            'accuracy':metrics.accuracy_score(predictions, y_valid), 
                                            'precision':metrics.precision_score(predictions, y_valid, average=average_mtd),
                                            'recall':metrics.recall_score(predictions, y_valid, average=average_mtd),
                                           }, ignore_index=True)

In [81]:
func_mission_MNB_lemma_count(3)

In [None]:
df_performance

In [76]:
df_performance.append({'vect_type':'test'}, ignore_index=True, inplace=True)

TypeError: append() got an unexpected keyword argument 'inplace'

In [73]:
pd.DataFrame({'vect_type':'test'})

ValueError: If using all scalar values, you must pass an index

In [5]:
df_mission_MNB_lemma_count.describe()

Unnamed: 0,accuracy,precision,recall
count,100.0,100.0,100.0
mean,0.66106,0.425503,0.658984
std,0.002628,0.002539,0.020294
min,0.655533,0.419339,0.627483
25%,0.659017,0.423844,0.64408
50%,0.66135,0.42536,0.650408
75%,0.662775,0.427471,0.677598
max,0.668267,0.432884,0.705346


### Try Mission Statements - ComplementNB - LemmaTokenizer - Count.

In [None]:
df_mission_CNB_lemma_count=pd.DataFrame(columns=['accuracy', 'precision', 'recall'])
for trial in tqdm(range(0, 100)):
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF['mission'].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    ################ Define tokenizer ################
    ##########################################################

    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.

    ##### Token counts #####
    # create the transform
    count_vect = CountVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    count_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_count =  count_vect.transform(x_train)
    x_valid_vect_count =  count_vect.transform(x_valid)
    ##### Token counts #####

    ##### TF-IDF #####
    # create the transform
    tfidf_vect = TfidfVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    tfidf_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_tfidf =  tfidf_vect.transform(x_train)
    x_valid_vect_tfidf =  tfidf_vect.transform(x_valid)
    ##### TF-IDF #####

    ######### Text Vectorization and Transformation ##########
    ##########################################################

    def func_performance(classifier, x_train, y_train, x_valid, y_valid):
        # fit the training dataset on the classifier
        classifier.fit(x_train, y_train)
        # predict the labels on validation dataset
        predictions = classifier.predict(x_valid)
        return [metrics.accuracy_score(predictions, y_valid), 
                metrics.precision_score(predictions, y_valid, 
                                        average='macro', # Use unweighted mean.
                                       ),
                metrics.recall_score(predictions, y_valid, 
                                     average='macro',  # Use unweighted mean.
                                    )]

    performance_result=func_performance(classifier=naive_bayes.ComplementNB(), 
                                        x_train=x_train_vect_count,
                                        y_train= y_train, 
                                        x_valid=x_valid_vect_count,
                                        y_valid=y_valid
                                       )
    df_mission_CNB_lemma_count.loc[trial]=performance_result

 85%|████████▌ | 85/100 [3:10:23<33:50, 135.39s/it]

In [None]:
df_mission_CNB_lemma_count.describe()

### Try Program Description - MultinomialNB - LemmaTokenizer - Count.

In [7]:
df_prgrm_MNB_lemma_count=pd.DataFrame(columns=['accuracy', 'precision', 'recall'])
for trial in tqdm(range(0, 100)):
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF['prgrm_dsc'].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    ################ Define tokenizer ################
    ##########################################################

    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.

    ##### Token counts #####
    # create the transform
    count_vect = CountVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    count_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_count =  count_vect.transform(x_train)
    x_valid_vect_count =  count_vect.transform(x_valid)
    ##### Token counts #####

    ##### TF-IDF #####
    # create the transform
    tfidf_vect = TfidfVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    tfidf_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_tfidf =  tfidf_vect.transform(x_train)
    x_valid_vect_tfidf =  tfidf_vect.transform(x_valid)
    ##### TF-IDF #####

    ######### Text Vectorization and Transformation ##########
    ##########################################################

    def func_performance(classifier, x_train, y_train, x_valid, y_valid):
        # fit the training dataset on the classifier
        classifier.fit(x_train, y_train)
        # predict the labels on validation dataset
        predictions = classifier.predict(x_valid)
        return [metrics.accuracy_score(predictions, y_valid), 
                metrics.precision_score(predictions, y_valid, 
                                        average='macro', # Use unweighted mean.
                                       ),
                metrics.recall_score(predictions, y_valid, 
                                     average='macro',  # Use unweighted mean.
                                    )]

    performance_result=func_performance(classifier=naive_bayes.MultinomialNB(), 
                                        x_train=x_train_vect_count,
                                        y_train= y_train, 
                                        x_valid=x_valid_vect_count,
                                        y_valid=y_valid
                                       )
    df_prgrm_MNB_lemma_count.loc[trial]=performance_result

100%|██████████| 100/100 [11:35:52<00:00, 419.77s/it]


In [None]:
df_prgrm_MNB_lemma_count.describe()

### Try Program Description - ComplementNB - LemmaTokenizer - Count.

In [None]:
df_prgrm_CNB_lemma_count=pd.DataFrame(columns=['accuracy', 'precision', 'recall'])
for trial in tqdm(range(0, 100)):
    
    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num=0
    while small_num<100: # Make sure each category has at least 100 records.
        trainDF = df_train[df_train.mission.notna() & df_train.NTEE1.notna()].sample(100000)
        small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']
    #### Sample ####
    trainDF['text'] = trainDF['prgrm_dsc'].astype(str)
    trainDF['label'] = trainDF['NTEE1'].astype(str)
    # split the dataset into training and validation datasets 
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                          train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################
    # Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
    # Use NLTK's PorterStemmer
    def stemming_tokenizer(str_input):
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]

    # Source: https://scikit-learn.org/stable/modules/feature_extraction.html
    # Use NLTK's Lemmatizer
    class LemmaTokenizer(object):
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
             return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    ################ Define tokenizer ################
    ##########################################################

    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.

    ##### Token counts #####
    # create the transform
    count_vect = CountVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    count_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_count =  count_vect.transform(x_train)
    x_valid_vect_count =  count_vect.transform(x_valid)
    ##### Token counts #####

    ##### TF-IDF #####
    # create the transform
    tfidf_vect = TfidfVectorizer(stop_words='english', 
                                 tokenizer=LemmaTokenizer(), 
                                 analyzer='word'
                                )
    # tokenize and build vocab
    tfidf_vect.fit(trainDF['text'])
    # Encode document: transform the training and validation data using count vectorizer object
    x_train_vect_tfidf =  tfidf_vect.transform(x_train)
    x_valid_vect_tfidf =  tfidf_vect.transform(x_valid)
    ##### TF-IDF #####

    ######### Text Vectorization and Transformation ##########
    ##########################################################

    def func_performance(classifier, x_train, y_train, x_valid, y_valid):
        # fit the training dataset on the classifier
        classifier.fit(x_train, y_train)
        # predict the labels on validation dataset
        predictions = classifier.predict(x_valid)
        return [metrics.accuracy_score(predictions, y_valid), 
                metrics.precision_score(predictions, y_valid, 
                                        average='macro', # Use unweighted mean.
                                       ),
                metrics.recall_score(predictions, y_valid, 
                                     average='macro',  # Use unweighted mean.
                                    )]

    performance_result=func_performance(classifier=naive_bayes.ComplementNB(), 
                                        x_train=x_train_vect_count,
                                        y_train= y_train, 
                                        x_valid=x_valid_vect_count,
                                        y_valid=y_valid
                                       )
    df_prgrm_CNB_lemma_count.loc[trial]=performance_result

In [None]:
df_prgrm_CNB_lemma_count.describe()

In [27]:
# Naive Bayes on Word Level TF IDF Vectors
# Naive Bayes on Count Vectors
accuracy = train_model(classifier=naive_bayes.MultinomialNB(), 
                       x_train=x_train_vect_tfidf,
                       y_train= y_train, 
                       x_valid=x_valid_vect_tfidf,
                       y_valid=y_valid
                      )
results.loc[len(results)] = ["NB, Count Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NB, TF IDF Vectors: ", accuracy)


# Raw: MNB, Count Vectors:  [0.6732666666666667, 0.4503852200520778, 0.6661678975327818, datetime.timedelta(microseconds=489443)]
# Raw: CNB, Count Vectors:  [0.7013333333333334, 0.5275971009306734, 0.6170202787685387, datetime.timedelta(microseconds=521657)]
# Stemmed: MNB, Count Vectors:  [0.6601, 0.4243013182900721, 0.639930088219624, datetime.timedelta(microseconds=443571)]
# Stemmed: CNB, Count Vectors:  [0.7, 0.5320034132229355, 0.6383184621220114, datetime.timedelta(microseconds=504788)]
# Lemma: MNB, Count Vectors:  [0.6615, 0.42366552445612, 0.6783346552845068, datetime.timedelta(microseconds=572416)]
# Lemma: CNB, Count Vectors:  [0.7021333333333334, 0.5341465196920103, 0.6318440519847688, datetime.timedelta(microseconds=586597)]

# Raw: MNB, TF IDF Vectors:  [0.5859, 0.32327746269021723, 0.5485416630089535, datetime.timedelta(microseconds=536424)]
# Raw: CNB, TF IDF Vectors:  [0.6992333333333334, 0.5306246321804787, 0.6136847132057796, datetime.timedelta(microseconds=580513)]
# Stemmed: MNB, TF IDF Vectors:  [0.553, 0.2949552239378839, 0.536824161513825, datetime.timedelta(microseconds=453431)]
# Stemmed: CNB, TF IDF Vectors:  [0.6967666666666666, 0.5349583083797411, 0.6223488506468897, datetime.timedelta(microseconds=520307)]
# Lemma: MNB, TF IDF Vectors:  [0.5589666666666666, 0.297005157029138, 0.5293106710625075, datetime.timedelta(microseconds=529497)]
# Lemma: CNB, TF IDF Vectors:  [0.7012333333333334, 0.536867845262306, 0.6273414839488708, datetime.timedelta(microseconds=546401)]

NB, TF IDF Vectors:  [0.5583666666666667, 0.29982304712090974, 0.5961215483457072, datetime.timedelta(microseconds=498992)]


  'recall', 'true', average, warn_for)


**Looks like `Lemma-CNB-Count` produces best results.**