In [33]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
from tensorflow import keras
from sklearn import preprocessing
import keras.backend as K

#to-do list
#1. Record the amount of time a classifier takes: start(timestamp) - end(timestamp), put it in .pkl.gz

#Observations:
# 1. RF gives highest accuracy, but takes a lot of time to train: 25 minutes and 15 minutes
# 2. Neural Network is weakest
# 3. NB gives satisfactory results within a minute.

trainDF = pd.concat([pd.read_pickle('../../data/2015/MasterData_2015.pkl.gz'),
                    pd.read_pickle('../../data/2014/MasterData_2014.pkl.gz'),
                    pd.read_pickle('../../data/2013/MasterData_2013.pkl.gz'),
                    pd.read_pickle('../../data/2012/MasterData_2012.pkl.gz')])
                                   
trainDF = trainDF[trainDF.TEXT.notna() & trainDF.NTEE.notna()]
trainDF['text'] = trainDF['TEXT'].astype(str)
trainDF['label'] = trainDF['NTEE'].astype(str)
trainDF['category'] = (trainDF['NTEE'].apply(ord)-64).astype('float32')

trainDF = trainDF.drop(['EIN', 'NTEE', 'IRS_URL', 'TEXT','TEXTTYPE', 'YEAR', 'category'], axis=1)
train_df, test_df = np.split(trainDF, [int(.7*len(trainDF))])

#tf.logging.set_verbosity(tf.logging.ERROR)

train_posts = train_df['text']
train_tags = train_df['label']
test_posts = test_df['text']
test_tags = test_df['label']
vocab_size = 1000
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(train_posts)

x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)

encoder = preprocessing.LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)


In [35]:
#precision & recall by: https://github.com/keras-team/keras/issues/5400

def precision(y_true, y_pred):
    """Precision metric.
    Only computes a batch-wise average of precision.
    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

num_labels=26
batch_size = 500
epochs = 20

model = keras.Sequential()
model.add(keras.layers.Dense(512, input_shape=(vocab_size,)))
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dense(num_labels))
model.add(keras.layers.Activation('softmax'))

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              #metrics=['accuracy'],
             metrics=['binary_accuracy', precision, recall])

history = model.fit(x_train, y_train, 
                    batch_size=batch_size, 
                    epochs=epochs, 
                    verbose=1, 
                    validation_split=0.1)



Train on 328851 samples, validate on 36539 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [40]:
import datetime, os

results = pd.DataFrame(columns=['classifier', 'accuracy', 'precision', 'recall', 'time'])

for i in range(0,5):
    score = model.evaluate(x_test, y_test, 
                       batch_size=batch_size, verbose=1)
    results.loc[len(results)] = ["NN", score[1], score[2], score[3], 'nan']
    print(score)

if(os.path.exists('../../data/results/classifier_results.pkl.gz')):
    results = pd.concat([pd.read_pickle('../../data/results/classifier_results.pkl.gz'), results]).drop_duplicates()

results.to_pickle('../../data/results/classifier_results.pkl.gz')

[0.7746244333082344, 0.9888899364459225, 0.8865628756221476, 0.8151944162737179]
[0.7746244333082344, 0.9888899364459225, 0.8865628756221476, 0.8151944162737179]
[0.7746244333082344, 0.9888899364459225, 0.8865628756221476, 0.8151944162737179]
[0.7746244333082344, 0.9888899364459225, 0.8865628756221476, 0.8151944162737179]
[0.7746244333082344, 0.9888899364459225, 0.8865628756221476, 0.8151944162737179]


NameError: name 'result' is not defined

In [48]:
import statistics

counts = trainDF['label'].value_counts().sort_index().to_frame()
counts['category'] = counts.index
counts['train_sample']=(counts['label']/2).astype(int)

def dataformodel(trainDF):
    
    train_df, test_df = np.split(trainDF, [int(.7*len(trainDF))])

    #tf.logging.set_verbosity(tf.logging.ERROR)

    train_posts = train_df['text']
    train_tags = train_df['label']
    test_posts = test_df['text']
    test_tags = test_df['label']
    vocab_size = 1000
    tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
    tokenize.fit_on_texts(train_posts)

    x_train = tokenize.texts_to_matrix(train_posts)
    x_test = tokenize.texts_to_matrix(test_posts)

    encoder = preprocessing.LabelBinarizer()
    encoder.fit(train_tags)
    y_train = encoder.transform(train_tags)
    y_test = encoder.transform(test_tags)
    
    def precision(y_true, y_pred):
        """Precision metric.
        Only computes a batch-wise average of precision.
        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    num_labels=26
    batch_size = 500
    epochs = 20

    model = keras.Sequential()
    model.add(keras.layers.Dense(512, input_shape=(vocab_size,)))
    model.add(keras.layers.Activation('relu'))
    model.add(keras.layers.Dense(num_labels))
    model.add(keras.layers.Activation('softmax'))

    model.compile(loss='categorical_crossentropy', 
                  optimizer='adam', 
                  #metrics=['accuracy'],
                 metrics=['binary_accuracy', precision, recall])

    history = model.fit(x_train, y_train, 
                        batch_size=batch_size, 
                        epochs=epochs, 
                        verbose=1, 
                        validation_split=0.1)
    
    score = model.evaluate(x_test, y_test, 
                       batch_size=batch_size, verbose=1)
    return score

def test(train):
    test_posts1 = train['text']
    test_tags1 = train['label']
    x_test1 = tokenize.texts_to_matrix(test_posts1)
    y_test1 = encoder.transform(test_tags1)
    return x_test1, y_test1
    
def get_random(trainDF):
    train = pd.DataFrame()
    for rec in counts.values:
        train=pd.concat([train, trainDF.loc[trainDF['label']==rec[1]].sample(n=rec[2])])
    return train
        
accuracy, precision, recall = [],[],[]
for iterate in range(0, 10):
     #train = train.sample(n=rec[2])
    #train = get_random(trainDF)
    train = trainDF.sample(n=(int(len(trainDF)/3)))
    #x_result, y_result =dataformodel(train)
    x_result, y_result =test(train)
    result = model.evaluate(x_result, y_result, batch_size=batch_size, verbose=1)
    print(result)
    accuracy.append(result[1])
    precision.append(result[2])
    recall.append(result[3])
            
stats = []                                                      
stats.append([statistics.mean(accuracy), statistics.stdev(accuracy)])                                                                                                                                     
stats.append([statistics.mean(precision), statistics.stdev(precision)])                                                                                                                                             
stats.append([statistics.mean(recall), statistics.stdev(recall)])                                                                                                                                         

print(accuracy)
print(precision)
print(recall)
print(stats)
stats = pd.DataFrame(stats)  

'''
if(os.path.exists('../../data/results/classifier_stats_10.pkl.gz')):
    stats = pd.concat([pd.read_pickle('../../data/results/classifier_stats_10.pkl.gz'), stats]).drop_duplicates()
    
stats.to_pickle('../../data/results/classifier_stats_10.pkl.gz')

results = pd.DataFrame(results, columns=['acNB','acRF', 'acNN', 'prNB', 'prRF', 'prNN', 'rcNB', 'rcRF', 'rcNN'])
if(os.path.exists('../../data/results/classifier_results_10.pkl.gz')):
    results = pd.concat([pd.read_pickle('../../data/results/classifier_results_10.pkl.gz'), results]).drop_duplicates()
    
results.to_pickle('../../data/results/classifier_results_10.pkl.gz')
'''

[0.43095523711268136, 0.993348422638044, 0.9414036889135017, 0.8819448832222201]
[0.4250271705163718, 0.9933707478053337, 0.9415287255466647, 0.8824334050643758]
[0.423503542605678, 0.993393739632051, 0.9419808366448787, 0.8826000754023191]
[0.429151980115537, 0.993391968555884, 0.9421158624523012, 0.8824104152005576]
[0.42732889867417445, 0.9934092116645706, 0.941672274936475, 0.8833472237165209]
[0.42944978341935963, 0.9933475421956189, 0.9413402659111507, 0.8820023574998055]
[0.42800540081995436, 0.9933605809270515, 0.941514153332062, 0.8821747774190438]
[0.4246890280011324, 0.9933868870282572, 0.9414356140597921, 0.8829793952817173]
[0.4286768204951965, 0.9933997099692597, 0.9423419012002943, 0.8823701837987146]
[0.42747247811038486, 0.9933824662730736, 0.9416585997718617, 0.8826345623390794]
[0.993348422638044, 0.9933707478053337, 0.993393739632051, 0.993391968555884, 0.9934092116645706, 0.9933475421956189, 0.9933605809270515, 0.9933868870282572, 0.9933997099692597, 0.993382466273

"\nif(os.path.exists('../../data/results/classifier_stats_10.pkl.gz')):\n    stats = pd.concat([pd.read_pickle('../../data/results/classifier_stats_10.pkl.gz'), stats]).drop_duplicates()\n    \nstats.to_pickle('../../data/results/classifier_stats_10.pkl.gz')\n\nresults = pd.DataFrame(results, columns=['acNB','acRF', 'acNN', 'prNB', 'prRF', 'prNN', 'rcNB', 'rcRF', 'rcNN'])\nif(os.path.exists('../../data/results/classifier_results_10.pkl.gz')):\n    results = pd.concat([pd.read_pickle('../../data/results/classifier_results_10.pkl.gz'), results]).drop_duplicates()\n    \nresults.to_pickle('../../data/results/classifier_results_10.pkl.gz')\n"

In [39]:
for i in range(10):    
    prediction = model.predict(np.array([x_test[i]]))
    
    text_labels = encoder.classes_ 
    predicted_label = text_labels[np.argmax(prediction[0])]
    print(test_posts.iloc[i][:100], "...")
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label)

TO ADVANCE, PROMOTE, AND SUPPORT STUDIES AND RESEARCH OF THE SOCIAL, ECONOMICAL, POLITICAL, SOCIOLOG ...
Actual label:X
Predicted label: V
THE MISSION OF THE OWENS COMMUNITY COLLEGE FOUNDATION IS TO DEVELOP AND PROVIDE RESOURCES TO ADVANCE ...
Actual label:B
Predicted label: B
SHS provides management services for the delivery of healthcare through its affiliated entities. ...
Actual label:E
Predicted label: E
THE PRINCIPAL PURPOSE OF THE ORGANIZATION, HEREINAFTER REFERRED TO AS THE FOUNDATION, SHALL BE TO FO ...
Actual label:B
Predicted label: B
THE PRINCIPAL PURPOSE OF THE ORGANIZATION, HEREINAFTER REFERRED TO AS THE FOUNDATION, SHALL BE TO FO ...
Actual label:B
Predicted label: B
THE PROMOTION AND DEVELOPMENT OF SCHOLARSHIP, LEADERSHIP SKILLS, CHARACTER, ARTISTIC & PHYSICAL/ATHL ...
Actual label:O
Predicted label: O
THE PURPOSE OF THE FOUNDATION IS TO SUPPORT LAKEVIEW VILLAGE, INC., A 501(C)(3) ORGANIZATION. ...
Actual label:T
Predicted label: T
TO PROVIDE AFFORDABLE, SAFE HOUSING FO