In [3]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas as pd
#Can't use keras on 129.114.32.33:8000
#from keras import layers, models, optimizers
import datetime, os

#to-do list
#1. Record the amount of time a classifier takes: end(timestamp) - start(timestamp), put it in .pkl.gz

#Observations:
# 1. RF gives highest accuracy, but takes a lot of time to train: 25 minutes and 15 minutes
# 2. Neural Network is weakest
# 3. NB gives satisfactory results within a minute.

# Load the yearly MasterData pickles (2015 down to 2012, in that order) and stack
# them into a single frame. The original row index of each file is kept as-is.
# NOTE(review): pd.read_pickle executes whatever the pickle contains; only point
# this at files from a trusted source.
trainDF = pd.concat([pd.read_pickle('../../data/{0}/MasterData_{0}.pkl.gz'.format(year))
                     for year in (2015, 2014, 2013, 2012)])

# Keep only rows where both TEXT and NTEE are present.
# .copy() detaches the filtered rows from the concatenated frame so the column
# assignments below write to an independent frame instead of a slice (this is
# what avoids pandas' SettingWithCopyWarning).
trainDF = trainDF[trainDF.TEXT.notna() & trainDF.NTEE.notna()].copy()

# String-typed working columns used by the rest of the notebook.
trainDF['text'] = trainDF['TEXT'].astype(str)
trainDF['label'] = trainDF['NTEE'].astype(str)

# Disabled benchmark: everything between the triple quotes below is one bare
# string literal, so none of it executes. Because the string is the last
# expression in the cell, the notebook echoes it back as the cell output.
#
# The quoted script splits trainDF into training/validation sets, builds count,
# word TF-IDF, n-gram TF-IDF and character TF-IDF features, runs MultinomialNB,
# RandomForestClassifier and a small Keras network through train_model()
# (which returns accuracy, weighted precision, weighted recall and elapsed time),
# and merges the rows into '../../data/results/classifier_results.pkl.gz'.
#
# NOTE(review) -- things to check before re-enabling this code:
#  - layers / models / optimizers are used, but the keras import at the top of
#    the cell is commented out.
#  - every vectorizer is fit on all of trainDF['text'], i.e. on the validation
#    rows as well as the training rows.
#  - train_model() reads valid_y from the enclosing scope rather than taking it
#    as an argument.
#  - the metrics are called as (predictions, valid_y); sklearn documents the
#    order as (y_true, y_pred), so precision and recall are presumably swapped
#    -- confirm.
#  - valid_y is encoded with a second fit_transform; its integer codes may not
#    line up with train_y's if the two label sets differ.
#  - the network ends in Dense(1, activation="sigmoid"), so argmax(axis=-1) on
#    its predictions is presumably always 0 -- verify intent.
'''
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    
    time1 = datetime.datetime.now()
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return [metrics.accuracy_score(predictions, valid_y), 
            metrics.precision_score(predictions, valid_y, average='weighted'), 
            metrics.recall_score(predictions, valid_y, average='weighted'),
            datetime.datetime.now()-time1]

results = pd.DataFrame(columns=['classifier', 'accuracy', 'precision', 'recall', 'time'])

# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
results.loc[len(results)] = ["NB, Count Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
results.loc[len(results)] = ["NB, WordLevel TF-IDF", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
results.loc[len(results)] = ["NB, N-Gram Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
results.loc[len(results)] = ["NB, CharLevel Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NB, CharLevel Vectors: ", accuracy)

# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
results.loc[len(results)] = ["RF, Count Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
results.loc[len(results)] = ["RF, WordLevel TF-IDF", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("RF, WordLevel TF-IDF: ", accuracy)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)


def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier 


classifier = create_model_architecture(xtrain_count.shape[1])
accuracy = train_model(classifier, xtrain_count, train_y, xvalid_count, is_neural_net=True)
results.loc[len(results)] = ["NN, Count Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NN, Count Vectors",  accuracy)

classifier = create_model_architecture(xtrain_tfidf_ngram_chars.shape[1])
accuracy = train_model(classifier, xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars, is_neural_net=True)
results.loc[len(results)] = ["NN, CharLevel Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NN, CharLevel Vectors",  accuracy)

classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])
accuracy = train_model(classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, is_neural_net=True)
results.loc[len(results)] = ["NN, Ngram Level TF IDF Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NN, Ngram Level TF IDF Vectors",  accuracy)

classifier = create_model_architecture(xtrain_tfidf.shape[1])
accuracy = train_model(classifier, xtrain_tfidf, train_y, xvalid_tfidf, is_neural_net=True)
results.loc[len(results)] = ["NN, World Level TF-IDF", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NN, World Level TF-IDF",  accuracy)


if(os.path.exists('../../data/results/classifier_results.pkl.gz')):
    results = pd.concat([pd.read_pickle('../../data/results/classifier_results.pkl.gz'), results]).drop_duplicates()

results.to_pickle('../../data/results/classifier_results.pkl.gz')
'''

  from numpy.core.umath_tests import inner1d


'\n# split the dataset into training and validation datasets \ntrain_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF[\'text\'], trainDF[\'label\'])\n\n# create a count vectorizer object \ncount_vect = CountVectorizer(analyzer=\'word\', token_pattern=r\'\\w{1,}\')\ncount_vect.fit(trainDF[\'text\'])\n\n# transform the training and validation data using count vectorizer object\nxtrain_count =  count_vect.transform(train_x)\nxvalid_count =  count_vect.transform(valid_x)\n\n# word level tf-idf\ntfidf_vect = TfidfVectorizer(analyzer=\'word\', token_pattern=r\'\\w{1,}\', max_features=5000)\ntfidf_vect.fit(trainDF[\'text\'])\nxtrain_tfidf =  tfidf_vect.transform(train_x)\nxvalid_tfidf =  tfidf_vect.transform(valid_x)\n\n# ngram level tf-idf \ntfidf_vect_ngram = TfidfVectorizer(analyzer=\'word\', token_pattern=r\'\\w{1,}\', ngram_range=(2,3), max_features=5000)\ntfidf_vect_ngram.fit(trainDF[\'text\'])\nxtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)\nxvalid_tfi

In [4]:
import tensorflow as tf
from tensorflow import keras, Session
import numpy as np

# Load the IMDB dataset bundled with Keras (num_words=10000) and print the
# container type of the training data plus the test labels as a quick check.
imdb = keras.datasets.imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

print(type(train_data),test_labels)
#print(train_x.values[0])

# Build a shuffled, single-epoch input function over the full text/label frame.
# The trailing () invokes the returned function immediately, so `sets` holds
# whatever that call returns rather than the input function itself.
# Relies on `pd` and `trainDF` defined by the previous cell.
sets = tf.estimator.inputs.pandas_input_fn(
    x=pd.DataFrame(trainDF['text']),
    y=pd.DataFrame(trainDF['label']),
    batch_size=256,
    num_epochs=1,
    shuffle=True,
    queue_capacity=1000,
    num_threads=1,
    target_column='target')()

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; sklearn.model_selection provides the same train_test_split function.
from sklearn.model_selection import train_test_split


# Hold out 25% of the rows for evaluation; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(trainDF['text'], trainDF['label'], test_size = 0.25, random_state = 0)


# write an input function
# pandas_input_fn documents `x` as a DataFrame, while X_train / X_test are
# Series (slices of trainDF['text']), so wrap them the same way the `sets`
# call above does.
input_func = tf.estimator.inputs.pandas_input_fn(x=pd.DataFrame(X_train), y=y_train, batch_size=10, num_epochs=1000, shuffle=True)
eval_input_func = tf.estimator.inputs.pandas_input_fn(x=pd.DataFrame(X_test), y=y_test, batch_size=10, num_epochs=1, shuffle=False)


<class 'numpy.ndarray'> [0 1 1 ... 0 0 0]


