In [32]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 

import pandas as pd
#Can't use keras on 129.114.32.33:8000
#from keras import layers, models, optimizers
import datetime, os

#to-do list
#1. Record the amount of time a classifier takes: start(timestamp) - end(timestamp), put it in .pkl.gz

#Observations:
# 1. RF gives highest accuracy, but takes a lot of time to train: 25 minutes and 15 minutes
# 2. Neural Network is weakest
# 3. NB gives satisfactory results within a minute.

trainDF = pd.concat([pd.read_pickle('../../dataset/raw_data/MasterData_2015.pkl.gz'),
                    pd.read_pickle('../../dataset/raw_data/MasterData_2014.pkl.gz'),
                    pd.read_pickle('../../dataset/raw_data/MasterData_2013.pkl.gz'),
                    pd.read_pickle('../../dataset/raw_data/MasterData_2012.pkl.gz')])
                                   
trainDF = trainDF[trainDF.TEXT.notna() & trainDF.NTEE.notna()]
trainDF['text'] = trainDF['TEXT'].astype(str)
trainDF['label'] = trainDF['NTEE'].astype(str)


# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])


#Source: https://scikit-learn.org/stable/modules/feature_extraction.html
#Use NLTK's Lemmatizer
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
         return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

#Uncomment the line below to implement lemmatizer
#count_vect = CountVectorizer(tokenizer=LemmaTokenizer())


#Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [PorterStemmer().stem(word) for word in words]
    return words

# Uncomment the line below to implement Porter Stemmer
#count_vec = CountVectorizer(stop_words='english', tokenizer=stemming_tokenizer)

#Comment the line below to implement any other vectorizer - Lemmatizer or PorterStemmer
count_vect = CountVectorizer((analyzer='word', token_pattern=r'\w{1,}')


count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    
    time1 = datetime.datetime.now()
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return [metrics.accuracy_score(predictions, valid_y), 
            metrics.precision_score(predictions, valid_y, average='weighted'), 
            metrics.recall_score(predictions, valid_y, average='weighted'),
            datetime.datetime.now()-time1]

results = pd.DataFrame(columns=['classifier', 'accuracy', 'precision', 'recall', 'time'])

# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
results.loc[len(results)] = ["NB, Count Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
results.loc[len(results)] = ["NB, WordLevel TF-IDF", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
results.loc[len(results)] = ["NB, N-Gram Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
results.loc[len(results)] = ["NB, CharLevel Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("NB, CharLevel Vectors: ", accuracy)

# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
results.loc[len(results)] = ["RF, Count Vectors", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
results.loc[len(results)] = ["RF, WordLevel TF-IDF", accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print("RF, WordLevel TF-IDF: ", accuracy)




  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'recall', 'true', average, warn_for)


NB, Count Vectors:  [0.6484458458251491, 0.6835115537069618, 0.6484458458251491, datetime.timedelta(0, 2, 868769)]
NB, WordLevel TF-IDF:  [0.6312329852855026, 0.6750129621794235, 0.6312329852855026, datetime.timedelta(0, 2, 735714)]
NB, N-Gram Vectors:  [0.5231141665805162, 0.5905610434236662, 0.5231141665805162, datetime.timedelta(0, 2, 586742)]
NB, CharLevel Vectors:  [0.5912764051138909, 0.6307783152329384, 0.5912764051138909, datetime.timedelta(0, 4, 253625)]


KeyboardInterrupt: 