In [116]:
import pickle
import numpy as np
import pandas as pd

In [117]:
# Load the two boolean label arrays and cast True/False to 1/0.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load label files that were produced by a trusted source.
# `with` closes each handle as soon as it has been read; the bare open() calls
# used before left both files open for the lifetime of the kernel.
with open('labeled_Data/labelsBOF.p', 'rb') as labels_file:
    BOFevent = np.array(pickle.load(labels_file)).astype(int)
with open('labeled_Data/labelsBOFCarpool.p', 'rb') as labels_file:
    BOFCarpool = np.array(pickle.load(labels_file)).astype(int)

The boolean labels are converted to 1s and 0s. Next we multiply the carpool labels by 2 so that they get their own class value, then add the two arrays together to get a single label array (0 = neither, 1 = BOF, 2 = BOF carpool) that will be one-hot encoded below.

In [118]:
# Combine both flags into one integer label per event:
# BOFevent contributes 1 and BOFCarpool contributes 2.
Y = BOFCarpool*2 + BOFevent
# Size the one-hot matrix from the largest label rather than from the number of
# distinct labels: if a class value is missing from the data (e.g. only 0 and 2
# occur), len(set(Y)) would be too small and indexing column Y[i] would fail.
Y_one_hot = np.zeros((len(Y), int(Y.max()) + 1 if len(Y) else 0))

In [124]:
# One-hot encode: for every row i, set the column given by label Y[i] to 1.
Y_one_hot[np.arange(len(Y)), Y] = 1

In [127]:
# Sanity check: the one-hot matrix followed by the integer labels it encodes.
print(str(Y_one_hot) + ' \n')
print(Y)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]] 

[0 1 2 ... 0 1 1]


Next we'll take the events that the labels in Y belong to and convert them to a bag-of-words representation. This representation fits here because the order of the words is not needed to classify an event.

In [63]:
# Load the cleaned event strings; events[i] is the text that label Y[i] refers to.
events = np.array(
    pd.read_pickle('labeled_Data/cleanedEvents.p')
)

In [130]:
# Word -> column-index vocabulary; index 0 is reserved for the 'UNK' token.
dictionary = dict(UNK=0)

In [131]:
# Peek at the first event to see how whitespace splitting tokenizes it
# (no lowercasing is applied in this preview).
events[0].split()

['Three', "King's", 'Day', 'Party']

In [132]:
# Assign every distinct lowercased word the next free column index.
# Start from the current vocabulary size instead of a hard-coded 1: on a fresh
# dictionary ({'UNK': 0}) this is the same value, but if the cell is re-run
# after `events` changed, new words no longer reuse indices already taken.
counter = len(dictionary)
for event in events:
    for word in event.lower().split():
        if word not in dictionary:
            dictionary[word] = counter
            counter += 1

In [133]:
len(dictionary)  # vocabulary size: distinct lowercased words plus the reserved 'UNK' entry

1855

In [134]:
len(events)  # number of events, i.e. the number of rows of the bag-of-words matrix

1163

In [135]:
# Integer count matrix: one row per event, one column per vocabulary entry.
bag_of_words = np.zeros((len(events), len(dictionary)), dtype=int)

In [136]:
# Count how often each vocabulary word occurs in each event.
# Reset first so that re-running this cell does not double the counts
# (the loop accumulates with +=).
bag_of_words.fill(0)
unk_index = dictionary['UNK']
for index, event in enumerate(events):
    for word in event.lower().split():
        # Words that are not in the vocabulary are counted in the reserved
        # 'UNK' column instead of raising a KeyError.
        word_index = dictionary.get(word, unk_index)
        bag_of_words[index, word_index] += 1

We now have our bag of words. We can construct a simple neural network to determine whether an event is irrelevant, BOF or BOFCarpool.

In [137]:
import tensorflow as tf
import keras 
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the events for testing. random_state pins the shuffle so the
# split, and therefore the scores reported further down, can be reproduced
# after a kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(
    bag_of_words, Y_one_hot, test_size=0.1, random_state=42
)

In [214]:
# Feed-forward classifier on raw word counts: two 500-unit ReLU layers, each
# followed by dropout (rate = fraction of units dropped), then a softmax with
# one output per label column.
model = Sequential([
    Dense(500, input_shape=(len(dictionary),)),
    Activation('relu'),
    Dropout(0.8),
    Dense(500),
    Activation('relu'),
    Dropout(0.8),
    Dense(Y_one_hot.shape[1]),
    Activation('softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_53 (Dense)             (None, 500)               928000    
_________________________________________________________________
activation_52 (Activation)   (None, 500)               0         
_________________________________________________________________
dropout_29 (Dropout)         (None, 500)               0         
_________________________________________________________________
dense_54 (Dense)             (None, 500)               250500    
_________________________________________________________________
activation_53 (Activation)   (None, 500)               0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 500)               0         
_________________________________________________________________
dense_55 (Dense)             (None, 3)                 1503      
__________

In [215]:
# Train on the bag-of-words counts, then report loss and accuracy on the
# held-out split (score is [loss, accuracy] as configured in compile()).
fit = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
)

score = model.evaluate(x=X_test, y=Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.04121581191181117
Test accuracy: 0.9914529919624329


For a quick neural network, the best I was able to get was 99.1% in 10 epochs. Not perfect, but pretty close! Next, to try to boost the performance, we can move to term frequency-inverse document frequency (TF-IDF). Words that are used a lot don't have as strong a significance as words that rarely appear.

In [201]:
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer

def tfidf(data):
    """Fit a TF-IDF transformer on ``data`` and return the re-weighted matrix.

    Parameters
    ----------
    data : count matrix to fit on (one row per document, one column per term).

    Returns
    -------
    tuple
        ``(data_tfidf, transformer)``: the result of ``fit_transform(data)``
        and the fitted ``TfidfTransformer``, which can be reused to
        ``transform`` other data with the same IDF weights.
    """
    transformer = TfidfTransformer()
    return transformer.fit_transform(data), transformer

In [202]:
# Fit the IDF weights on the training rows only, then apply those same weights
# to the test rows so that no statistics from the test split leak into training.
# (tfidf_vectorizer is the fitted TfidfTransformer returned by tfidf().)
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [237]:
# Same architecture as the bag-of-words network, with the dropout rate lowered
# to 0.75; this rebinds `model`, replacing the earlier network.
model = Sequential([
    Dense(500, input_shape=(len(dictionary),)),
    Activation('relu'),
    Dropout(0.75),
    Dense(500),
    Activation('relu'),
    Dropout(0.75),
    Dense(Y_one_hot.shape[1]),
    Activation('softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_83 (Dense)             (None, 500)               928000    
_________________________________________________________________
activation_82 (Activation)   (None, 500)               0         
_________________________________________________________________
dropout_49 (Dropout)         (None, 500)               0         
_________________________________________________________________
dense_84 (Dense)             (None, 500)               250500    
_________________________________________________________________
activation_83 (Activation)   (None, 500)               0         
_________________________________________________________________
dropout_50 (Dropout)         (None, 500)               0         
_________________________________________________________________
dense_85 (Dense)             (None, 3)                 1503      
__________

In [238]:
# Train on the TF-IDF features, then report loss and accuracy on the
# TF-IDF-transformed held-out split.
fit = model.fit(
    x=X_train_tfidf,
    y=Y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
)

score = model.evaluate(x=X_test_tfidf, y=Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.051234409467786804
Test accuracy: 0.9829059839248657


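Moving to TF-IDF space, we seem to max out accuracy at 98.3%. Note that with 1163 events and a 10% test split there are only about 117 test examples, so 98.3% vs. 99.1% is a difference of a single misclassified event (115 vs. 116 correct); on this data the two representations perform essentially the same.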