In [2]:
# use natural language toolkit
import nltk
from nltk.stem.lancaster import LancasterStemmer
import os
import json
import datetime
import numpy as np
import re
import itertools
from collections import Counter
import time
import datetime
import pickle

import tensorflow as tf
import numpy as np
stemmer = LancasterStemmer()

In [7]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(root_data_folder, saved_file):
    """
    Loads 20news group dataset data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """

    # If file is saved then just load the file
    if os.path.isfile(saved_file):
        x_text, y, y_label = load_data(saved_file)
        return [x_text, y, y_label]

    else:
        # Load data from files
        x_text = []
        y_label = []
        y_textual_label = []
        counter = 0
        for folder_name in os.listdir(root_data_folder):
            if not folder_name.startswith('.'):
                for file_name in os.listdir(os.path.join(root_data_folder, folder_name)):

                    examples = open(os.path.join(root_data_folder, folder_name, file_name),
                                    mode='r', encoding='utf-8', errors='ignore').read().strip()

                    # Split by words
                    x_text.append(clean_str(examples))
                    label = [0] * 20
                    label[counter] = 1
                    y_label.append(label)
                    y_textual_label.append(folder_name)
                counter += 1

        y = np.concatenate([y_label], 0)
        save_data([x_text, y, y_textual_label], saved_file)
        return [x_text, y, y_textual_label]


def load_data(file_name):
    with open(os.path.abspath(file_name), 'rb') as f:
        x_text, y, y_label = pickle.load(f)
        return [x_text, y, y_label]


def save_data(data, file_name):
    with open(os.path.abspath(file_name), 'wb') as f:
        pickle.dump(data, f)

In [9]:
# Load data
root_data_folder = '../../data/20news-18828'
saving_data_file = '../../data/preloaded/20news_18828.dt'
saving_words_data_file = '../../data/preloaded/20news_all_words.dt'
x_text, y, y_label = load_data_and_labels(root_data_folder, saving_data_file)

words = []
ignore_words = ['?']

# If file is saved then just load the file
if os.path.isfile(saving_words_data_file):
    with open(os.path.abspath(saving_words_data_file), 'rb') as f:
        words = pickle.load(f)

else:
    # loop through each documents in our training data
    for text, y_, label in zip(x_text, y, y_label):
        # tokenize each word in the document
        w = nltk.word_tokenize(text)
        # add to our words list
        words.extend(w)

    # stem and lower each word and remove duplicates
    words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
    words = list(set(words))
    with open(os.path.abspath(saving_words_data_file), 'wb') as f:
        pickle.dump(words, f)

classes = []
documents = []

# loop through each documents in our training data
for text, label in zip(x_text, y_label):

    # add to documents in our corpus
    documents.append((text, label))
    # add to our classes list
    if label not in classes:
        classes.append(label)


# remove duplicates
classes = list(set(classes))

#
print(len(documents), " documents")
print(len(classes), " classes", classes)
print(len(words), " unique stemmed words")

18828  documents
20  classes ['rec.sport.baseball', 'sci.crypt', 'comp.os.ms-windows.misc', 'rec.autos', 'talk.politics.mideast', 'misc.forsale', 'sci.electronics', 'talk.politics.misc', 'comp.sys.ibm.pc.hardware', 'soc.religion.christian', 'comp.graphics', 'talk.politics.guns', 'sci.space', 'comp.windows.x', 'rec.sport.hockey', 'sci.med', 'comp.sys.mac.hardware', 'talk.religion.misc', 'alt.atheism', 'rec.motorcycles']
137817  unique stemmed words


In [None]:

# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)

# training set, bag of words for each document
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    # stem each word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # create our bag of words array
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    training.append(bag)
    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)

print ("# words", len(words))
print ("# classes", len(classes))

In [6]:
# sample training/output
i = 0
w = documents[i][0]
print ([stemmer.stem(word.lower()) for word in w])
print (training[i])
print (output[i])

['how', 'ar', 'you', '?']
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0]


In [7]:
import numpy as np
import time

# compute sigmoid nonlinearity
def sigmoid(x):
    output = 1/(1+np.exp(-x))
    return output

# convert output of sigmoid function to its derivative
def sigmoid_output_to_derivative(output):
    return output*(1-output)
 
def clean_up_sentence(sentence):
    # tokenize the pattern
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=False):
    # tokenize the pattern
    sentence_words = clean_up_sentence(sentence)
    # bag of words
    bag = [0]*len(words)  
    for s in sentence_words:
        for i,w in enumerate(words):
            if w == s: 
                bag[i] = 1
                if show_details:
                    print ("found in bag: %s" % w)

    return(np.array(bag))

def think(sentence, show_details=False):
    x = bow(sentence.lower(), words, show_details)
    if show_details:
        print ("sentence:", sentence, "\n bow:", x)
    # input layer is our bag of words
    l0 = x
    # matrix multiplication of input and hidden layer
    l1 = sigmoid(np.dot(l0, synapse_0))
    # output layer
    l2 = sigmoid(np.dot(l1, synapse_1))
    return l2


In [8]:
# ANN and Gradient Descent code from https://iamtrask.github.io//2015/07/27/python-network-part2/
def train(X, y, hidden_neurons=10, alpha=1, epochs=50000, dropout=False, dropout_percent=0.5):

    print ("Training with %s neurons, alpha:%s, dropout:%s %s" % (hidden_neurons, str(alpha), dropout, dropout_percent if dropout else '') )
    print ("Input matrix: %sx%s    Output matrix: %sx%s" % (len(X),len(X[0]),1, len(classes)) )
    np.random.seed(1)

    last_mean_error = 1
    # randomly initialize our weights with mean 0
    synapse_0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1
    synapse_1 = 2*np.random.random((hidden_neurons, len(classes))) - 1

    prev_synapse_0_weight_update = np.zeros_like(synapse_0)
    prev_synapse_1_weight_update = np.zeros_like(synapse_1)

    synapse_0_direction_count = np.zeros_like(synapse_0)
    synapse_1_direction_count = np.zeros_like(synapse_1)
        
    for j in iter(range(epochs+1)):

        # Feed forward through layers 0, 1, and 2
        layer_0 = X
        layer_1 = sigmoid(np.dot(layer_0, synapse_0))
                
        if(dropout):
            layer_1 *= np.random.binomial([np.ones((len(X),hidden_neurons))],1-dropout_percent)[0] * (1.0/(1-dropout_percent))

        layer_2 = sigmoid(np.dot(layer_1, synapse_1))

        # how much did we miss the target value?
        layer_2_error = y - layer_2

        if (j% 10000) == 0 and j > 5000:
            # if this 10k iteration's error is greater than the last iteration, break out
            if np.mean(np.abs(layer_2_error)) < last_mean_error:
                print ("delta after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_2_error))) )
                last_mean_error = np.mean(np.abs(layer_2_error))
            else:
                print ("break:", np.mean(np.abs(layer_2_error)), ">", last_mean_error )
                break
                
        # in what direction is the target value?
        # were we really sure? if so, don't change too much.
        layer_2_delta = layer_2_error * sigmoid_output_to_derivative(layer_2)

        # how much did each l1 value contribute to the l2 error (according to the weights)?
        layer_1_error = layer_2_delta.dot(synapse_1.T)

        # in what direction is the target l1?
        # were we really sure? if so, don't change too much.
        layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1)
        
        synapse_1_weight_update = (layer_1.T.dot(layer_2_delta))
        synapse_0_weight_update = (layer_0.T.dot(layer_1_delta))
        
        if(j > 0):
            synapse_0_direction_count += np.abs(((synapse_0_weight_update > 0)+0) - ((prev_synapse_0_weight_update > 0) + 0))
            synapse_1_direction_count += np.abs(((synapse_1_weight_update > 0)+0) - ((prev_synapse_1_weight_update > 0) + 0))        
        
        synapse_1 += alpha * synapse_1_weight_update
        synapse_0 += alpha * synapse_0_weight_update
        
        prev_synapse_0_weight_update = synapse_0_weight_update
        prev_synapse_1_weight_update = synapse_1_weight_update

    now = datetime.datetime.now()

    # persist synapses
    synapse = {'synapse0': synapse_0.tolist(), 'synapse1': synapse_1.tolist(),
               'datetime': now.strftime("%Y-%m-%d %H:%M"),
               'words': words,
               'classes': classes
              }
    synapse_file = "synapses.json"

    with open(synapse_file, 'w') as outfile:
        json.dump(synapse, outfile, indent=4, sort_keys=True)
    print ("saved synapses to:", synapse_file)

In [9]:
X = np.array(training)
y = np.array(output)

start_time = time.time()

train(X, y, hidden_neurons=20, alpha=0.1, epochs=100000, dropout=False, dropout_percent=0.2)

elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")

Training with 20 neurons, alpha:0.1, dropout:False 
Input matrix: 12x26    Output matrix: 1x3
delta after 10000 iterations:0.00666786086319
delta after 20000 iterations:0.0045541143638
delta after 30000 iterations:0.00365376837861
delta after 40000 iterations:0.00312820414223
delta after 50000 iterations:0.00277455420548
delta after 60000 iterations:0.00251621748853
delta after 70000 iterations:0.00231708959218
delta after 80000 iterations:0.00215766231098
delta after 90000 iterations:0.00202636101423
delta after 100000 iterations:0.00191583201544
saved synapses to: synapses.json
processing time: 6.587924957275391 seconds


In [16]:
# probability threshold
ERROR_THRESHOLD = 0.2
# load our calculated synapse values
synapse_file = 'synapses.json' 
with open(synapse_file) as data_file: 
    synapse = json.load(data_file) 
    synapse_0 = np.asarray(synapse['synapse0']) 
    synapse_1 = np.asarray(synapse['synapse1'])

def classify(sentence, show_details=False):
    results = think(sentence, show_details)

    results = [[i,r] for i,r in enumerate(results) if r>ERROR_THRESHOLD ] 
    results.sort(key=lambda x: x[1], reverse=True) 
    return_results =[[classes[r[0]],r[1]] for r in results]
    print ("%s \n classification: %s" % (sentence, return_results))
    return return_results

classify("sudo make me a sandwich")
classify("how are you today?")
classify("talk to you tomorrow")
classify("who are you?")
classify("make me some lunch")
print ()
classify("how was your lunch?", show_details=True)

[[1, 0.99819938892600879]]
[[1, 0.99819938892600879]]
sudo make me a sandwich 
 classification: [['sandwich', 0.99819938892600879]]
[[0, 0.99814068207517803]]
[[0, 0.99814068207517803]]
how are you today? 
 classification: [['greeting', 0.99814068207517803]]
[[2, 0.98867670802444763]]
[[2, 0.98867670802444763]]
talk to you tomorrow 
 classification: [['goodbye', 0.98867670802444763]]
[[0, 0.89982215671891475]]
[[0, 0.89982215671891475]]
who are you? 
 classification: [['greeting', 0.89982215671891475]]
[[1, 0.97520886241661375]]
[[1, 0.97520886241661375]]
make me some lunch 
 classification: [['sandwich', 0.97520886241661375]]

found in bag: how
found in bag: yo
found in bag: lunch
sentence: how was your lunch? 
 bow: [0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
[[0, 0.92994974842438971], [1, 0.26270575871767921]]
[[0, 0.92994974842438971], [1, 0.26270575871767921]]
how was your lunch? 
 classification: [['greeting', 0.92994974842438971], ['sandwich', 0.26270575871767921]]


[['greeting', 0.92994974842438971], ['sandwich', 0.26270575871767921]]