In [2]:
import numpy as np
import pandas
from sklearn import metrics
import tensorflow as tf
import tensorflow.contrib.learn as learn
from tensorflow.contrib import rnn

# Sentences Example
Demonstrates using an RNN (with a GRU cell) to classify sentences into categories.

# Data

## Retrieval
Download the DBpedia dataset, which provides the sentences to classify.

In [None]:
# An empty `size` string asks for the whole dataset to be downloaded.
dbpedia = learn.datasets.load_dataset(
    'dbpedia',
    size='',
)

Now, prepare the training and testing data, and their corresponding labels (targets).

In [3]:
# For each split, column 1 of the data becomes the model input and the
# split's `target` values become the matching labels.
x_train, y_train = (
    pandas.DataFrame(dbpedia.train.data)[1],
    pandas.Series(dbpedia.train.target),
)

x_test, y_test = (
    pandas.DataFrame(dbpedia.test.data)[1],
    pandas.Series(dbpedia.test.target),
)

NameError: name 'dbpedia' is not defined

## Representing Words
- Words are represented as integer ids.
- Each word has a unique id.
- `MAX_SENTENCE_LENGTH` is required as each sentence must be the same length.
    - Sentences that are too short are padded.
    - Sentences that are too long are trimmed.

In [None]:
# Every sentence is padded or trimmed to this many word ids.
MAX_SENTENCE_LENGTH = 10

vocab_processor = learn.preprocessing.VocabularyProcessor(
    MAX_SENTENCE_LENGTH
)

# NOTE: x_train / x_test are rebound here from the raw sentences to arrays
# of word ids, so this cell must not be re-run without first re-running the
# cell that builds the raw splits.
x_train = np.array([ids for ids in vocab_processor.fit_transform(x_train)])
x_test = np.array([ids for ids in vocab_processor.transform(x_test)])

# Vocabulary size; used later to size the embedding matrix.
n_words = len(vocab_processor.vocabulary_)

print('Total words: %d' % (n_words,))

# Network

In [1]:
# Width of each word embedding; also used as the GRU hidden size.
EMBEDDING_SIZE = 50

# Depth of the one-hot label encoding and width of the output layer. The two
# must always agree, so the value is defined once instead of being repeated
# as a literal.
N_CLASSES = 15


def rnn_model(features, target):
    """
    RNN model to predict from sequence of words to a class.

    Intended to be passed as `model_fn` to `learn.Estimator`.

    Args:
        features: Tensor of word indexes for a batch of sentences
            (presumably [batch_size, MAX_SENTENCE_LENGTH] -- confirm
            against the preprocessing cell).
        target: Tensor of integer class ids; one-hot encoded here to depth
            N_CLASSES.

    Returns:
        A `(predictions, loss, train_op)` tuple, where `predictions` is a
        dict with the predicted class index under 'class' and the softmax
        probabilities under 'prob'.
    """
    # Convert indexes of words into embeddings.
    # This creates an embeddings matrix of [n_words, EMBEDDING_SIZE] and
    # then maps word indexes of the sequence into
    # [batch_size, sequence_length, EMBEDDING_SIZE].
    word_vectors = tf.contrib.layers.embed_sequence(
        features,
        vocab_size=n_words,
        embed_dim=EMBEDDING_SIZE,
        scope='words')

    # Split into a list of embeddings, one per word position, removing the
    # sequence-length dimension. word_list is a list of tensors of shape
    # [batch_size, EMBEDDING_SIZE].
    word_list = tf.unstack(word_vectors, axis=1)

    # Create a Gated Recurrent Unit cell with hidden size of EMBEDDING_SIZE.
    cell = rnn.GRUCell(EMBEDDING_SIZE)

    # Create an unrolled recurrent network with one step per entry of
    # word_list (i.e. per word position) and feed word_list as the inputs.
    # Only the final state is kept; the per-step outputs are discarded.
    _, encoding = rnn.static_rnn(cell, word_list, dtype=tf.float32)

    # Pass the final RNN state through a fully connected layer with no
    # activation to obtain one logit per class, and compare against the
    # one-hot encoded targets.
    target = tf.one_hot(target, N_CLASSES, 1, 0)
    logits = tf.contrib.layers.fully_connected(
        encoding, N_CLASSES, activation_fn=None)
    loss = tf.contrib.losses.softmax_cross_entropy(logits, target)

    # Create a training op.
    train_op = tf.contrib.layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=0.01,
        clip_gradients=1.0)

    predictions = {
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits),
    }
    return predictions, loss, train_op

In [2]:
# Wrap the model function in an Estimator.
classifier = learn.Estimator(model_fn=rnn_model)

# Train on the training split.
classifier.fit(x_train, y_train, steps=10000)

# Collect the predicted class for every test sentence.
y_predicted = [
    prediction['class']
    for prediction in classifier.predict(x_test, as_iterable=True)
]

# Compare predictions against the held-out labels.
score = metrics.accuracy_score(y_test, y_predicted)

print('Accuracy: {0:f}'.format(score))

NameError: name 'learn' is not defined