In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


In [2]:
# Dimensionality of the word embeddings; the same value is used as the GRU size in rnn_model.
EMBEDDING_SIZE = 50
# Vocabulary size. Placeholder only: it is overwritten once the vocabulary has been built.
n_words = 0
# Depth of the one-hot label encoding and number of output logits.
# The dataset labels run 1..14, so 15 slots cover indices 0..14.
MAX_LABEL = 15
WORDS_FEATURE = 'words'      # Name of the input words feature

In [3]:
# Load the 'small' variant of the DBpedia dataset through tf.contrib.learn,
# asking for real data rather than the fake test fixture.
dbpedia = tf.contrib.learn.datasets.load_dataset(
    'dbpedia',
    size='small',
    test_with_fake_data=False,
)

In [4]:
# Number of examples in the training split.
len(dbpedia.train.data)

560

In [5]:
# Number of examples in the test split.
len(dbpedia.test.data)

70

In [6]:
# Fix the seed so the shuffled order is the same on every run.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(dbpedia.train.data)))

# Index inputs and labels with the same permutation so each pair stays aligned.
data_shuffled, target_shuffled = (
    dbpedia.train.data[shuffle_indices],
    dbpedia.train.target[shuffle_indices],
)

# The full shuffled split is used for training (the slice keeps every row).
train_data, train_target = data_shuffled[:], target_shuffled[:]

In [7]:
# No seed is set here: the permutation continues the global NumPy random
# stream, so it is reproducible only when the cells run in order.
shuffle_indices = np.random.permutation(np.arange(len(dbpedia.test.data)))

# Index inputs and labels with the same permutation so each pair stays aligned.
data_shuffled, target_shuffled = (
    dbpedia.test.data[shuffle_indices],
    dbpedia.test.target[shuffle_indices],
)

# The full shuffled split is used for testing (the slice keeps every row).
test_data, test_target = data_shuffled[:], target_shuffled[:]

In [8]:
# Peek at the first five shuffled training rows; each row is a [title, text] pair.
train_data[:5]

array([['Alina Orlova',
        ' Alina Orlova (lithuanian. Alina Orlovskaja polish Alina Orłowska born June 28 1988) is a Lithuanian sung poetry singer and musician. She is of mixed Polish-Russian heritage.'],
       ['HMS Carysfort (1836)',
        ' HMS Carysfort was a sixth-rate sailing frigate of the Royal Navy launched in 1836 and named for the Earl of Carysfort who had been a former (civilian) Lord of the Admiralty. Her captain Lord George Paulet occupied the Hawaiian Islands for five months in 1843. She was decommissioned in 1847 and finally broken up in 1861.'],
       ['Chinspot Batis',
        ' The chinspot batis (Batis molitor) is a species of bird in the Platysteiridae family.It is found in Angola Botswana Burundi Republic of the Congo Democratic Republic of the Congo Gabon Kenya Lesotho Malawi Mozambique Namibia Rwanda South Africa Sudan Swaziland Tanzania Uganda Zambia and Zimbabwe.Its natural habitats are subtropical or tropical dry forests subtropical or tropical mois

In [9]:
# Distinct class labels present in the training targets.
np.unique(dbpedia.train.target)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int32)

In [10]:
# Column 1 of each row is the article text (column 0 is the title);
# only the text is used as model input.
x_train = pd.Series(train_data[:, 1])
y_train = pd.Series(train_target)

x_test = pd.Series(test_data[:, 1])
y_test = pd.Series(test_target)

In [11]:
def _longest_split_length(documents):
    """Return the largest number of single-space-separated pieces in any document."""
    return max(len(doc.split(" ")) for doc in documents)


# Longest document in each split, measured in space-separated pieces.
max_document_length_train = _longest_split_length(x_train)
max_document_length_test = _longest_split_length(x_test)

print(max_document_length_train, max_document_length_test)

117 83


In [12]:
# Sequence length handed to the vocabulary processor: the longest document
# found in either split.
MAX_DOCUMENT_LENGTH = max(max_document_length_train, 
                          max_document_length_test)

In [13]:
# Turns each document into a sequence of word ids of length MAX_DOCUMENT_LENGTH.
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
                    MAX_DOCUMENT_LENGTH)

In [14]:
# Build the vocabulary from the training text only, then encode both splits
# with that same vocabulary. The test split goes through transform(), not
# fit_transform(): re-fitting on held-out text leaks test data into
# preprocessing and risks ids that differ from those the model is trained on.
x_train = np.array(list(vocab_processor.fit_transform(x_train)))
x_test = np.array(list(vocab_processor.transform(x_test)))

In [15]:
# First three encoded training documents: word ids, zero-padded on the right.
x_train[:3]

array([[  1,   2,   3,   1,   4,   5,   1,   6,   7,   8,   9,  10,  11,
         12,  13,  14,  15,  16,  17,  18,  19,  11,  20,  21,  22,  23,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 24,  25,  26,  12,  27,  28,  29,  20,  30,  31,  32,  33,  34,
         35,  17,  36,  37,  30,  38,  20,  25,  39,  40,  41,  12,  42,
         43,  44,  20,  30,  45,  46,  47,  44,  48,  49,  50,  30,  51,
         52,  37,  53,  54,  34,  55,  19,  26,  56,  34,  57,  17,  58,
         59,  60,  34,  61,   0,   0,   0,   0,   

In [16]:
# The first three training rows as raw text, for comparison with their encoded ids.
train_data[:3]

array([['Alina Orlova',
        ' Alina Orlova (lithuanian. Alina Orlovskaja polish Alina Orłowska born June 28 1988) is a Lithuanian sung poetry singer and musician. She is of mixed Polish-Russian heritage.'],
       ['HMS Carysfort (1836)',
        ' HMS Carysfort was a sixth-rate sailing frigate of the Royal Navy launched in 1836 and named for the Earl of Carysfort who had been a former (civilian) Lord of the Admiralty. Her captain Lord George Paulet occupied the Hawaiian Islands for five months in 1843. She was decommissioned in 1847 and finally broken up in 1861.'],
       ['Chinspot Batis',
        ' The chinspot batis (Batis molitor) is a species of bird in the Platysteiridae family.It is found in Angola Botswana Burundi Republic of the Congo Democratic Republic of the Congo Gabon Kenya Lesotho Malawi Mozambique Namibia Rwanda South Africa Sudan Swaziland Tanzania Uganda Zambia and Zimbabwe.Its natural habitats are subtropical or tropical dry forests subtropical or tropical mois

In [17]:
# Size of the fitted vocabulary; rnn_model passes it as vocab_size to the embedding layer.
n_words = len(vocab_processor.vocabulary_)

print("Total words : ", n_words)

Total words :  7552


In [18]:
# Clear the default TensorFlow graph before the model functions are defined.
tf.reset_default_graph()

In [19]:
def estimator_spec_for_softmax_classification(logits, labels, mode):
    """Build the EstimatorSpec of a softmax classifier for the given mode.

    Args:
        logits: Unnormalised class scores; argmax and softmax are taken over axis 1.
        labels: Integer class ids. Not used in PREDICT mode.
        mode: A tf.estimator.ModeKeys value.

    Returns:
        A tf.estimator.EstimatorSpec holding predictions (PREDICT), loss and
        train_op (TRAIN), or loss and an accuracy metric (any other mode).
    """
    top_class = tf.argmax(logits, 1)

    # PREDICT: return before anything touches the labels.
    if mode == tf.estimator.ModeKeys.PREDICT:
        outputs = {
            'class': top_class,
            'prob': tf.nn.softmax(logits),
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=outputs)

    # The loss is shared by the TRAIN and evaluation branches below.
    loss = tf.losses.softmax_cross_entropy(
        onehot_labels=tf.one_hot(labels, MAX_LABEL, 1, 0),
        logits=logits)

    # TRAIN: one Adam update per step, advancing the global step counter.
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(
            loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    # Evaluation: report accuracy of the arg-max class against the labels.
    accuracy = tf.metrics.accuracy(labels=labels, predictions=top_class)
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, eval_metric_ops={'accuracy': accuracy})

In [20]:
def rnn_model(features, labels, mode):
    """Estimator model_fn: embed word ids, encode them with a GRU, classify.

    Args:
        features: Mapping holding the word-id input under the WORDS_FEATURE key.
        labels: Passed through unchanged to the spec builder.
        mode: A tf.estimator.ModeKeys value, passed through unchanged.

    Returns:
        The EstimatorSpec produced by estimator_spec_for_softmax_classification.
    """
    embedded = tf.contrib.layers.embed_sequence(
        features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)

    # Split along axis 1 into a list of per-position tensors for static_rnn.
    time_steps = tf.unstack(embedded, axis=1)

    gru = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE)
    # Only the final state is kept; the per-step outputs are discarded.
    _, final_state = tf.contrib.rnn.static_rnn(gru, time_steps, dtype=tf.float32)

    # Linear projection of the final state to one logit per label slot.
    return estimator_spec_for_softmax_classification(
        logits=tf.layers.dense(final_state, MAX_LABEL, activation=None),
        labels=labels,
        mode=mode)

In [21]:
# Wrap rnn_model in an Estimator; no model_dir is passed here.
# NOTE(review): "clasifier" is a misspelling of "classifier"; the name is kept
# because the training and prediction cells refer to it.
clasifier = tf.estimator.Estimator(model_fn=rnn_model)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpyw3n7tje', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe46134c128>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [29]:
# Full-batch training input: batch_size equals the number of training rows.
# NOTE(review): with batch_size == len(x_train) and num_epochs = 10 the input
# presumably runs out after 10 batches, so training would stop well before
# steps = 100 (the logged run goes from step 101 to 110). Confirm whether
# num_epochs=None was intended.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x = {WORDS_FEATURE : x_train},
                    y = y_train,
                    batch_size = len(x_train),
                    num_epochs = 10,
                    shuffle = True
)

clasifier.train(input_fn=train_input_fn, steps = 100)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpyw3n7tje/model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 101 into /tmp/tmpyw3n7tje/model.ckpt.
INFO:tensorflow:loss = 0.007276279, step = 101
INFO:tensorflow:Saving checkpoints for 110 into /tmp/tmpyw3n7tje/model.ckpt.
INFO:tensorflow:Loss for final step: 0.005432175.


<tensorflow.python.estimator.estimator.Estimator at 0x7fe46134c8d0>

In [23]:
# One unshuffled pass over the test set, so predictions keep the order of y_test.
test_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x = {WORDS_FEATURE : x_test},
                    y = y_test,
                    num_epochs = 1,
                    shuffle = False
)

# NOTE(review): predict() appears to be lazy; the model log output only shows up
# when the result is iterated.
predict = clasifier.predict(input_fn=test_input_fn)

In [24]:
# Consume the prediction iterator, keeping only the predicted class of each example.
y_predict = np.array([prediction['class'] for prediction in predict])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpyw3n7tje/model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [25]:
# Give the predictions the same shape as y_test so the two compare element-wise.
y_predict = y_predict.reshape(np.array(y_test).shape)

In [26]:
# Predicted class id for each test example.
y_predict

array([ 2,  8,  4,  1,  3,  7,  7,  6, 11,  8,  5,  8, 11,  8,  9, 12, 11,
       11, 14,  3,  2, 11,  7,  9, 10,  3,  1,  5, 12,  9, 12,  3,  3,  1,
        5,  1,  1,  5, 11,  7, 11,  9,  7,  8, 12,  1,  6,  1,  6,  4,  7,
        4, 11, 11, 11, 11,  1,  5, 12,  1,  1, 12, 12, 11, 10,  3,  5,  8,
        2, 12])

In [27]:
from sklearn import metrics

In [28]:
# Accuracy of the predictions against the true test labels, printed as a percentage.
score = metrics.accuracy_score(y_test, y_predict)

print('Accuracy (sklearn) : {0:f}'.format(score * 100))

Accuracy (sklearn) : 41.428571
