In [1]:
# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np
import random
import functools
import math
import sys
import pandas as pd

In [2]:
train = pd.read_csv('./en_train.csv', sep=',', encoding='utf8',
    dtype={'sentence_id': np.int32, 'token_id': np.int32, 'class': unicode, 'before': unicode, 'after': unicode})
print train.size
train.dropna(axis=0, how='any', inplace=True)
print train.size

49592205
49591950


In [3]:
train['before'] = train.before.astype(unicode)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9918390 entries, 0 to 9918440
Data columns (total 5 columns):
sentence_id    int32
token_id       int32
class          object
before         object
after          object
dtypes: int32(2), object(3)
memory usage: 378.4+ MB


In [5]:
tf.reset_default_graph()

class LabelClassifier:

    def __init__(self, data, num_hidden=64, seq_len=100, batch_size=1000, num_layers=3):
        self._data = data
        self._num_hidden = num_hidden
        self._seq_len = seq_len
        self._batch_size = batch_size
        self._num_layers = num_layers
        self._learning_rate = .003
        self._max_grad_norm = .5
        self._embedding_size = 32

        self._init_dict()
        self._init_classes()

        self._input = tf.placeholder(tf.float32, [None, self._seq_len, self._embedding_size], name='input')
        self._target = tf.placeholder(tf.float32, [None, self._num_classes], name='target')

        self._init_length()
        self._init_prediction()
        self._init_cost()
        self._init_error()
        self._init_optimize()

    def _init_dict(self):
        self._vocab_size = 0
        self._char_to_id = {}
        self._char_to_id[u'\u0000'] = 0
        for text in self._data['before']:
            for c in text:
                if c not in self._char_to_id:
                    id = len(self._char_to_id)
                    self._char_to_id[c] = id
        self._vocab_size = len(self._char_to_id)
        self._embeddings = np.random.random([self._vocab_size, self._embedding_size])
        print 'vocabulary size: ', self._vocab_size


    def _init_classes(self):
        self._class_to_one_hot = {}
        self._num_classes = 0
        class_list = self._data['class'].unique()
        class_one_hots = np.eye(len(class_list))
        for i in range(len(class_one_hots)):
            self._class_to_one_hot[class_list[i]] = class_one_hots[i,:]
        self._num_classes = len(class_one_hots)
        print 'number of classed: ', self._num_classes


    def _init_length(self):
        used = tf.sign(tf.reduce_max(tf.abs(self._input), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        self._length = tf.cast(length, tf.int32)


    def _init_prediction(self):
        # Dimensions
        self._max_length = int(self._input.get_shape()[1])
        batch_size = tf.shape(self._input)[0]

        # Recurrent network
        with tf.variable_scope('rnn'):
            cells = []
            for _ in range(self._num_layers):
                cells.append(tf.contrib.rnn.GRUCell(self._num_hidden))
            cell = tf.contrib.rnn.MultiRNNCell(cells)

            states = cell.zero_state(batch_size, tf.float32)
            state_type = type(states)
            self._initial_state = [
                tf.placeholder_with_default(zero_state, [None, self._num_hidden]) for zero_state in states]
            self._initial_state = state_type(self._initial_state)

            self._output, self._final_state = tf.nn.dynamic_rnn(cell, self._input,
                                                                dtype=tf.float32, sequence_length=self._length,
                                                                initial_state=self._initial_state)

        # Get relevant output
        index = tf.range(0, batch_size) * self._max_length + (self._length - 1)
        flat = tf.reshape(self._output, [-1, self._num_hidden])
        relevant = tf.gather(flat, index)

        # Softmax layer
        with tf.variable_scope('softmax'):
            weight = tf.get_variable('W', [self._num_hidden, self._num_classes])
            bias = tf.get_variable('b', [self._num_classes], initializer=tf.constant_initializer(0.1))

            self._logits = tf.matmul(relevant, weight) + bias
            self._prediction = tf.nn.softmax(self._logits)

        # Summarize weigths
        tf.summary.histogram("rnn_output", self._output)
        tf.summary.histogram("softmax_weights", weight)
        tf.summary.histogram("softmax_bias", bias)
        tf.summary.histogram("prediction", self._prediction)

        
    def _init_cost(self):
        with tf.variable_scope('cost'):
            self._cost = tf.losses.softmax_cross_entropy(self._target, self._logits)
        tf.summary.scalar('cost', self._cost)


    def _init_error(self):
        with tf.variable_scope('error'):
            mistakes = tf.not_equal(tf.argmax(self._target, 1), tf.argmax(self._prediction, 1))
            self._error = tf.reduce_mean(tf.cast(mistakes, dtype=tf.float32))

        tf.summary.scalar('error', self._error)

        
    def _init_optimize(self):
        tvars = tf.trainable_variables()
        grads = tf.gradients(self._cost, tvars)
        clip_grads, _ = tf.clip_by_global_norm(grads, self._max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self._learning_rate)

        self._optimize = optimizer.apply_gradients(zip(clip_grads, tvars))


    def _get_batch(self, data):
        sample = data.sample(n=self._batch_size)
        x = []
        y = []
        for i in range(len(sample)):
            label = sample.iloc[i]['class']
            y.append(self._class_to_one_hot[label])

            text = sample.iloc[i]['before']
            seq = [[0] * self._embedding_size for _ in range(self._seq_len)]
            index = 0
            if len(text) > self._seq_len:
                text = text[:self._seq_len]

            for c in text:
                if c not in self._char_to_id:
                    raise Exception('unknown symbol: ', c, ' in text: ', text)
                seq[index] = self._embeddings[self._char_to_id[c]]
                index += 1
            x.append(np.array(seq))
        return (np.array(x), np.array(y))
    
    
    def train(self, log_dir, model_dir, num_steps, train, validation):
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        train_writer = tf.summary.FileWriter(log_dir + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(log_dir + '/test', sess.graph)
        summaries = tf.summary.merge_all()

        train_writer.add_graph(sess.graph)
        saver = tf.train.Saver()

        error = None
        for step in range(num_steps):
            x, y = self._get_batch(train)
            _, s = sess.run([self._optimize, summaries], feed_dict={self._input: x, self._target: y})
            train_writer.add_summary(s, step)

            x, y = self._get_batch(validation)
            s, e = sess.run([summaries, self._error], feed_dict={self._input: x, self._target: y})
            test_writer.add_summary(s, step)
            sys.stdout.write('\r{0} {1}'.format(step, e))
            if error is None or error > e:
                saver.save(sess, model_dir + '/best.chkp')
                error = e
                print 'save best model for: ', error
            

In [6]:
!rm -rf '/tmp/label_classification'

model = LabelClassifier(train)
model.train('/tmp/label_classification', './label_classifier', 10000, train[:800000], train[800000:])

vocabulary size:  3082
number of classed:  16


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


0 0.254000008106save best model for:  0.254
2 0.233999997377save best model for:  0.234
12 0.232999995351save best model for:  0.233
24 0.0939999967813save best model for:  0.094
25 0.0920000001788save best model for:  0.092
26 0.0729999989271save best model for:  0.073
51 0.070000000298save best model for:  0.07
90 0.0680000036955save best model for:  0.068
91 0.0590000003576save best model for:  0.059
132 0.0500000007451save best model for:  0.05
138 0.0430000014603save best model for:  0.043
139 0.0359999984503save best model for:  0.036
144 0.0340000018477save best model for:  0.034
163 0.0320000015199save best model for:  0.032
188 0.0299999993294save best model for:  0.03
220 0.0260000005364save best model for:  0.026
245 0.0250000003725save best model for:  0.025
254 0.0240000002086save best model for:  0.024
258 0.0189999993891save best model for:  0.019
296 0.0179999992251save best model for:  0.018
311 0.0140000004321save best model for:  0.014
377 0.00800000037998save best m

In [18]:
print u'\u7a0b' 

程


In [22]:
print model._char_to_one_hot[u'\u7a0b']


KeyError: u'\u7a0b'

In [1]:
train.iloc[616100:616120]

NameError: name 'train' is not defined