In [1]:
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import  TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train', data_home='../Data/')
newsgroups_test = fetch_20newsgroups(subset='test', data_home='../Data/')

vectortype_train = TfidfVectorizer(stop_words='english')
vectortype_test = TfidfVectorizer(stop_words='english')

news_vectored_result_train = vectortype_train.fit_transform(newsgroups_train.data)
news_vectored_result_test = vectortype_test.fit_transform(newsgroups_test.data)

news_train_target = newsgroups_train.target
news_train_target_names = newsgroups_train.target_names
news_test_target = newsgroups_test.target


train_features = vectortype_train.get_feature_names()
test_features = vectortype_test.get_feature_names()
common_features = np.intersect1d(train_features, test_features)

train_feature_final = np.searchsorted(train_features, common_features)
news_vt_train = news_vectored_result_train[:,train_feature_final]

test_feature_final = np.searchsorted(test_features, common_features)
news_vt_test = news_vectored_result_test[:,test_feature_final]

news_vt_train = np.array(news_vt_train.todense())
news_vt_test = np.array(news_vt_test.todense())

In [3]:
col_size = news_vt_train.shape[1]
class_size = len(np.unique(news_train_target))
print(col_size, class_size)

49600 20


In [4]:
learning_rate = 0.1
num_steps = 300
batch_size = 128
display_step = 30

# Network Parameters
n_hidden_1 = 300 # 1st layer number of neurons
n_hidden_2 = 300 # 2nd layer number of neurons
num_input = col_size # MNIST data input (img shape: 28*28)
num_classes = class_size # MNIST total classes (0-9 digits)

# tf Graph input
X = tf.placeholder("float", [None, num_input])
Y = tf.placeholder("float", [None, num_classes])

weights = {
'h1': tf.Variable(tf.random_normal([num_input, n_hidden_1])),
'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
'out': tf.Variable(tf.random_normal([n_hidden_2, num_classes]))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1])),
'b2': tf.Variable(tf.random_normal([n_hidden_2])),
'out': tf.Variable(tf.random_normal([num_classes]))
}

In [5]:
def neural_net(x):
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
    return out_layer

In [8]:
# Construct model
logits = neural_net(X)

# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [9]:
def shuffle(matrix1, matrix2):
    index = np.arange(np.shape(matrix1)[0])
    np.random.shuffle(index)
    return matrix1[index, :], matrix2[index]

In [10]:
def getBatch(data, old, batch_size):
        if old+batch_size <= data.shape[0]:
            batch_x = data[old:old+batch_size]
            y = target[old:old+batch_size]
            #print(batch_x.shape, old, old + batch_size)
            old += batch_size
            batch_y = []
            for item in y:
                temp = np.zeros((20),dtype=float)
                temp[item] = 1
                batch_y.append(temp)
            return batch_x, np.array(batch_y), old
        else:
            batch_x = data[old:old+batch_size]
            y = target[old:old+batch_size]
            t_batch_size = batch_size - batch_x.shape[0]
            old = 0
            new_batch_x = data[old:t_batch_size]
            new_y = target[old:t_batch_size]
            batch_x = np.concatenate((batch_x, new_batch_x), axis=0)
            y = np.concatenate((y, new_y), axis=0)
            #print('\n\n', batch_x.shape, old, old +t_batch_size)
            old += t_batch_size
            batch_y = []
            for item in y:
                temp = np.zeros((20),dtype=float)
                temp[item] = 1
                batch_y.append(temp)
            return batch_x, np.array(batch_y), old

In [11]:
data, target = shuffle(news_vt_train, news_train_target)

In [12]:
# Start training
with tf.Session() as sess:
    # Run the initializer
    print('A')
    sess.run(init)
    print('B')
    old=0
    for step in range(1, num_steps+1):
        batch_x, batch_y, old = getBatch(data, old, batch_size)
        # Run optimization op (backprop)
        sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
        if step % display_step == 0 or step == 1:
            # Calculate batch loss and accuracy
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: batch_x,
                                                                 Y: batch_y})
            print("Step " + str(step) + ", Minibatch Loss= " + \
                  "{:.4f}".format(loss) + ", Training Accuracy= " + \
                  "{:.3f}".format(acc))

    print("Optimization Finished!")
    # Calculate accuracy for MNIST test images
    batch_y = []
    for item in news_test_target:
        temp = np.zeros((20),dtype=float)
        temp[item] = 1
        batch_y.append(temp)
    test_target_batch = np.array(batch_y)
    print("Testing Accuracy:", 
        sess.run(accuracy, feed_dict={X: news_vt_test,
                                      Y: test_target_batch}))

A
B
Step 1, Minibatch Loss= 736.3987, Training Accuracy= 0.570
Step 30, Minibatch Loss= 86.0449, Training Accuracy= 0.906
Step 60, Minibatch Loss= 35.8812, Training Accuracy= 0.977
Step 90, Minibatch Loss= 7.6157, Training Accuracy= 0.984
Step 120, Minibatch Loss= 0.0000, Training Accuracy= 1.000
Step 150, Minibatch Loss= 0.5971, Training Accuracy= 0.992
Step 180, Minibatch Loss= 2.5590, Training Accuracy= 0.992
Step 210, Minibatch Loss= 5.4527, Training Accuracy= 0.984
Step 240, Minibatch Loss= 11.5293, Training Accuracy= 0.992
Step 270, Minibatch Loss= 2.8895, Training Accuracy= 0.992
Step 300, Minibatch Loss= 5.2950, Training Accuracy= 0.984
Optimization Finished!
(128, 20)
(7532, 20)
Testing Accuracy: 0.746017
