In [1]:
import re
import numpy as np
import tensorflow as tf
from collections import Counter
from sklearn.datasets import fetch_20newsgroups

# Dataset load
categories = ["rec.motorcycles","sci.space","sci.electronics"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

# Remove all email`s
for i in range(len(newsgroups_train.data)):
    newsgroups_train.data[i] = re.sub(r'\w+@\w+.\w+.*\w+','',newsgroups_train.data[i])
for i in range(len(newsgroups_test.data)):
    newsgroups_test.data[i] = re.sub(r'\w+@\w+.\w+.*\w+','',newsgroups_test.data[i])
    
# Clear punctuation, remove all unread symbols
for i in range(len(newsgroups_train.data)):
    newsgroups_train.data[i] = re.sub(r'[^\w\s]','',newsgroups_train.data[i])   
for i in range(len(newsgroups_test.data)):
    newsgroups_test.data[i] = re.sub(r'[^\w\s]','',newsgroups_test.data[i])

# Remove \n \t \r
for i in range(len(newsgroups_train.data)):
    newsgroups_train.data[i] = re.sub(r'^|\n|\r|\t','',newsgroups_train.data[i])
for i in range(len(newsgroups_test.data)):
    newsgroups_test.data[i] = re.sub(r'^|\n|\r|\t','',newsgroups_test.data[i])

# Words counter [some word]:[number of inclusions]    
vocab = Counter()
for text in newsgroups_train.data + newsgroups_test.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1

total_words = len(vocab)
print('Total words in vocabulary:{}\n'.format(total_words))

# Map of words [some word]:[index]
word_map = {}
for i, word in enumerate(vocab):
    word_map[word.lower()] = i

# Возвращает массивы активированных рецепторов и их правильные результаты
# Размер слоя рецепторов равен количеству слов в словаре. Слово = рецептор.
# Значение рецептора - количество вхождений слова в текст
def get_batch(df, step, size):
    batches = []
    results = []
    texts = df.data[step * size: step * size + size]
    categories = df.target[step * size: step * size + size]
    for text in texts:
        layer = np.zeros(total_words,dtype=float)
        for word in text.split(' '):
            layer[word_map[word.lower()]] = 1           
        batches.append(layer)

    for category in categories:
        y = np.zeros((3),dtype=float)
        if category == 0:
            y[0] = 1.
        elif category == 1:
            y[1] = 1.
        else:
            y[2] = 1.
        results.append(y)
    return np.array(batches),np.array(results)


Total words in vocabulary:63518



In [3]:
# Parameters
learning_rate = 0.01
training_epochs = 10
batch_size = 150
display_step = True

# Network Parameters
n_hidden_1 = 60      # 1st layer number of features
n_hidden_2 = 20      # 2nd layer number of features
n_input = total_words # Words in vocab
n_classes = 3         # Number of categories

input_tensor = tf.placeholder(tf.float32,[None, n_input],name="input")
output_tensor = tf.placeholder(tf.float32,[None, n_classes],name="output") 

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1], stddev=0.01)),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2], stddev=0.1)),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes], stddev=0.1))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1], stddev=0.1)),
    'b2': tf.Variable(tf.random_normal([n_hidden_2], stddev=0.1)),
    'out': tf.Variable(tf.random_normal([n_classes], stddev=0.1))
}

# Construct model
layer_1_multiplication = tf.matmul(input_tensor, weights['h1'])
layer_1_addition = tf.add(layer_1_multiplication, biases['b1'])
layer_1 = tf.nn.relu(layer_1_addition)
    
# Hidden layer with RELU activation
layer_2_multiplication = tf.matmul(layer_1, weights['h2'])
layer_2_addition = tf.add(layer_2_multiplication, biases['b2'])
layer_2 = tf.nn.relu(layer_2_addition)
    
# Output layer 
out_layer_multiplication = tf.matmul(layer_2, weights['out'])
out_layer_addition = out_layer_multiplication + biases['out']

prediction = out_layer_addition

# Define loss and optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=output_tensor))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(len(newsgroups_train.data)/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_x,batch_y = get_batch(newsgroups_train,i,batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            c,_ = sess.run([loss,optimizer], feed_dict={input_tensor: batch_x,output_tensor:batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if display_step == True:
            print("Epoch:", '%04d' % (epoch+1), "loss=", \
                "{:.9f}".format(avg_cost))
    print("Optimization Finished!")

    # Test model
    correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(output_tensor, 1))
    
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    total_test_data = len(newsgroups_test.target)
    batch_x_test,batch_y_test = get_batch(newsgroups_test,0,total_test_data)
    print("Accuracy:", accuracy.eval({input_tensor: batch_x_test, output_tensor: batch_y_test}))
    


Epoch: 0001 loss= 0.591504782
Epoch: 0002 loss= 0.008712730
Epoch: 0003 loss= 0.000217346
Epoch: 0004 loss= 0.000028637
Epoch: 0005 loss= 0.000015331
Epoch: 0006 loss= 0.000011890
Epoch: 0007 loss= 0.000010453
Epoch: 0008 loss= 0.000009639
Epoch: 0009 loss= 0.000009036
Epoch: 0010 loss= 0.000008523
Optimization Finished!
Accuracy: 0.948523
