In [1]:
import vae

Using TensorFlow backend.


In [2]:
from keras.layers import Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [3]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
import numpy as np

In [4]:
# Training split of the 20 Newsgroups corpus (scikit-learn documents that the
# data is downloaded on first use and cached locally afterwards).
train = fetch_20newsgroups(subset='train')

In [5]:
# Held-out test split of the same corpus; used later as validation/evaluation data.
test = fetch_20newsgroups(subset='test')

In [6]:
# Inspect which fields the loaded dataset object exposes.
train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [7]:
# List the newsgroup names; the position of a name in this list is the integer
# class label stored in train['target'].
pprint(list(train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [8]:
# Peek at the first two raw posts; the text still contains the message headers
# (From/Subject/Organization/...), not just the body.
train['data'][:2]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [9]:
# Integer class labels of the same two posts.
train['target'][:2]

array([7, 4])

In [10]:
# Newsgroup name behind label 7 (the label of the first post).
train['target_names'][7]

'rec.autos'

In [11]:
# Newsgroup name behind label 4 (the label of the second post).
train['target_names'][4]

'comp.sys.mac.hardware'

In [12]:
# Largest class label; labels are 0-based, so 19 here means 20 classes.
train['target'].max()

19

In [13]:
class newsnet_vae(vae.vae):
    """`vae.vae` subclass that supplies a 20-output auxiliary prediction head.

    The head's output layer is named 'pred', which is the key used for the
    class-label targets when the model is evaluated.
    """

    def __init__(self, h):
        """Initialise the base model from the hyper-parameter object `h`."""
        # No extra state is needed here; everything is delegated to the base class.
        super().__init__(h)

    def build_auxiliary(self, encoded):
        """Build the auxiliary head on top of `encoded` and return its output tensor.

        `encoded` is presumably the encoder/latent tensor produced by the base
        class -- confirm against `vae.vae`.
        """
        # 100-unit projection with a linear (identity) activation.
        hidden = Dense(100, activation='linear')(encoded)

        # One sigmoid output per newsgroup (20 in total).
        prediction = Dense(20, activation='sigmoid', name='pred')(hidden)
        return prediction


In [14]:
# Length, in characters, of the shortest training post.
np.min(list(map(len, train['data'])))

125

In [15]:
# Sanity check: number of training posts.
len(train['data'])

11314

In [16]:
# Sanity check: number of training labels; must equal the number of posts.
len(train['target'])

11314

In [27]:
# Hyper-parameters for the model.
# vocab_size=256 and max_length=300 duplicate NUM_WORDS / MAX_LENGTH from the
# data-preparation cell and must be kept in sync with them (the model output
# has shape (n, 300, 256)).
# NOTE(review): the meaning of epsilon_std is defined in the vae module --
# presumably the std-dev of the sampling noise; confirm there.
h = vae.Hyper(vocab_size=256, max_length=300, epsilon_std=1)

In [28]:
# Instantiate the newsgroup VAE (base vae.vae plus the 'pred' auxiliary head).
model = newsnet_vae(h)

In [19]:
def str2nparray(s):
    """Return the UTF-8 encoding of `s` as a 1-D array of uint8 byte values.

    The bytes are held in a mutable bytearray, so the resulting array is
    writable. Non-ASCII characters contribute more than one element.
    """
    utf8_buffer = bytearray(s, 'utf-8')
    return np.frombuffer(utf8_buffer, dtype=np.uint8)

In [20]:
# Encode every post as a variable-length sequence of UTF-8 byte values.
# The sequences are kept in plain Python lists rather than wrapped in
# np.array(): the posts differ in length, and building an array from ragged
# sequences was deprecated in NumPy 1.19 and raises ValueError since 1.24.
# pad_sequences (the only consumer) accepts a list of sequences directly.
X_train = [str2nparray(x) for x in train['data']]
# One-hot class labels, one column per newsgroup (20 classes).
y_train = to_categorical(train['target'], num_classes=20)

X_test = [str2nparray(x) for x in test['data']]
y_test = to_categorical(test['target'], num_classes=20)


In [21]:
MAX_LENGTH = 300  # number of byte positions kept per post
NUM_WORDS = 256   # size of the symbol alphabet: one symbol per possible byte value


def one_hot_encode(indices, depth, dtype=np.float32):
    """One-hot encode a 2-D integer array along a new trailing axis.

    Parameters
    ----------
    indices : 2-D integer ndarray of shape (n_rows, n_cols); every value must
        lie in [0, depth).
    depth : int, size of the new one-hot axis.
    dtype : dtype of the result. float32 is used by default because it halves
        the memory of NumPy's default float64 (about 7 GB for the training
        split here) and 0/1 are represented exactly.

    Returns
    -------
    ndarray of shape (n_rows, n_cols, depth) where out[i, j, indices[i, j]] == 1
    and every other entry is 0.

    Raises
    ------
    IndexError if any value of `indices` is outside [0, depth).
    """
    n_rows, n_cols = indices.shape
    encoded = np.zeros((n_rows, n_cols, depth), dtype=dtype)
    # Column vector of row ids and row vector of column ids broadcast against
    # `indices` to address exactly one cell per (row, column) pair.
    row_ids = np.arange(n_rows)[:, np.newaxis]
    col_ids = np.arange(n_cols)[np.newaxis, :]
    encoded[row_ids, col_ids, indices] = 1
    return encoded


# Bring every post to exactly MAX_LENGTH byte ids. With the Keras defaults
# (padding='pre', truncating='pre') shorter posts are left-padded with 0 and
# longer posts keep their LAST MAX_LENGTH bytes.
X_train = pad_sequences(X_train, maxlen=MAX_LENGTH)
X_test = pad_sequences(X_test, maxlen=MAX_LENGTH)

# One-hot reconstruction targets of shape (n_samples, MAX_LENGTH, NUM_WORDS).
X_train_one_hot = one_hot_encode(X_train, NUM_WORDS)

# NOTE: the lower-case name is kept because later cells reference it.
x_test_one_hot = one_hot_encode(X_test, NUM_WORDS)

In [22]:
# Expected: (n_samples, MAX_LENGTH, NUM_WORDS).
X_train_one_hot.shape

(11314, 300, 256)

In [23]:
# Expected: (n_samples, MAX_LENGTH) after padding.
X_train.shape

(11314, 300)

In [29]:
# Fit for a single epoch, passing the test split alongside the training split.
# Arguments are (byte ids, one-hot bytes, one-hot labels) for train, then the same for test.
# NOTE(review): presumably the one-hot bytes are the reconstruction target and the
# labels feed the 'pred' head -- the exact wiring lives in vae.train; confirm there.
model.train(X_train, X_train_one_hot, y_train, X_test, x_test_one_hot, y_test, epochs=1)

(11314, 300)
(11314, 300, 256)
(11314, 20)
Train on 11314 samples, validate on 7532 samples
Epoch 1/1


In [31]:
# Quick smoke check on only the first 3 test samples -- far too few for the
# numbers to be representative. Targets are keyed by output-layer name:
# 'decoded_mean' gets the one-hot bytes, 'pred' gets the class labels.
model.autoencoder.evaluate(x=X_test[:3], y={'decoded_mean': x_test_one_hot[:3], 'pred': y_test[:3]})



[5.477844715118408,
 5.264688968658447,
 0.2131558656692505,
 0.1688888967037201,
 0.9500001668930054]

In [33]:
# The model has two outputs. Judging by the shapes inspected below, `a` is the
# per-position reconstruction over the 256 byte values and `b` holds the
# per-class scores -- presumably 'decoded_mean' and 'pred' respectively; verify
# the output order against the model definition.
a, b = model.autoencoder.predict(X_test[:3])

In [34]:
# One (MAX_LENGTH, NUM_WORDS) score matrix per input sample.
a.shape

(3, 300, 256)

In [37]:
# Highest-scoring byte value at every position of each reconstruction
# (for reference, 32 is the ASCII code of a space).
a.argmax(axis=2)

array([[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 

In [38]:
# Index of the highest-scoring class for each of the 3 samples
# (look the index up in target_names to get the newsgroup).
b.argmax(axis=-1)

array([16,  6,  6])