In [1]:
# Print the path of the `python` executable that the shell resolves first on PATH.
# NOTE(review): this is the shell's lookup, which is not necessarily the interpreter
# running this kernel — confirm if the distinction matters.
!which python

/home/mritter/anaconda3/envs/tf_gpu_test04/bin/python


In [2]:
import keras

Using TensorFlow backend.


In [3]:
from tensorflow.python.client import device_lib

# Enumerate the compute devices TensorFlow reports for this machine and show them.
local_devices = device_lib.list_local_devices()
print(local_devices)



[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5100260174652451250
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 1760165888
locality {
  bus_id: 1
}
incarnation: 5191295665076974243
physical_device_desc: "device: 0, name: GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [4]:
# Fail fast unless every deep-learning framework used here can see a GPU.

# TensorFlow: the textual device listing must mention a GPU.
from tensorflow.python.client import device_lib
tf_device_listing = str(device_lib.list_local_devices())
assert 'GPU' in tf_device_listing

# Keras (TensorFlow backend): at least one GPU must be reported.
from keras import backend
keras_gpus = backend.tensorflow_backend._get_available_gpus()
assert len(keras_gpus) > 0

# PyTorch: CUDA must be usable and expose at least one device.
from torch import cuda
assert cuda.is_available()
assert cuda.device_count() > 0
active_gpu = cuda.current_device()
print(cuda.get_device_name(active_gpu))

GeForce GTX 1050


In [2]:
# Download data
# Source: https://files.pushshift.io/hackernews/ (first file in the listing, HNI_2006-10.bz2); save it as data/HNI_2006-10.bz2

In [5]:
# Convert to HDF5
# NOTE(review): only the decompress-and-load step is present in this cell; no HDF5
# writing is visible here.
import bz2, json
from json import JSONDecodeError

# Read through a context manager so the file handle is closed even if the read
# fails; previously the BZ2File stayed open for the lifetime of the kernel.
with bz2.BZ2File('data/HNI_2006-10.bz2') as b:
    bs = b.read()  # whole decompressed dump as a single bytes object

# Peek at the first 100 bytes to confirm the payload looks like JSON.
bs[:100]

b'{"by":"pg","descendants":15,"id":1,"kids":[487171,15,234509,454410,82729],"retrieved_on":1525541947,'

In [6]:
# Look at the third newline-separated chunk of the dump to check the per-line record layout.
bs.split(b'\n')[2]

b'{"by":"phyllis","descendants":0,"id":3,"kids":[454412,531602],"retrieved_on":1525541948,"score":7,"time":1160419233,"title":"Woz Interview: the early days of Apple","type":"story","url":"http:\\/\\/www.foundersatwork.com\\/stevewozniak.html"}'

In [46]:
# Parse the newline-delimited JSON dump, then keep only the usable comments.
bj = []
bad_line_count = 0
for l in bs.split(b'\n'):
    if not l.strip():
        # Blank chunks (e.g. after the trailing newline) are expected, not errors.
        continue
    try:
        bj.append(json.loads(l))
    except JSONDecodeError:
        # Stay best-effort, but count the failures instead of hiding them.
        bad_line_count += 1
if bad_line_count:
    print('skipped %d malformed line(s)' % bad_line_count)

# .get() tolerates records that carry no 'type' key. Comments without 'text' or
# 'by' are dropped here because the later tagging, tokenizing and labelling cells
# index those keys directly and would raise KeyError on them
# (presumably these are deleted items — verify against the dump).
bjc = [x for x in bj
       if isinstance(x, dict)
       and x.get('type') == 'comment'
       and 'text' in x
       and 'by' in x]
bjc[:2]

[{'by': 'sama',
  'id': 15,
  'kids': [17, 454424],
  'parent': 1,
  'retrieved_on': 1525541949,
  'text': '&#34;the rising star of venture capital&#34; -unknown VC eating lunch on SHR',
  'time': 1160423461,
  'type': 'comment'},
 {'by': 'pg',
  'id': 17,
  'kids': [1079, 454426],
  'parent': 15,
  'retrieved_on': 1525541950,
  'text': 'Is there anywhere to eat on Sandhill Road?',
  'time': 1160423565,
  'type': 'comment'}]

In [66]:
# Tag
# Normalise every comment in place: lowercase it, collapse links into a single
# placeholder token and strip punctuation.
import re

# Raw strings keep regex escapes from being read as (invalid) string escapes.
# A link runs from "http:" / "https:" up to the next whitespace. The previous
# greedy 'http.*\w' pattern swallowed everything after the link up to the last
# word character on the same line.
LINK_RE = re.compile(r'https?:\S+')
# Line breaks become a space so the words on either side are not fused together.
LINE_BREAK_RE = re.compile(r'[\r\n]+')
# Same punctuation set as before (quotes, '?', the &#34; entity, '.', ',').
PUNCT_RE = re.compile(r'"|\'|\?|&#34;|\.|,')


def tag_text(text):
    """Return a normalised copy of one comment's text.

    Steps: lowercase, replace each link with ' <link> ', turn line breaks into
    spaces, delete punctuation. The placeholder is lowercase so that running the
    function on already-tagged text returns it unchanged, which makes re-running
    this cell safe.
    """
    text = text.lower()
    text = LINK_RE.sub(' <link> ', text)
    text = LINE_BREAK_RE.sub(' ', text)
    return PUNCT_RE.sub('', text)


for comment in bjc:
    comment['text'] = tag_text(comment['text'])

[x['text'] for x in bjc]

['the rising star of venture capital -unknown vc eating lunch on shr',
 'is there anywhere to eat on sandhill road',
 'its kind of funny that sevin rosen is giving up at the same time sequoia is scoring on this scale',
 'this is interesting but the limitations become apparent with one of their example searches:  <link> this comparison shows early stage companies offering a higher salary than fortune xxx companies however these numbers dont seem to indicate total compensation such as benefits and stock or optionsstill a cool use of search technology',
 'stay tuned',
 'im tuned',
 'winnar winnar chicken dinnar!',
 'what do you mean  this storys still not #1',
 'perhaps if i hadnt told you it was coming <link> ',
 'can you do it again',
 'its interesting how a simple set of features can make the product seem wholly different new and interesting that and the use of revolutionary and incredible everywhere =)',
 'welcome back randall']

In [67]:
# Tokenize
# Whitespace-split each comment into tokens; the author of comment i is kept at
# the same position i in user_list.
text_list = [comment['text'].split() for comment in bjc]
user_list = [comment['by'] for comment in bjc]


In [88]:
# W2V
# Train word embeddings on the tokenized comments and save the trained model to disk.
# NOTE(review): common_texts and get_tmpfile are imported but not used in this cell.
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
EMBEDDING_DIM = 100  # passed as `size` below, i.e. the width of each word vector

# min_count=1 keeps every token, including ones that occur only once.
# NOTE(review): no seed is set and workers=4, so results presumably differ between
# runs — confirm whether reproducibility matters here.
w2vmodel = Word2Vec(text_list, size=EMBEDDING_DIM, window=5, min_count=1, workers=4)
w2vmodel.save("word2vec.w2vmodel")

In [89]:
# Show the first ten vocabulary entries in the model's own index order.
w2vmodel.wv.index2word[:10]

['of', 'the', 'is', 'this', 'and', 'on', 'interesting', 'a', 'you', 'to']

In [90]:
# Index
# Turn every tokenized comment into a fixed-length row of vocabulary ids.
SEQ_LEN = 10

# Build the word -> id table once instead of scanning the vocabulary list with
# list.index() for every token. Ids are shifted by one so that 0 stays reserved
# for padding and for tokens missing from the vocabulary.
# setdefault keeps the first position of a word, matching what list.index() returned.
word_to_id = {}
for position, word in enumerate(w2vmodel.wv.index2word):
    word_to_id.setdefault(word, position + 1)

tok_list = []
for text in text_list:
    # Only the first SEQ_LEN tokens survive truncation, so only those are looked up.
    ids = [word_to_id.get(tok, 0) for tok in text[:SEQ_LEN]]
    # Right-pad short comments with 0 up to exactly SEQ_LEN entries.
    ids += [0] * (SEQ_LEN - len(ids))
    tok_list.append(ids)

tok_list[5]

[84, 17, 0, 0, 0, 0, 0, 0, 0, 0]

In [91]:
import numpy as np

# Stack the equal-length id rows into one integer matrix, one row per comment.
x_train = np.asarray(tok_list)
x_train

array([[  2,  22,  23,   1,  24,  25,  26,  27,  28,  29],
       [  3,  31,  32,  10,  33,   6,  34,  35,   0,   0],
       [ 11,  36,   1,  37,  12,  38,  39,   3,  40,  41],
       [  4,   3,   7,  48,   2,  49,  50,  51,  52,  53],
       [ 83,  17,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 84,  17,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 18,  18,  85,  86,   0,   0,   0,   0,   0,   0],
       [ 87,  19,   9,  88,   4,  89,  90,  91,  92,   0],
       [ 93,  94,  95,  96,  97,   9,  20,  98,  99,  13],
       [ 21,   9,  19,  20, 100,   0,   0,   0,   0,   0],
       [ 11,   7, 101,   8, 102, 103,   1, 104,  21, 105],
       [114, 115, 116,   0,   0,   0,   0,   0,   0,   0]])

In [112]:
from keras.utils import to_categorical

# Binary target: class 1 = comment written by TARGET_USER, class 0 = anyone else.
TARGET_USER = 'pg'
is_target = [int(x == TARGET_USER) for x in user_list]

# Pin num_classes. When it is inferred from the data and no comment is by
# TARGET_USER, the one-hot matrix has a single column and no longer matches the
# model's 2-unit softmax output.
y_train = to_categorical(is_target, num_classes=2)
y_train

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [125]:
# Build model
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# Size the embedding table from the real vocabulary. Token ids run from 0
# (padding / unknown) up to len(vocabulary), so len(vocabulary) + 1 rows are needed.
# The previous hard-coded 1000 breaks once the corpus has 1000 or more distinct tokens.
VOCAB_SIZE = len(w2vmodel.wv.index2word) + 1
EMBEDDING_OUT_DIM = 64

model = Sequential()
# TODO: initialise this layer from the trained word2vec vectors
# (e.g. w2vmodel.wv.get_keras_embedding()) instead of learning embeddings from scratch.
# input_length must be set: without a fully defined input shape, Flatten raises
# 'The shape of the input to "Flatten" is not fully defined'.
model.add(Embedding(VOCAB_SIZE, EMBEDDING_OUT_DIM, input_length=SEQ_LEN))
# The model takes an integer matrix of shape (batch, SEQ_LEN);
# model.output_shape is now (None, SEQ_LEN, EMBEDDING_OUT_DIM).

model.add(Flatten())
model.add(Dense(units=5, activation='relu'))
# Two softmax outputs, one per class of the one-hot encoded target.
model.add(Dense(units=2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

In [126]:
# Train Model
# Fit on x_train / y_train for 5 epochs with a batch size of 32.
model.fit(x_train, y_train, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f03f7eac5f8>

In [127]:
# Test Model
# NOTE(review): predictions and metrics below are computed on x_train / y_train,
# the same arrays passed to model.fit, so they do not measure performance on unseen data.
predictions = model.predict(x_train)
print(predictions)
loss_and_metrics = model.evaluate(x_train, y_train, batch_size=1)
# Column means of the one-hot labels, i.e. the share of each class — a baseline
# to compare the reported accuracy against.
print(y_train.mean(axis=0))
# Output of evaluate(); the model was compiled with metrics=['accuracy'],
# so presumably this is [loss, accuracy] — confirm.
print(loss_and_metrics)

[[0.49647155 0.5035285 ]
 [0.49406293 0.50593704]
 [0.4794362  0.52056384]
 [0.50747555 0.49252445]
 [0.49582514 0.5041748 ]
 [0.50079596 0.49920407]
 [0.48476064 0.51523936]
 [0.51129705 0.48870295]
 [0.54202676 0.4579732 ]
 [0.46779948 0.5322006 ]
 [0.50372565 0.49627435]
 [0.4822092  0.51779085]]
[0.5833333  0.41666666]
[0.6839474886655807, 0.5]


In [10]:
# New Data