# Confirm environment
GPUs can get lost when computer goes to sleep, requires restart
Make sure that you're in a conda environment that supports Keras and has tensorflow-gpu installed

potentially try: `alias gpureload="sudo rmmod nvidia_uvm ; sudo modprobe nvidia_uvm"`

In [1]:
!which python

/home/mritter/anaconda3/envs/tf_gpu_test04/bin/python


In [2]:
! conda list tensorflow-gpu 

# packages in environment at /home/mritter/anaconda3/envs/tf_gpu_test04:
#
# Name                    Version                   Build  Channel
tensorflow-gpu            1.5.0                         0  
tensorflow-gpu-base       1.5.0            py36h8a131e3_0  


In [None]:
# IF IT DOES NOT WORK, MAY NEED TO RESTART COMPUTER

# confirm TensorFlow sees the GPU
from tensorflow.python.client import device_lib
assert 'GPU' in str(device_lib.list_local_devices())

# confirm Keras sees the GPU
from keras import backend
assert len(backend.tensorflow_backend._get_available_gpus()) > 0

# confirm PyTorch sees the GPU
from torch import cuda
assert cuda.is_available()
assert cuda.device_count() > 0
print(cuda.get_device_name(cuda.current_device()))

# Load Data

In [4]:
BASE_URL = 'https://files.pushshift.io/hackernews/'

In [5]:
from requests import get  # to make GET request


def download(url, file_name):
    # open in binary mode
    with open(file_name, "wb") as file:
        # get request
        response = get(url)
        # write to file
        file.write(response.content)

download(BASE_URL+'HNI_total_items_by_month.txt', 'data/manifest.txt')

In [6]:
! head data/manifest.txt

        61 HNI_2006-10
         1 HNI_2006-12
      1549 HNI_2007-02
      6305 HNI_2007-03
     10335 HNI_2007-04
      7516 HNI_2007-05
      6036 HNI_2007-06
      6410 HNI_2007-07
     10841 HNI_2007-08
     12371 HNI_2007-09


In [7]:
import pandas as pd
with open('data/manifest.txt', 'r') as f:
    manifest = pd.Series(f.read().split('\n')).map(str.split)
    manifest = manifest[manifest.map(len) > 0].reset_index(drop=True)
manifest.tail()

134    [209738, HNI_2018-02]
135    [237342, HNI_2018-03]
136    [237609, HNI_2018-04]
137    [237646, HNI_2018-05]
138        [17172781, total]
dtype: object

In [8]:
# bulk_file_download.py
import os.path

assert 0, 'DOWNLOADING TAKES A LONG TIME'

url_format = BASE_DIR+'{}.bz2'
file_format = 'data/{}.bz2'
for size, filename in manifest:
    if os.path.isfile(file_format.format(filename)): continue 
    download(url_format.format(filename), file_format.format(filename))
    

AssertionError: DOWNLOADING TAKES A LONG TIME

In [7]:
! ls -lah data/*bz2 | tail -3

-rw-rw-r-- 1 mritter mritter   27M Jan 19 17:50 data/HNI_2016-11.bz2
-rw-rw-r-- 1 mritter mritter   26M Jan 19 17:50 data/HNI_2016-12.bz2
-rw-rw-r-- 1 mritter mritter   29M Jan 20 16:13 data/HNI_2017-01.bz2


# Preprocess

In [8]:
# preprocess.py
import dask.bag as db
from dask.distributed import Client, progress
import json, re
client = Client(n_workers=8, threads_per_worker=2, memory_limit='6GB')

def comment_filter(record):
    return (record['type'] == 'comment' 
            and record.get('deleted', None) == None
            and record.get('text', None) != None)

def text_transformation(record):
    text = record['text'].lower()
    text = re.sub('http.*\w',' <LINK> ',text)
    un = record['by'].lower()
    return (un, text)

b = db.read_text('data/*bz2').map(json.loads).filter(comment_filter)\
      .map(text_transformation)

In [11]:
%%time
from IPython.display import HTML, display

for row in b.random_sample(.0000001):  # 2m
    display(HTML(row[1]))
    print('--')

--


--


--
CPU times: user 10.8 s, sys: 1.7 s, total: 12.5 s
Wall time: 1min 45s


In [12]:
%%time
b.count().compute()  # 2m

CPU times: user 9.07 s, sys: 1.8 s, total: 10.9 s
Wall time: 1min 47s


10763434

# Format for use

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

MAX_SEQUENCE_LENGTH = 500
SAMPLE_LENGTH = 50000

Using TensorFlow backend.


In [11]:
%%time
non_target_texts = b.map(lambda x: x[1]).take(SAMPLE_LENGTH, npartitions=-1)

CPU times: user 4.77 s, sys: 755 ms, total: 5.53 s
Wall time: 1min 7s


In [12]:
%%time
target_texts = b.filter(lambda x: x[0] == 'patio11').map(lambda x: x[1]).compute()

CPU times: user 8.47 s, sys: 1.34 s, total: 9.82 s
Wall time: 1min 48s


In [13]:
type(tuple(target_texts))

tuple

In [14]:
texts = non_target_texts + tuple(target_texts)

In [15]:
# tokenize.py
%%time
tokenizer = Tokenizer(num_words=MAX_SEQUENCE_LENGTH)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray([0]*len(non_target_texts) + [1]*len(target_texts)))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 57663 unique tokens.
Shape of data tensor: (59472, 500)
Shape of label tensor: (59472, 2)
CPU times: user 5.3 s, sys: 104 ms, total: 5.4 s
Wall time: 5.16 s


In [33]:
%%time
import h5py

with h5py.File('data/padded_data.h5', 'w') as h5f:
    h5f.create_dataset('dataset_1', data=data)

h5f.close()

CPU times: user 0 ns, sys: 64 ms, total: 64 ms
Wall time: 103 ms


In [34]:
%%time
import h5py

with h5py.File('data/labels.h5', 'w') as h5f:
    h5f.create_dataset('dataset_1', data=labels)

h5f.close()

CPU times: user 3.81 ms, sys: 0 ns, total: 3.81 ms
Wall time: 3.6 ms


In [35]:
# split the data into a training set and a validation set
VALIDATION_SPLIT = 0.2

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]


In [38]:
x_train.shape

(47578, 500)

In [36]:
x_train[:5]

array([[  0,   0,   0, ...,  17,   9,  20],
       [  0,   0,   0, ..., 135,   2,  10],
       [  0,   0,   0, ..., 104,  34, 118],
       [  0,   0,   0, ...,  23,   3,   5],
       [  0,   0,   0, ..., 375,  14,  64]], dtype=int32)

In [37]:
y_train[:5]

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [39]:
# create_embedding_matrix.py
%%time
# This is actually super fast
# first, build index mapping words in the embeddings set
# to their embedding vector
import os 
BASE_DIR = '/home/mritter/code/twitter_nlp/newsgroups_data/'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove')

print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.
CPU times: user 10.2 s, sys: 632 ms, total: 10.8 s
Wall time: 10.1 s


In [40]:
len(tokenizer.word_index)

57663

In [41]:
%%time 
# prepare embedding matrix

num_distinct_words = len(tokenizer.word_index) + 1  # For <UNKNOWN> 
EMBEDDING_DIM = 100  # Dimensions to represent each token

embedding_matrix = np.zeros((num_distinct_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > num_distinct_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

CPU times: user 74.2 ms, sys: 11.9 ms, total: 86.1 ms
Wall time: 85.7 ms


In [42]:
embedding_matrix.shape

(57664, 100)

In [83]:
%%time
import h5py

with h5py.File('data/whole_data.h5', 'w') as h5f:
    h5f.create_dataset('embedding_matrix', data=embedding_matrix)
    h5f.create_dataset('x_train', data=x_train)
    h5f.create_dataset('y_train', data=y_train)
    h5f.create_dataset('x_val', data=x_val)
    h5f.create_dataset('y_val', data=y_val)

h5f

CPU times: user 0 ns, sys: 87.5 ms, total: 87.5 ms
Wall time: 85.5 ms


In [43]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant

embedding_layer = Embedding(num_distinct_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [44]:
# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])


In [60]:
from datetime import datetime
t = datetime.now()
'{:%Y-%m-%d-%H-%M}'.format(t)

'20190121-15-09'

In [84]:
# Create a TensorBoard instance with the path to the logs directory
from time import time
from keras.callbacks import TensorBoard as tb
from datetime import datetime
t = datetime.now()
tensorboard = tb(log_dir='tensorboard_logs/{:%Y-%m-%d-%H-%M}'.format(t))

model.fit(x_train, y_train,
          batch_size=64, #128,
          epochs=2,
          validation_data=(x_val, y_val),
          callbacks=[tensorboard])


Train on 47578 samples, validate on 11894 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f90b1469470>

In [85]:
import keras.backend as K
K.clear_session() 

In [62]:
x_train.shape

(47578, 500)

In [68]:
# Get a feel for the outputs
import pandas as pd


samples = 10000

df = pd.DataFrame({
    'prob_patio': model.predict(x_train[:samples])[:, 1].round(2),
    'original_index': indices[:samples],
    'original_text': [texts[i] for i in indices[:samples]],
    'is_patio': labels[:samples,1].astype(int),
})
most_similar = df[df.is_patio == 0].sort_values('prob_patio', ascending=False).reset_index()
most_similar.head()
# loss_and_metrics = model.evaluate(x_train, y_train, batch_size=1)
# print(y_train.mean(axis=0))
# print(loss_and_metrics)

Unnamed: 0,index,prob_patio,original_index,original_text,is_patio
0,8147,0.82,30330,rtfa. the blog post lists a 2-3 word blurb for...,0
1,5780,0.76,28511,"you can get most of that info elsewhere, thoug...",0
2,6093,0.76,7990,"actual karma is defined as ""the total effect o...",0
3,8374,0.74,26211,"ah yes, but that is to buy military products w...",0
4,3322,0.74,16147,low millions probably means low ones-of-millio...,0


In [69]:
i = most_similar.iterrows()

In [82]:
display(HTML((next(i)[1].original_text)))