In [2]:
import os
# Enumerate GPUs by PCI bus ID so the indices below match what nvidia-smi reports.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Expose four GPUs: every model in this notebook is wrapped with
# multi_gpu_model(..., gpus=4), which needs that many visible devices.
# Comma-separated, no spaces. This has to run before TensorFlow initialises CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

# Load the data

In [1]:
# Shell escape: list the working directory to see which notebooks, arrays and checkpoints are present.
! ls

1_Download_and_Preprocess.ipynb		    body_pp.dpkl
2_Build_Model.ipynb			    labeled_issues_df.pkl
Demo.ipynb				    test_body_vecs.npy
IssueLabeler.epoch01-val0.84455.hdf5	    test_labels.npy
IssueLabeler.epoch01-val0.86108.hdf5	    test_title_vecs.npy
IssueLabeler.epoch02-val0.84564.hdf5	    testdf.pkl
IssueLabeler.epoch03-val0.83802.hdf5	    title_pp.dpkl
IssueLabeler.log			    train_body_vecs.npy
IssueLabeler_nbow_.epoch01-val0.89053.hdf5  train_labels.npy
IssueLabeler_nbow_.epoch02-val0.87182.hdf5  train_title_vecs.npy
IssueLabeler_nbow_.log			    traindf.pkl


In [2]:
import numpy as np
import dill as dpickle

In [3]:
def load_pickle(fname):
    """Open `fname` in binary mode and return the object dill deserialises from it.

    Like pickle, dill may execute arbitrary code while loading, so only use
    this on files from a trusted source.
    """
    with open(fname, 'rb') as handle:
        return dpickle.load(handle)

# Text pre-processors, stored as dill pickles (title first, then body).
title_pp, body_pp = map(load_pickle, ('title_pp.dpkl', 'body_pp.dpkl'))

# Training split: body vectors, title vectors, labels.
train_body_vecs, train_title_vecs, train_labels = map(
    np.load,
    ('train_body_vecs.npy', 'train_title_vecs.npy', 'train_labels.npy'),
)

# Test split, loaded in the same body / title / labels order.
test_body_vecs, test_title_vecs, test_labels = map(
    np.load,
    ('test_body_vecs.npy', 'test_title_vecs.npy', 'test_labels.npy'),
)

Using TensorFlow backend.


### Build Model Architecture

In [4]:
import tensorflow as tf
from tensorflow.keras.utils import multi_gpu_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Conv1D, Bidirectional, BatchNormalization, Dot, Flatten, Concatenate
from tensorflow.keras.optimizers import Nadam

In [5]:
# Display the installed TensorFlow version as the cell output.
tf.__version__

'1.12.0'

In [6]:
# Sequence lengths are taken from the second axis of the training matrices.
issue_body_doc_length = train_body_vecs.shape[1]
issue_title_doc_length = train_title_vecs.shape[1]

# Vocabulary sizes as reported by each pre-processor's `n_tokens` attribute.
body_vocab_size = body_pp.n_tokens
title_vocab_size = title_pp.n_tokens

# Embedding widths for the body and title branches.
body_emb_size = 400
title_emb_size = 300

# Number of distinct label values. np.unique is vectorised (the previous
# len(set(...)) iterated every element in Python) and also accepts a label
# array that carries a trailing axis.
# NOTE(review): the softmax width assumes labels are exactly 0..num_classes-1,
# as sparse_categorical_crossentropy expects -- confirm against preprocessing.
num_classes = np.unique(train_labels).size

In [7]:
# Two inputs: one fixed-length sequence for the issue body, one for the title.
body_input = Input(name='Body-Input', shape=(issue_body_doc_length,))
title_input = Input(name='Title-Input', shape=(issue_title_doc_length,))

# Embed each sequence; index 0 is not masked.
body_embedded = Embedding(
    input_dim=body_vocab_size,
    output_dim=body_emb_size,
    mask_zero=False,
    name='Body-Embedding',
)(body_input)
title_embedded = Embedding(
    input_dim=title_vocab_size,
    output_dim=title_emb_size,
    mask_zero=False,
    name='Title-Embedding',
)(title_input)

# Body branch: batch-norm followed by a bidirectional GRU encoder.
body_normed = BatchNormalization()(body_embedded)
body_encoded = Bidirectional(GRU(units=300, name='Body-Encoder'))(body_normed)

# Title branch: batch-norm followed by a single-direction GRU encoder.
title_normed = BatchNormalization()(title_embedded)
title_encoded = GRU(units=300, name='Title-Encoder')(title_normed)

# Head: concatenate both encodings, normalise, then softmax over the classes.
merged = Concatenate(name='Concat')([body_encoded, title_encoded])
merged = BatchNormalization()(merged)
out = Dense(units=num_classes, activation='softmax')(merged)

# `model` is the template network; `parallel_model` is its 4-GPU wrapper and
# is the object that gets compiled (and later trained).
model = Model(inputs=[body_input, title_input], outputs=out)
parallel_model = multi_gpu_model(model, gpus=4)
parallel_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Nadam(lr=0.001),
    metrics=['accuracy'],
)

In [8]:
# Print the layer-by-layer summary of the multi-GPU wrapper (not the template `model`).
parallel_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Body-Input (InputLayer)         (None, 140)          0                                            
__________________________________________________________________________________________________
Title-Input (InputLayer)        (None, 10)           0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 140)          0           Body-Input[0][0]                 
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 10)           0           Title-Input[0][0]                
__________________________________________________________________________________________________
lambda_2 (

## Train Model

In [9]:
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint

# Prefix shared by this run's CSV log and checkpoint files.
script_name_base = 'IssueLabeler'

# Per-epoch metrics go to '<prefix>.log'.
csv_logger = CSVLogger('{:}.log'.format(script_name_base))

# The doubled braces survive .format(), leaving {epoch} / {val_loss}
# placeholders for Keras to fill in when it writes a checkpoint.
# NOTE(review): this callback runs on `parallel_model`, so the files hold the
# multi-GPU wrapper; the Keras multi_gpu_model docs recommend saving through
# the template model instead -- worth revisiting.
checkpoint_pattern = '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base)
model_checkpoint = ModelCheckpoint(checkpoint_pattern, save_best_only=True)

batch_size = 6400
epochs = 10

training_inputs = [train_body_vecs, train_title_vecs]
training_targets = np.expand_dims(train_labels, -1)  # labels with a trailing axis added

# validation_split=0.10 holds out a tenth of the training arrays for validation.
history = parallel_model.fit(
    training_inputs,
    training_targets,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[csv_logger, model_checkpoint],
    validation_split=0.10,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4486247 samples, validate on 498472 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Try another approach: Neural BoW

In [9]:
# Separate input tensors for the neural bag-of-words variant.
body_input2 = Input(name='Body-Input', shape=(issue_body_doc_length,))
title_input2 = Input(name='Title-Input', shape=(issue_title_doc_length,))

# Embeddings at half the configured widths; index 0 is not masked.
body_tokens = Embedding(
    input_dim=body_vocab_size,
    output_dim=body_emb_size // 2,
    mask_zero=False,
    name='Body-Embedding',
)(body_input2)
title_tokens = Embedding(
    input_dim=title_vocab_size,
    output_dim=title_emb_size // 2,
    mask_zero=False,
    name='Title-Embedding',
)(title_input2)

# One bias-free sigmoid unit applied to every token embedding -> a score per token.
body_token_scores = Dense(1, use_bias=False, activation='sigmoid')(body_tokens)
title_token_scores = Dense(1, use_bias=False, activation='sigmoid')(title_tokens)

# Contract embeddings with their scores along axis 1 (normalize=True), then
# flatten the result into one vector per example.
body_pooled = Dot(axes=1, normalize=True, name='body_encoding')([body_tokens, body_token_scores])
body_vector = Flatten()(body_pooled)
title_pooled = Dot(axes=1, normalize=True, name='title_encoding')([title_tokens, title_token_scores])
title_vector = Flatten()(title_pooled)

# Head: concatenate, 100-unit ReLU layer, batch-norm, softmax over the classes.
hidden = Concatenate(name='Concat')([body_vector, title_vector])
hidden = Dense(100, activation='relu', name='Dense1')(hidden)
hidden = BatchNormalization()(hidden)
out = Dense(units=num_classes, activation='softmax')(hidden)

# `model2` is the template network; `parallel_model_nbow` is its 4-GPU wrapper.
model2 = Model(inputs=[body_input2, title_input2], outputs=out)
parallel_model_nbow = multi_gpu_model(model2, gpus=4)
parallel_model_nbow.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Nadam(lr=0.001),
    metrics=['accuracy'],
)

In [10]:
# Print the summary of the template model `model2` (not the multi-GPU wrapper `parallel_model_nbow`).
model2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Body-Input (InputLayer)         (None, 140)          0                                            
__________________________________________________________________________________________________
Title-Input (InputLayer)        (None, 10)           0                                            
__________________________________________________________________________________________________
Body-Embedding (Embedding)      (None, 140, 200)     1600200     Body-Input[0][0]                 
__________________________________________________________________________________________________
Title-Embedding (Embedding)     (None, 10, 150)      750150      Title-Input[0][0]                
__________________________________________________________________________________________________
dense_3 (D

In [11]:
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint

# Prefix shared by this run's CSV log and checkpoint files.
script_name_base = 'IssueLabeler_nbow_'

# Per-epoch metrics go to '<prefix>.log'.
csv_logger = CSVLogger('{:}.log'.format(script_name_base))

# The doubled braces survive .format(), leaving {epoch} / {val_loss}
# placeholders for Keras to fill in when it writes a checkpoint.
# NOTE(review): this callback runs on `parallel_model_nbow`, so the files hold
# the multi-GPU wrapper; the Keras multi_gpu_model docs recommend saving
# through the template model instead -- worth revisiting.
checkpoint_pattern = '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base)
model_checkpoint = ModelCheckpoint(checkpoint_pattern, save_best_only=True)

batch_size = 20000
epochs = 10

training_inputs = [train_body_vecs, train_title_vecs]
training_targets = np.expand_dims(train_labels, -1)  # labels with a trailing axis added

# validation_split=0.10 holds out a tenth of the training arrays for validation.
history = parallel_model_nbow.fit(
    training_inputs,
    training_targets,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[csv_logger, model_checkpoint],
    validation_split=0.10,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4486247 samples, validate on 498472 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Without Bi-Directional

In [16]:
# Fresh input tensors; this cell rebinds `body_input`, `title_input`, `model`
# and `parallel_model` from the earlier bidirectional experiment.
body_input = Input(name='Body-Input', shape=(issue_body_doc_length,))
title_input = Input(name='Title-Input', shape=(issue_title_doc_length,))

# Embeddings at half the configured widths; index 0 is not masked.
body_embedded = Embedding(
    input_dim=body_vocab_size,
    output_dim=body_emb_size // 2,
    mask_zero=False,
    name='Body-Embedding',
)(body_input)
title_embedded = Embedding(
    input_dim=title_vocab_size,
    output_dim=title_emb_size // 2,
    mask_zero=False,
    name='Title-Embedding',
)(title_input)

# Body branch: batch-norm followed by a plain (single-direction) GRU encoder.
body_normed = BatchNormalization()(body_embedded)
body_encoded = GRU(units=300, name='Body-Encoder')(body_normed)

# Title branch: same structure as the body branch.
title_normed = BatchNormalization()(title_embedded)
title_encoded = GRU(units=300, name='Title-Encoder')(title_normed)

# Head: concatenate both encodings, normalise, then softmax over the classes.
merged = Concatenate(name='Concat')([body_encoded, title_encoded])
merged = BatchNormalization()(merged)
out = Dense(units=num_classes, activation='softmax')(merged)

# `model` is the template network; `parallel_model` is its 4-GPU wrapper.
model = Model(inputs=[body_input, title_input], outputs=out)
parallel_model = multi_gpu_model(model, gpus=4)
parallel_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Nadam(lr=0.001),
    metrics=['accuracy'],
)

In [None]:
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint

# Prefix shared by this run's CSV log and checkpoint files.
script_name_base = 'IssueLabeler_nobd'

# Per-epoch metrics go to '<prefix>.log'.
csv_logger = CSVLogger('{:}.log'.format(script_name_base))

# The doubled braces survive .format(), leaving {epoch} / {val_loss}
# placeholders for Keras to fill in when it writes a checkpoint.
# NOTE(review): this callback runs on `parallel_model`, so the files hold the
# multi-GPU wrapper; the Keras multi_gpu_model docs recommend saving through
# the template model instead -- worth revisiting.
checkpoint_pattern = '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base)
model_checkpoint = ModelCheckpoint(checkpoint_pattern, save_best_only=True)

batch_size = 2400
epochs = 10

training_inputs = [train_body_vecs, train_title_vecs]
training_targets = np.expand_dims(train_labels, -1)  # labels with a trailing axis added

# validation_split=0.10 holds out a tenth of the training arrays for validation.
history = parallel_model.fit(
    training_inputs,
    training_targets,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[csv_logger, model_checkpoint],
    validation_split=0.10,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4486247 samples, validate on 498472 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
 153600/4486247 [>.............................] - ETA: 19:11 - loss: 0.7874 - acc: 0.6451

In [14]:
# Display the configured body embedding width (the GRU-only model above embeds at body_emb_size//2).
body_emb_size

400

In [15]:
# Display the configured title embedding width (the GRU-only model above embeds at title_emb_size//2).
title_emb_size

300