# Load the data

In [1]:
! ls

1_Download_and_Preprocess.ipynb  labeled_issues_df.pkl	title_pp.dpkl
2_Build_Model.ipynb		 test_body_vecs.npy	train_body_vecs.npy
Demo.ipynb			 test_labels.npy	train_labels.npy
IssueLabeler.log		 test_title_vecs.npy	train_title_vecs.npy
body_pp.dpkl			 testdf.pkl		traindf.pkl


## Use Just 1 GPU

Michal: you may want to turn this off (i.e. remove the single-GPU restriction set in the next cell).

In [2]:
import os
# Pin this process to a single GPU before any CUDA-using library initialises.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",  # enumerate devices by PCI bus ID so the IDs match nvidia-smi
    CUDA_VISIBLE_DEVICES="0",        # expose only device 0; list several IDs (e.g. "0,1") for multiple
)

In [3]:
import numpy as np
import dill as dpickle

In [4]:
def load_pickle(fname):
    """Load and return the object stored in a dill-pickled file.

    Args:
        fname: path of the file to read; it is opened in binary mode.

    Returns:
        Whatever object `dpickle.load` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only load files that
        come from a trusted source.
    """
    with open(fname, 'rb') as handle:
        return dpickle.load(handle)

# Text pre-processors (title first, then body), restored from their dill pickles.
title_pp, body_pp = (load_pickle(path) for path in ('title_pp.dpkl', 'body_pp.dpkl'))

# Training split: body vectors, title vectors and labels.
train_body_vecs, train_title_vecs, train_labels = map(
    np.load,
    ('train_body_vecs.npy', 'train_title_vecs.npy', 'train_labels.npy'),
)

# Test split, loaded in the same body / title / labels order.
test_body_vecs, test_title_vecs, test_labels = map(
    np.load,
    ('test_body_vecs.npy', 'test_title_vecs.npy', 'test_labels.npy'),
)

Using TensorFlow backend.


## Build Model Architecture

In [5]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization, Concatenate
from tensorflow.keras.optimizers import Nadam

In [6]:
# Show the installed TensorFlow version; the recorded output for this run is '1.12.0'.
tf.__version__

'1.12.0'

In [7]:
# Sequence lengths are read from the second axis of the training matrices.
issue_body_doc_length = train_body_vecs.shape[1]
issue_title_doc_length = train_title_vecs.shape[1]

# Vocabulary sizes reported by the loaded text pre-processors; they become
# the input dimension of the Embedding layers.
body_vocab_size = body_pp.n_tokens
title_vocab_size = title_pp.n_tokens

# Width of the embedding vectors learned for each input.
body_emb_size = 600
title_emb_size = 400

# Number of distinct label values in the training set. np.unique is
# vectorized (no Python-level iteration over every label) and flattens its
# input, so it also works if the labels are stored with a trailing axis.
# NOTE(review): the softmax / sparse cross-entropy pairing assumes labels are
# the integers 0..num_classes-1 -- confirm against the preprocessing notebook.
num_classes = len(np.unique(train_labels))

In [8]:
# Two-branch classifier: the issue body and the issue title are encoded
# separately, then merged for a single softmax prediction.
# NOTE: statement order matters here -- the unnamed BatchNormalization layers
# get auto-generated names in creation order.

# One input per text field, sized to the document lengths computed earlier.
body_input = Input(shape=(issue_body_doc_length,), name='Body-Input')
title_input = Input(shape=(issue_title_doc_length,), name='Title-Input')

# Look up an embedding vector for every token position; mask_zero=False means
# index 0 is treated like any other token rather than being masked.
b_i = Embedding(body_vocab_size, body_emb_size, name='Body-Embedding', mask_zero=False)(body_input)
b_t = Embedding(title_vocab_size, title_emb_size, name='Title-Embedding', mask_zero=False)(title_input)

# Body branch: normalise the embedded sequence, then encode it with a
# 500-unit GRU.
b_i = BatchNormalization()(b_i)
b_i = GRU(500, name='Body-Encoder')(b_i)

# Title branch: same normalise-then-GRU pipeline with its own weights.
b_t = BatchNormalization()(b_t)
b_t = GRU(500, name='Title-Encoder')(b_t)

# Merge both encodings and classify: a 100-unit ReLU layer, batch norm, then
# a softmax over num_classes.
b_concat = Concatenate(name='Concat')([b_i, b_t])
b_dense = Dense(100, activation='relu', name='Dense1')(b_concat)
b_dense = BatchNormalization()(b_dense)
out = Dense(num_classes, activation='softmax')(b_dense)

# The model takes [body, title] in that order. The sparse categorical loss
# takes integer class ids as targets rather than one-hot vectors.
model = Model([body_input, title_input], out)
model.compile(optimizer=Nadam(lr=0.001), loss='sparse_categorical_crossentropy')

In [9]:
# Print a layer-by-layer table of output shapes, parameter counts and connections.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Body-Input (InputLayer)         (None, 140)          0                                            
__________________________________________________________________________________________________
Title-Input (InputLayer)        (None, 10)           0                                            
__________________________________________________________________________________________________
Body-Embedding (Embedding)      (None, 140, 600)     4800600     Body-Input[0][0]                 
__________________________________________________________________________________________________
Title-Embedding (Embedding)     (None, 10, 400)      2000400     Title-Input[0][0]                
__________________________________________________________________________________________________
batch_norm

## Train Model

In [None]:
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint

# Run configuration.
script_name_base = 'IssueLabeler'
batch_size = 500
epochs = 10

# Stream per-epoch results to '<script_name_base>.log'.
csv_logger = CSVLogger(script_name_base + '.log')

# The {epoch} and {val_loss} placeholders are left for Keras to fill in when it
# writes a checkpoint; with save_best_only=True a file is written only when
# the monitored metric improves.
checkpoint_path = script_name_base + '.epoch{epoch:02d}-val{val_loss:.5f}.hdf5'
model_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True)

# Inputs follow the model's [body, title] order; labels get a trailing axis.
train_inputs = [train_body_vecs, train_title_vecs]
train_targets = np.expand_dims(train_labels, -1)

# Hold out 10% of the training data for validation.
history = model.fit(
    x=train_inputs,
    y=train_targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.10,
    callbacks=[csv_logger, model_checkpoint],
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4486247 samples, validate on 498472 samples
Epoch 1/10
 174500/4486247 [>.............................] - ETA: 46:45 - loss: 1.0130