In [10]:
import numpy as np
import h5py
import json
import os
import pandas as pd

import tensorflow as tf
from tensorflow.keras.utils import multi_gpu_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Conv1D, Bidirectional, BatchNormalization, Dot, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint
import wandb
from wandb.keras import WandbCallback

In [2]:
# Open the HDF5 dataset read-only. The handle is intentionally left open:
# the names bound below refer to entries of this file and are sliced later
# in the notebook, so closing it here would invalidate them.
dataset = h5py.File('data/dataset.hdf5', 'r')

# Metadata supplies the document lengths, vocabulary sizes and class count
# that the model-definition cell reads. The encoding is pinned to UTF-8 so
# parsing does not depend on the platform's default locale encoding.
with open("data/metadata.json", "r", encoding="utf-8") as f:
    meta = json.load(f)

# Training split: issue bodies, issue titles and their target labels.
train_body_vecs, train_title_vecs, train_labels = (dataset['bodies'],
                                                   dataset['titles'],
                                                   dataset['targets'])

# Held-out split, stored under the same file with a 'test_' prefix.
test_body_vecs, test_title_vecs, test_labels = (dataset['test_bodies'],
                                                dataset['test_titles'],
                                                dataset['test_targets'])

# Sanity check: each split must hold one body, one title and one label per row.
assert train_body_vecs.shape[0] == train_title_vecs.shape[0] == train_labels.shape[0], \
    "training bodies, titles and targets have different numbers of rows"
assert test_body_vecs.shape[0] == test_title_vecs.shape[0] == test_labels.shape[0], \
    "test bodies, titles and targets have different numbers of rows"

In [3]:
# Directory locations used by the notebook: where inputs are read from and
# where training artefacts (logs, checkpoints) are written.
input_dir, out_dir = "data/", "output/"

In [4]:
# --- Hyperparameters -------------------------------------------------------
body_emb_size = 50    # embedding width for issue-body tokens
title_emb_size = 50   # embedding width for issue-title tokens
batch_size = 900
epochs = 4

# --- Inputs: one fixed-length token-id sequence per issue body / title ------
body_input = Input(shape=(meta['issue_body_doc_length'],), name='Body-Input')
title_input = Input(shape=(meta['issue_title_doc_length'],), name='Title-Input')

# --- Encoders: embedding -> batch norm -> GRU, one branch per input ---------
body = Embedding(meta['body_vocab_size'], body_emb_size, name='Body-Embedding')(body_input)
title = Embedding(meta['title_vocab_size'], title_emb_size, name='Title-Embedding')(title_input)

body = BatchNormalization()(body)
body = GRU(100, name='Body-Encoder')(body)

title = BatchNormalization()(title)
title = GRU(75, name='Title-Encoder')(title)

# --- Classifier head: concatenate both encodings and predict the class ------
x = Concatenate(name='Concat')([body, title])
x = BatchNormalization()(x)
out = Dense(meta['num_classes'], activation='softmax')(x)

model = Model([body_input, title_input], out)

# `learning_rate` is the supported argument name; the `lr` alias is
# deprecated and rejected by newer Keras releases.
# Sparse categorical cross-entropy is used, so labels are passed as integer
# class ids rather than one-hot vectors.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# --- Callbacks --------------------------------------------------------------
script_name_base = 'Issue_Labeler'

# Both callbacks write into out_dir; create it up front so training does not
# fail on a fresh checkout where the directory does not exist yet.
os.makedirs(out_dir, exist_ok=True)

# Per-epoch metrics are written to a CSV log file.
csv_logger = CSVLogger(os.path.join(out_dir, '{:}.log'.format(script_name_base)))

# The doubled braces survive .format() as {epoch:02d} / {val_loss:.5f}, which
# Keras fills in when it saves a checkpoint. With save_best_only=True a file
# is written only when the monitored quantity improves.
model_checkpoint = ModelCheckpoint(
    os.path.join(out_dir,
                 '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base)),
    save_best_only=True)


In [6]:
# Smoke-test run: fit on only the first few rows of each split to confirm the
# whole pipeline (inputs, model, callbacks) works end to end.
n_smoke = 10

# validation_data is passed as a (inputs, targets) tuple, which is the form
# Keras documents; a list here can be interpreted as inputs only on some
# TensorFlow versions. The two model inputs are given as a list, matching `x`.
history = model.fit(x=[train_body_vecs[:n_smoke], train_title_vecs[:n_smoke]],
                    y=train_labels[:n_smoke],
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=([test_body_vecs[:n_smoke], test_title_vecs[:n_smoke]],
                                     test_labels[:n_smoke]),
                    callbacks=[csv_logger, model_checkpoint])

Train on 10 samples, validate on 10 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [15]:
# Lowest validation loss recorded over the training epochs.
pd.DataFrame(history.history)["val_loss"].min()


0.30000001192092896

In [16]:
# Best validation accuracy over the training epochs. Higher accuracy is
# better, so the best epoch is the maximum (min() would report the worst one).
pd.DataFrame(history.history)["val_accuracy"].max()

0.30000001192092896