This notebook makes it possible to:
 - download bug data from Mozilla's Bugzilla
 - perform data munging on the downloaded data and create datasets with different parameters
 - create and train a neural network model that classifies bugs by their "product - component" label
 - test neural networks on separate test data
 - automatically search for the best hyperparameters of the neural network models
 - save information about the models to a history file

In [None]:
import importlib
import time
import sys
import random
import math
import os
import io
import gc
import re
import datetime as dt
import numpy as np
import scipy as sp
import pandas as pd
import requests
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import reduce
import json
import collections
from time import strftime
import sklearn

#import concurrent.futures
#from requests_futures.sessions import FuturesSession

from bs4 import BeautifulSoup

os.environ['KERAS_BACKEND']='tensorflow'
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Model
from keras.models import model_from_json
from keras.layers import Dense, Input, Flatten, Average, SpatialDropout1D, LeakyReLU
from keras.layers import AveragePooling1D, GlobalAveragePooling1D, GlobalMaxPool1D, MaxPooling1D
from keras.layers import Conv1D, Embedding, Merge, Dropout, Activation
from keras.layers import BatchNormalization, Concatenate
import keras.callbacks
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

from keras import backend as K

import sklearn.metrics
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec



First step: define the global directory constants.


In [None]:
# Working directories, all located under the current working directory.
DATA_DIR_C = os.path.join(os.getcwd(), 'data')
MODELS_DIR_C = os.path.join(os.getcwd(), 'models')
LOGS_DIR_C = os.path.join(os.getcwd(), 'logs')

# Create each directory that does not exist yet.
for _required_dir in (DATA_DIR_C, MODELS_DIR_C, LOGS_DIR_C):
    if not os.path.exists(_required_dir):
        os.makedirs(_required_dir)
del _required_dir


Next step: define the functions for downloading data from Bugzilla.


In [None]:

#https://bugzilla.mozilla.org/buglist.cgi?query_format=advanced&resolution=FIXED&resolution=INVALID&ctype=csv
def downloadExtendedData(data_frame):
    """Fill the 'description' column of data_frame with the first comment of every bug.

    For each value in data_frame.bug_id the Bugzilla REST endpoint
    '/rest/bug/<id>/comment' is requested (up to 100 requests in parallel on a
    thread pool). The text of the first comment is written in place into
    data_frame at row label int(<id>), column 'description', so the frame is
    expected to be indexed by bug id.

    Requests that raise an exception are retried (after a 4 second pause)
    until none is left. Responses that cannot be parsed are reported with
    print and skipped.
    """
    commentUrls = ['https://bugzilla.mozilla.org' + '/rest/bug/' + str(bugId) + '/comment'
                   for bugId in data_frame.bug_id]

    async def downloadUrls(sUrls, data_frame):
        remainingUrls = sUrls
        loop = asyncio.get_event_loop()
        with ThreadPoolExecutor(max_workers=100) as executor:
            while len(remainingUrls) > 0:
                futures = [loop.run_in_executor(executor, requests.get, sUrl) for sUrl in remainingUrls]
                print('futures count = ', len(futures))
                failedUrls = []
                # gather keeps the order of the futures, so responses line up with remainingUrls
                responses = await asyncio.gather(*futures, return_exceptions=True)
                for sUrl, response in zip(remainingUrls, responses):
                    if isinstance(response, Exception):
                        # transport-level failure: schedule this url for another round
                        failedUrls.append(sUrl)
                        continue
                    if len(response.text) > 0:
                        try:
                            jData = response.json()
                            for sId, obj in jData['bugs'].items():
                                # DataFrame.set_value was removed in pandas 1.0; .at is its replacement
                                data_frame.at[int(sId), 'description'] = obj['comments'][0]['text']
                        except Exception as exc:
                            print('incorrect json response or problem with data_frame. Response = ', response, exc)
                print('gathered count = ', len(remainingUrls) - len(failedUrls))
                if failedUrls:
                    gc.collect()
                    # non-blocking pause before retrying the failed urls
                    await asyncio.sleep(4)
                remainingUrls = failedUrls

    loop = asyncio.get_event_loop()
    loop.run_until_complete(downloadUrls(commentUrls, data_frame))
    print('downloadExtendedData finished')

# provides possibility of saving data after exceptions
dataAll = pd.DataFrame()

def _flushBugData(data, sPath, fileStarted):
    """Write data to the CSV file sPath: create it (with header) first, append without header afterwards."""
    if fileStarted:
        data.to_csv(sPath, mode='a', header=False, encoding='utf-8')
        print('append data shape = ', data.shape)
    else:
        data.to_csv(sPath, encoding='utf-8')
        print('write data shape = ', data.shape)

def downloadData(resolutions = ['FIXED'], 
                 sDir = DATA_DIR_C, sName = 'bugData.csv', nMax = 100000,                 
                 columns = ["bug_id", "opendate", "cc_count", "keywords", "longdescs.count", "priority",
                            "classification", "product", "component", "bug_status", "resolution", "short_desc",
                            "rep_platform", "op_sys", "reporter", "version"],
                 components = {}):
    """Download bug lists (plus first comments) from bugzilla.mozilla.org into a CSV file.

    Parameters:
        resolutions: resolution names to query; names not in the known list are ignored.
        sDir, sName: directory and file name of the CSV that is written.
        nMax: maximum number of bugs to download per query url.
        columns: buglist columns to request; names not in the known list are ignored.
        components: optional {product: [component, ...]} mapping. One query is
            issued per product/component pair; when empty a single unfiltered
            query is issued.

    Data is fetched in pages of 1000 rows. Each page is extended with a
    'description' column by downloadExtendedData and accumulated in the
    module-level dataAll frame, which is flushed to the CSV after every query
    url and whenever an error occurs (so already downloaded rows survive).
    A failing page is retried up to 5 times in a row with a 61 second pause.

    Returns:
        True if at least one query url produced rows; False otherwise, also when
        sDir is not a directory or no valid column / resolution was given.
    """
    global dataAll
    from urllib.parse import quote

    resList = ["---", "FIXED", "INVALID", "WONTFIX", "DUPLICATE", "WORKSFORME", "INCOMPLETE",
               "SUPPORT", "EXPIRED", "MOVED"]
    columnsList = ["bug_id", "opendate", "cc_count", "keywords", "longdescs.count", "priority", "classification", 
                   "product", "component", "bug_status", "resolution", "short_desc",
                   "rep_platform", "op_sys", "reporter", "version"]
    if not os.path.isdir(sDir):
        return False
    baseUrl = 'https://bugzilla.mozilla.org/buglist.cgi?query_format=advanced&columnlist='

    validColumns = [colName for colName in columns if colName in columnsList]
    if not validColumns:
        return False
    baseUrl += '%2C'.join(validColumns)

    validResolutions = [res for res in resolutions if res in resList]
    if not validResolutions:
        return False
    baseUrl += ''.join('&resolution=' + res for res in validResolutions)
    baseUrl += '&ctype=csv'

    bResult = False
    fileStarted = False
    retryCount = 5
    step = 1000
    sPath = os.path.join(sDir, sName)
    dataAll = pd.DataFrame()
    if len(components) > 0:
        # names such as 'Developer Tools: Debugger' contain characters that must be percent-encoded
        sUrls = [baseUrl + '&product=' + quote(sProduct, safe='') + '&component=' + quote(sComponent, safe='')
                 for sProduct, vComponents in components.items()
                 for sComponent in vComponents]
    else:
        sUrls = [baseUrl]
    print(len(components), len(sUrls))
    print(sUrls)
    for sUrl in sUrls:
        print('start loading for: ', sUrl)
        offset = 0
        downloadedCount = -1
        retryIndex = 0
        # stop after nMax rows or as soon as a page comes back empty
        while (offset < nMax) and (downloadedCount != 0):
            try:
                sCurrentUrl = sUrl + '&limit=' + str(min(step, nMax - offset)) + '&offset=' + str(offset)
                print('Start downloading data', dt.datetime.now())
                r = requests.get(sCurrentUrl, allow_redirects=True)
                # treat HTTP errors as failures instead of parsing an error page as CSV
                r.raise_for_status()
                dataPart = pd.read_csv(io.BytesIO(r.content), low_memory=False)
                dataPart['description'] = ''
                # downloadExtendedData addresses rows by bug id
                dataPart.set_index(keys = 'bug_id', drop = False, inplace = True)
                print('Start downloading extended data', dt.datetime.now())
                downloadExtendedData(dataPart)
                print('Finish downloading extended data', dt.datetime.now())
                if dataAll.empty:
                    dataAll = dataPart
                else:
                    dataAll = pd.concat([dataAll, dataPart], ignore_index=True)
                downloadedCount = dataPart.shape[0]
                offset += downloadedCount
                print('downloaded at step: ' + str(downloadedCount))
                print('downloaded all: ' + str(offset))
                dataPart = pd.DataFrame()
                retryIndex = 0
                print('Finish processing data', dt.datetime.now())
                gc.collect()
            except Exception as exc:
                print('error occured: ', exc)
                if offset > 0:
                    # persist what has been collected so far before retrying
                    _flushBugData(dataAll, sPath, fileStarted)
                    dataAll = pd.DataFrame()
                    fileStarted = True
                gc.collect()
                # every failure counts as a retry; otherwise a persistent error
                # right after a full page would loop forever without pausing
                retryIndex += 1
                if retryIndex >= retryCount:
                    print('reached max retries count: ', retryCount)
                    break
                time.sleep(61)
        if offset > 0:
            _flushBugData(dataAll, sPath, fileStarted)
            dataAll = pd.DataFrame()
            fileStarted = True
            gc.collect()
            print('data loaded for: ', sUrl)
            bResult = True
    return bResult




The next cell defines the basic constants and a number of functions for working with datasets and models.


In [None]:
# --- text vectorisation ---
MAX_SEQUENCE_LENGTH = 4000  # length every token sequence is padded to (pad_sequences maxlen)
MAX_NB_WORDS = 5000         # num_words limit passed to the Keras Tokenizer
EMBEDDING_DIM = 100         # number of columns of the embedding matrix

# Fraction of the samples held out of the training set; makeDataset splits
# this hold-out into a validation part and a test part.
VALIDATION_SPLIT = 0.3

# --- default class-size limits used by prepareData ---
MIN_PRODUCT_DESCRIPTIONS = 50
MAX_PRODUCT_DESCRIPTIONS = 500000
MIN_COMPONENT_DESCRIPTIONS = 50
MAX_COMPONENT_DESCRIPTIONS = 500000

# --- names of the data-frame columns that form the class label ---
CLASS_PRODUCT_NAME = 'product'
CLASS_COMPONENT_NAME = 'component'

# product -> list of components
# NOTE(review): presumably meant as the `components` / `exactClasses` argument
# of downloadData / prepareData - confirm against the callers.
BaseComponents = {
    'Firefox': [
        'Untriaged',
        'Developer Tools: Debugger',
        'Session Restore',
        'Developer Tools',
    ],
    'Core': [
        'DOM',
        'General',
        'XPCOM',
        'Widget: Gtk',
    ],
    'Firefox for iOS': ['General'],
    'NSS': ['Libraries'],
}

# Filler sentences that engorgio appends to texts to create extra variants.
BugsWaterPhrases = [
    ' So, it is a bug. ',
    ' Fix it! ',
    ' Something wrong. ',
    ' Found error. ',
    ' There is a problem. ',
]

def getKey(index, someDict):
    """Return the first key of someDict (in iteration order) whose value equals index.

    Raises IndexError when no value matches.
    """
    for candidate, stored in someDict.items():
        if stored == index:
            return candidate
    raise IndexError('list index out of range')

# (pattern, replacement) pairs applied in order by clean_str2.
# The replacements for '(', ')' and '?' intentionally keep a backslash in the
# output (behaviour inherited from the original CNN_sentence script); they are
# written with an escaped backslash so the literals are valid Python strings.
_CLEAN_STR2_RULES = tuple((re.compile(pattern), replacement) for pattern, replacement in (
    (r"[^A-Za-z0-9(),!?\'\`]", " "),
    (r"\'s", " 's"),
    (r"\'ve", " 've"),
    (r"n\'t", " n't"),
    (r"\'re", " 're"),
    (r"\'d", " 'd"),
    (r"\'ll", " 'll"),
    (r",", " , "),
    (r"!", " ! "),
    (r"\(", " \\( "),
    (r"\)", " \\) "),
    (r"\?", " \\? "),
    (r"\s{2,}", " "),
))

def clean_str2(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Replaces every character outside A-Za-z0-9(),!?'` with a space, splits
    common English contractions ('s, 've, n't, 're, 'd, 'll) off their word,
    surrounds , ! ( ) ? with spaces, collapses whitespace runs and returns the
    stripped, lower-cased result. '(', ')' and '?' come out prefixed with a
    backslash.
    """
    for pattern, replacement in _CLEAN_STR2_RULES:
        string = pattern.sub(replacement, string)
    return string.strip().lower()

def engorgio(datalist, nCount = 1000, phrases = None):
    """Enlarge datalist to nCount texts by appending combinations of filler phrases.

    For every item, variants are built by appending each subset of the first
    few filler phrases (bit j of the variant number selects phrases[j]); the
    unmodified item is one of the variants. nCount of all variants are then
    drawn at random without replacement.

    Parameters:
        datalist: list of strings to enlarge.
        nCount: number of strings to return.
        phrases: filler phrases to use; defaults to BugsWaterPhrases.

    Raises:
        AssertionError: if datalist is empty or too small to yield nCount
            variants with the available phrases.
    """
    if phrases is None:
        phrases = BugsWaterPhrases
    n = len(datalist)
    if (n == 0) or (n <= nCount // (2**len(phrases))):
        raise AssertionError('Cannot use engorgio for very small dataset with n= ' + str(n) + '; nCount = ' + str(nCount))
    k = nCount // n + 1
    # 2**nBits variants per item; never use more bits than there are phrases.
    # The size check above guarantees n * 2**nBits >= nCount in both cases.
    nBits = min(k.bit_length(), len(phrases))
    usedPhrases = phrases[:nBits]
    extList = []
    for item in datalist:
        for mask in range(2**nBits):
            suffix = ''.join(phrase for j, phrase in enumerate(usedPhrases) if (mask >> j) & 1)
            extList.append(item + suffix)
    return random.sample(extList, nCount)

def reducio(datalist, nCount = 1000):
    """Shrink datalist: return nCount of its elements drawn at random without replacement."""
    chosen = random.sample(datalist, nCount)
    return chosen

def balanse(datalist, nCount = 1000):
    """Return datalist unchanged; nCount is currently ignored.

    The size balancing this hook once performed is switched off:
    lists longer than nCount were cut down with reducio and shorter
    ones were enlarged with engorgio.
    """
    return datalist

def from_categorical(y_labels):
    """Return, for every row of y_labels, the index of its largest entry (argmax along axis 1)."""
    class_indices = np.argmax(y_labels, axis=1)
    return class_indices

def dict_len(d1):
    """Return the sum of the lengths of all values stored in d1."""
    return sum(len(entries) for entries in d1.values())

def dict_compare(d1, d2):
    """Compare two dictionaries key by key.

    Returns a tuple (added, removed, modified, same):
        added:    set of keys present only in d1
        removed:  set of keys present only in d2
        modified: {key: (d1 value, d2 value)} for shared keys whose values differ
        same:     set of shared keys whose values are equal
    """
    keys1, keys2 = set(d1), set(d2)
    modified = {}
    same = set()
    for key in keys1 & keys2:
        left, right = d1[key], d2[key]
        if left != right:
            modified[key] = (left, right)
        if left == right:
            same.add(key)
    return keys1 - keys2, keys2 - keys1, modified, same

def dict_save(d1, dictName = 'dict_temp_save', sDir = DATA_DIR_C ):
    """Store d1 with np.save in the file '<dictName>.npy' inside sDir."""
    targetPath = os.path.join(sDir, dictName + '.npy')
    np.save(targetPath, d1)

def dict_load(dictName = 'dict_temp_save', sDir = DATA_DIR_C ):
    """Load an object stored with dict_save from '<dictName>.npy' inside sDir.

    A dictionary (or any other non-array object) is stored by np.save as a
    0-dimensional object array and is unwrapped again here. A real array is
    returned as it is.
    """
    # allow_pickle is required to read back pickled objects such as dicts.
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    loaded = np.load(os.path.join(sDir, dictName + '.npy'), allow_pickle=True)
    if loaded.ndim == 0:
        return loaded.item()
    return loaded

def dataset_save(x_train, y_train, x_val, y_val, x_test, y_test,
                 datasetName = 'current_dataset', sDir = DATA_DIR_C ):
    """Store the six dataset arrays in '<datasetName>.npz' inside sDir.

    The arrays are saved under the names x_train, y_train, x_val, y_val,
    x_test and y_test, in this order.
    """
    namedArrays = {
        'x_train': x_train,
        'y_train': y_train,
        'x_val': x_val,
        'y_val': y_val,
        'x_test': x_test,
        'y_test': y_test,
    }
    targetPath = os.path.join(sDir, datasetName + '.npz')
    np.savez(targetPath, **namedArrays)

# Warning: Unlike dataset_save, this function is more generalized, based on makeDataset.
# Returns lists with different structure, based on splitCount.
def dataset_load(datasetName = 'current_dataset', sDir = DATA_DIR_C, splitCount = 1 ):
    """Load a dataset written by makeDataset.

    Returns, depending on splitCount:
        splitCount == 1: list of the arrays stored in '<datasetName>.npz'
        splitCount > 1:  list with one such list per part file
                         '<datasetName>_<splitCount>_<i>.npz'
        splitCount <= 0: [data, labels] read through dict_load from
                         '<datasetName>_data' and '<datasetName>_labels'
    """
    def loadArrays(sFile):
        # NpzFile keeps the archive open; read all arrays and close it again
        with np.load(os.path.join(sDir, sFile + '.npz')) as dataset:
            return [dataset[name] for name in dataset.files]

    if splitCount <= 0:
        return [dict_load(dictName = datasetName + '_data', sDir = sDir), 
                dict_load(dictName = datasetName + '_labels', sDir = sDir)]
    if splitCount == 1:
        return loadArrays(datasetName)
    return [loadArrays(datasetName + '_' + str(splitCount) + '_' + str(i)) for i in range(splitCount)]

def load_datasets(sDir = DATA_DIR_C):
    """Load every '.npz' file found directly in sDir.

    Returns a dictionary mapping the file name without extension to the
    result of dataset_load for that name.
    """
    sSuffix = '.npz'
    datasetNames = []
    for fileName in os.listdir(sDir):
        if os.path.isfile(os.path.join(sDir, fileName)) and fileName.endswith(sSuffix):
            datasetNames.append(fileName[:-len(sSuffix)])
    datasets = {}
    for datasetName in datasetNames:
        datasets[datasetName] = dataset_load(datasetName, sDir)
    return datasets


def embedding_bin2txt(sName = 'GoogleNews-vectors-negative300.bin', sDir = DATA_DIR_C):
    """Convert a binary word2vec file into the word2vec text format.

    Reads sName from sDir and writes the text version next to it, using the
    same base name with a '.txt' extension (so the result can be found by
    functions that read embeddings from sDir).
    """
    model = KeyedVectors.load_word2vec_format(os.path.join(sDir, sName), binary=True)
    sTxtName = os.path.splitext(sName)[0] + '.txt'
    if sTxtName == sName:
        # never overwrite the source file
        sTxtName = sName + '.txt'
    # load_word2vec_format returns KeyedVectors, which can save themselves directly
    model.save_word2vec_format(os.path.join(sDir, sTxtName), binary=False)
    del model

def embedding_matrix_load(wordIndex, sName = 'glove.6B.' + str(EMBEDDING_DIM) + 'd.txt',
                          sDir = DATA_DIR_C, binary = False):
    """Build the initial embedding matrix for the words of wordIndex.

    Parameters:
        wordIndex: dictionary word -> row index.
        sName, sDir: name and directory of the pre-trained embedding file.
        binary: True for a binary word2vec file (read through gensim);
            False for a text file with one '<word> <v1> <v2> ...' entry per line.

    Returns:
        Array of shape (len(wordIndex) + 1, EMBEDDING_DIM). Rows of words found
        in the file hold the pre-trained vector; every other row keeps its
        random initialisation (uniform in [0, 1)).
    """
    sPath = os.path.join(sDir, sName)
    if binary:
        word2vec = KeyedVectors.load_word2vec_format(sPath, binary=True)
        embedding_matrix = np.random.random((len(wordIndex) + 1, EMBEDDING_DIM))
        for word, i in wordIndex.items():
            try:
                embedding_matrix[i, :] = word2vec[word]
            except KeyError:
                # word has no pre-trained vector: keep the random row
                pass
        del word2vec
    else:
        embedding_matrix = np.random.random((len(wordIndex) + 1, EMBEDDING_DIM))
        # Stream the file and keep only vectors of known words instead of
        # holding the whole embedding table in memory.
        with open(sPath, mode='r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:
                    continue
                i = wordIndex.get(values[0])
                if i is not None:
                    embedding_matrix[i] = np.asarray(values[1:], dtype='float32')
    gc.collect()
    return embedding_matrix
    
def model_save(model, modelName = 'model_temp', sDir = MODELS_DIR_C):
    """Save a model as '<modelName>.json' (architecture) and '<modelName>.h5' (weights) in sDir.

    An empty modelName means "use model.name". The name that is used is also
    assigned to model.name.
    """
    modelName = model.name if modelName == '' else modelName
    model.name = modelName
    architecturePath = os.path.join(sDir, modelName + ".json")
    weightsPath = os.path.join(sDir, modelName + ".h5")
    architecture = model.to_json()
    with open(architecturePath, "w") as json_file:
        json_file.write(architecture)
    model.save_weights(weightsPath)
    
def model_load(modelName = 'model_temp', sDir = MODELS_DIR_C, bCompile = True):
    """Restore a model from '<modelName>.json' and '<modelName>.h5' in sDir.

    The architecture is rebuilt from the JSON file and the weights are loaded
    from the HDF5 file. A non-empty modelName is assigned to the model's name.
    With bCompile the model is compiled with categorical cross-entropy loss,
    the 'nadam' optimizer and the accuracy metric.
    """
    architecturePath = os.path.join(sDir, modelName + '.json')
    weightsPath = os.path.join(sDir, modelName + ".h5")
    with open(architecturePath, 'r') as json_file:
        architecture = json_file.read()
    restored = model_from_json(architecture)
    restored.load_weights(weightsPath)
    if modelName != '':
        restored.name = modelName
    if not bCompile:
        return restored
    restored.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
    return restored
    
def save_models(models, sDir = MODELS_DIR_C):
    """Save every model of the {name: model} dictionary with model_save into sDir.

    Entries whose model is None are skipped.
    """
    for name, model in models.items():
        # identity test: never invoke a model's own comparison operators
        if model is not None:
            model_save(model, name, sDir)


def load_models(sDir = MODELS_DIR_C, sStartsWith = ''):
    """Collect the models found directly in sDir.

    Every '<name>.json' file is loaded with model_load and stored under
    <name>. Every '<name>.py' file is registered under <name> with the value
    None (replacing a loaded model of the same name). Only files whose name
    starts with sStartsWith are considered; the empty default matches all.
    """
    def namesWithSuffix(sSuffix):
        # file names (extension removed) in sDir that match suffix and prefix
        return [f[:-len(sSuffix)] for f in os.listdir(sDir)
                if os.path.isfile(os.path.join(sDir, f))
                and f.endswith(sSuffix)
                and f.startswith(sStartsWith)]

    models = {}
    for modelName in namesWithSuffix('.json'):
        models[modelName] = model_load(modelName, sDir)
    for scriptName in namesWithSuffix('.py'):
        models[scriptName] = None
    return models

def _balanceVirtualClass(product_data, otherPercents):
    """Down-sample product_data['Virtual Class'] in place to at most otherPercents % of all samples."""
    otherDataCount = len(product_data['Virtual Class'])
    baseDataCount = dict_len(product_data) - otherDataCount
    print('baseDataCount = ', baseDataCount)
    print('otherDataCount = ', otherDataCount)
    # virtualCount / (virtualCount + baseDataCount) = otherPercents / 100
    virtualCount = int(((baseDataCount * otherPercents) / 100.0) / (1.0 - otherPercents / 100.0))
    print('virtualCount = ', virtualCount)
    if otherDataCount > virtualCount:
        print('balansing...')
        product_data['Virtual Class'] = random.sample(product_data['Virtual Class'], virtualCount)

def prepareData(sDir = DATA_DIR_C, sName = 'bugData.csv', maxClasses = 100, exactClasses = {},
                minProductDescriptions = MIN_PRODUCT_DESCRIPTIONS, 
                maxProductDescriptions = MAX_PRODUCT_DESCRIPTIONS,
                minComponentDescriptions = MIN_COMPONENT_DESCRIPTIONS,
                maxComponentDescriptions = MAX_COMPONENT_DESCRIPTIONS,
                otherPercents = 50,
                balanseVirtual = True):
    """Read the bug CSV and group the bug texts by '<product> - <component>' class.

    Parameters:
        sDir, sName: location of the CSV file (needs the columns product,
            component, reporter, short_desc, op_sys, rep_platform, description).
        maxClasses: when more product/component pairs than this are found, only
            the largest classes are kept (one slot is reserved for
            'Virtual Class') and the rest is merged into 'Virtual Class'.
        exactClasses: optional {product: [component, ...]}. When given, only
            these pairs become classes; all other texts go to 'Virtual Class'.
        minProductDescriptions / minComponentDescriptions: products / components
            with fewer bugs are renamed to 'Other products' / 'Other components'.
        maxProductDescriptions / maxComponentDescriptions: products / components
            with more bugs are left out.
        otherPercents: target share (0 <= value < 100) of 'Virtual Class' among
            all samples when balanseVirtual is set.
        balanseVirtual: randomly down-sample 'Virtual Class' to that share.

    Returns:
        (product_data, productKeys): product_data maps class name -> list of
        texts (the five text fields joined with ' ||| '); productKeys maps each
        class name of product_data to a consecutive integer label.

    Raises:
        AssertionError: for an otherPercents value outside [0, 100).
    """
    if otherPercents < 0 or otherPercents >= 100:
        raise AssertionError('invalid otherPercents value: ' + str(otherPercents))

    data_frame = pd.read_csv(os.path.join(sDir, sName), low_memory=False)
    print(data_frame.shape)
    print(data_frame.columns)
    data_frame.dropna(subset = ['description'], inplace = True)
    print(data_frame.shape)
    # NOTE(review): presumably the first column is the index written by DataFrame.to_csv - confirm
    data_frame.drop([data_frame.columns[0]], axis = 1, inplace = True)
    print(data_frame.shape)
    print(data_frame.columns)

    exactKeyNames = {key + ' - ' + item for key, vClasses in exactClasses.items() for item in vClasses}
    otherProductsName = 'Other ' + CLASS_PRODUCT_NAME + 's'
    otherComponentsName = 'Other ' + CLASS_COMPONENT_NAME + 's'
    textColumns = ['reporter', 'short_desc', 'op_sys', 'rep_platform', 'description']

    prod1 = data_frame[CLASS_PRODUCT_NAME].value_counts()
    print(prod1.shape)
    # Rename rare products. Only the product column is touched, so the rows
    # keep their component and their text fields.
    rareProducts = prod1[prod1 < minProductDescriptions].index
    data_frame.loc[data_frame[CLASS_PRODUCT_NAME].isin(rareProducts), CLASS_PRODUCT_NAME] = otherProductsName
    prod1 = data_frame[CLASS_PRODUCT_NAME].value_counts()
    print(prod1)
    prodNames = prod1[prod1 <= maxProductDescriptions].index.tolist()

    product_data = {'Virtual Class': []}
    keyIndex = 0  # number of product/component pairs seen
    for name in prodNames:
        # work on a copy so that renaming components never writes into a view of data_frame
        prod_frame = data_frame[data_frame[CLASS_PRODUCT_NAME] == name].copy()
        comp1 = prod_frame[CLASS_COMPONENT_NAME].value_counts()
        rareComponents = comp1[comp1 < minComponentDescriptions].index
        prod_frame.loc[prod_frame[CLASS_COMPONENT_NAME].isin(rareComponents),
                       CLASS_COMPONENT_NAME] = otherComponentsName
        comp1 = prod_frame[CLASS_COMPONENT_NAME].value_counts()
        compNames = comp1[comp1 <= maxComponentDescriptions].index.tolist()
        for key in compNames:
            comp_frame = prod_frame[prod_frame[CLASS_COMPONENT_NAME] == key]
            # str() keeps join from failing on missing (NaN) or numeric field values
            fieldLists = [balanse([str(value) for value in comp_frame[column].values.tolist()])
                          for column in textColumns]
            items = [' ||| '.join(fields) for fields in zip(*fieldLists)]
            className = name + ' - ' + key
            if exactKeyNames:
                bOwnClass = className in exactKeyNames
            else:
                bOwnClass = key != otherComponentsName
            if bOwnClass:
                product_data[className] = items
            else:
                product_data['Virtual Class'].extend(items)
            keyIndex += 1

    if keyIndex > maxClasses:
        # Keep the largest classes and move the texts of all others into
        # 'Virtual Class', which always survives and takes one of the slots.
        virtualItems = product_data.pop('Virtual Class')
        ranked = sorted(product_data.items(), key=lambda k: len(k[1]), reverse=True)
        keepCount = max(maxClasses - 1, 0)
        for _, texts in ranked[keepCount:]:
            virtualItems.extend(texts)
        product_data = dict(ranked[:keepCount])
        product_data['Virtual Class'] = virtualItems

    if balanseVirtual:
        _balanceVirtualClass(product_data, otherPercents)

    print({key: len(value) for key, value in product_data.items() })
    # labels are numbered over exactly the classes that are returned
    productKeys = {className: i for i, className in enumerate(product_data)}
    print(productKeys)

    del data_frame
    gc.collect()
    print('prepareData finished')
    return product_data, productKeys

def makeDataset(data_dict, productKeys, datasetName = 'current_dataset', sDir = DATA_DIR_C,
                splitCount = 1, splitVirtualOnly = False, maxCount = 0):
    """Turn the class -> texts dictionary into padded index sequences and save them.

    Every text is stripped of markup (BeautifulSoup), cleaned with clean_str2
    and de-duplicated (the first occurrence of a text wins). The texts are
    tokenized with at most MAX_NB_WORDS words, padded to MAX_SEQUENCE_LENGTH,
    labelled one-hot through productKeys and shuffled.

    Parameters:
        data_dict: class name -> list of texts.
        productKeys: class name -> integer label.
        datasetName, sDir: base name and directory of the files written.
        splitCount: > 0 writes train / validation / test arrays with
            dataset_save (VALIDATION_SPLIT of the samples is held out and
            divided between validation and test); > 1 additionally writes
            splitCount consecutive parts named '<datasetName>_<splitCount>_<i>';
            <= 0 writes the unsplit data and labels with dict_save.
        splitVirtualOnly: not implemented, must be False.
        maxCount: when > 0, keep at most this many (randomly chosen) samples.

    The tokenizer's word index is always saved as '<datasetName>_word_index'.

    Raises:
        NotImplementedError: if splitVirtualOnly is set.
    """
    if splitVirtualOnly:
        raise NotImplementedError('''May be used in future to create ensembles with model, 
                                     trained on few datasets with equal base classes,
                                     but different parts of virtual classes data''')

    texts = []
    labels = []
    seenTexts = set()  # constant-time duplicate check
    print('productKeys len = ', len(productKeys))
    print('data_dict len = ', len(data_dict))

    for key, value in data_dict.items():
        for sDesc in value:
            text = BeautifulSoup(sDesc, "lxml")
            sText = clean_str2(text.get_text())
            if sText not in seenTexts:
                seenTexts.add(sText)
                texts.append(sText)
                labels.append(productKeys[key])
        print('converting text for key: ' + str(key) + ' with len = ' + str(len(value)) + ' finished')

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    if maxCount > 0 and indices.shape[0] > maxCount:
        # the indices are already shuffled, so a prefix is a random subset
        indices = indices[:maxCount]
    data = data[indices]
    labels = labels[indices]

    if splitCount > 0:
        # Explicit boundaries stay correct when the hold-out has 0 or 1
        # samples (negative slices such as data[-0:] would select everything).
        nTotal = data.shape[0]
        nb_validation_samples = int(VALIDATION_SPLIT * nTotal)
        trainEnd = nTotal - nb_validation_samples
        valEnd = nTotal - nb_validation_samples // 2
        x_train = data[:trainEnd]
        y_train = labels[:trainEnd]
        x_val = data[trainEnd:valEnd]
        y_val = labels[trainEnd:valEnd]
        x_test = data[valEnd:]
        y_test = labels[valEnd:]
        dataset_save(x_train, y_train, x_val, y_val, x_test, y_test,
                     datasetName=datasetName, sDir = sDir)
        if splitCount > 1:
            def makePart(X, n, i):
                # i-th of n consecutive, (almost) equally sized slices of X
                return X[(i*len(X))//n : ((i+1)*len(X))//n]
            for i in range(splitCount):
                dataset_save(makePart(x_train, splitCount, i),
                             makePart(y_train, splitCount, i),
                             makePart(x_val, splitCount, i),
                             makePart(y_val, splitCount, i),
                             makePart(x_test, splitCount, i),
                             makePart(y_test, splitCount, i),
                             datasetName=datasetName + '_' + str(splitCount) + '_' + str(i),
                             sDir = sDir)
    else:
        dict_save(data, datasetName + '_data', sDir = sDir)
        dict_save(labels, datasetName + '_labels', sDir = sDir)

    dict_save(word_index, datasetName + '_word_index', sDir = sDir)

    print('makeDataset finished')
#    print([np.array_equal(x, y) for x,y in zip([x_train, y_train, x_val, y_val, x_test, y_test], dataset_load())])
    


def testModel(resultsFrame, modelName = 'current_model', datasetName = 'current_dataset',
              dataDir = DATA_DIR_C, 
              modelsDir = MODELS_DIR_C):
    """Evaluate one model on the test part of one dataset and record the score.

    If '<modelName>.json' and '<modelName>.h5' exist in modelsDir, the model is
    loaded and evaluated and scores[1] of model.evaluate is stored. Otherwise
    modelName is imported as a Python module (modelsDir is put on sys.path)
    and the result of its test_model function is stored.

    The score is written into resultsFrame at row modelName, column
    datasetName. Errors during evaluation are printed, not raised.
    """
    bModelReady = (os.path.exists(os.path.join(modelsDir, modelName + '.json')) and 
                   os.path.exists(os.path.join(modelsDir, modelName + '.h5')))
    if bModelReady:
        model = model_load(modelName=modelName, sDir=modelsDir)
        x_train, y_train, x_val, y_val, x_test, y_test = dataset_load(datasetName=datasetName, sDir=dataDir)
        try:
            scores = model.evaluate(x_test, y_test)
            # DataFrame.set_value was removed in pandas 1.0; .loc also adds missing rows
            resultsFrame.loc[modelName, datasetName] = scores[1]
        except Exception as exc:
            print(exc)
    else:
        try:
            # the test data is needed in this branch as well
            x_train, y_train, x_val, y_val, x_test, y_test = dataset_load(datasetName=datasetName, sDir=dataDir)
            if os.path.exists(modelsDir) and (modelsDir not in sys.path):
                sys.path.append(modelsDir)
            mfile = importlib.import_module(name = modelName)
            importlib.reload(mfile)
            try:
                # NOTE(review): `params` is not defined in this function; presumably
                # a notebook-level global - confirm. A failure here is only printed.
                mfile.set_params(params)
            except Exception as exc2:
                print(exc2)
            resultsFrame.loc[modelName, datasetName] = mfile.test_model(modelName, datasetName, x_test, y_test)
        except Exception as exc:
            print(exc)
    
       
def testModels(dataDir = DATA_DIR_C, modelsDir = MODELS_DIR_C):
    """Evaluate every model found in modelsDir on every dataset found in dataDir.

    For each model/dataset pair one of the following is used, in this order:
        - the model loaded by load_models,
        - a saved model named '<modelName>_<datasetName>' in modelsDir,
        - the test_model function of the Python module <modelName>
          (modelsDir is put on sys.path).
    The scores are collected in a table (rows: models, columns: datasets) that
    is written to 'current_results.csv' in dataDir and printed. Errors for a
    single pair are printed and do not stop the run.
    """
    datasets = load_datasets(dataDir)
    models = load_models(modelsDir)

    resultsFrame = pd.DataFrame(columns = [datasetName for datasetName, _ in datasets.items()])

    for datasetName, [x_train, y_train, x_val, y_val, x_test, y_test] in datasets.items():
        for modelName, model in models.items():
            try:
                if model is not None:
                    scores = model.evaluate(x_test, y_test)
                    # DataFrame.set_value was removed in pandas 1.0; .loc also adds missing rows
                    resultsFrame.loc[modelName, datasetName] = scores[1]
                elif os.path.exists(os.path.join(modelsDir, modelName + '_' + datasetName + '.json')):
                    # a model trained by the script for exactly this dataset
                    model = model_load(modelName + '_' + datasetName, modelsDir)
                    scores = model.evaluate(x_test, y_test)
                    resultsFrame.loc[modelName, datasetName] = scores[1]
                else:
                    if os.path.exists(modelsDir) and (modelsDir not in sys.path):
                        sys.path.append(modelsDir)
                    mfile = importlib.import_module(name = modelName)
                    importlib.reload(mfile)
                    resultsFrame.loc[modelName, datasetName] = mfile.test_model(modelName, datasetName,
                                                                                 x_test, y_test)
            except Exception as exc:
                print(exc)

    resultsFrame.to_csv(os.path.join(dataDir, 'current_results.csv'))
    print()
    print(resultsFrame)

    print('testModels finished')
    
def reloadSession():
    """Discard the current Keras session and install a new TensorFlow session.

    The new session is configured with gpu_options.allow_growth enabled.
    """
    K.clear_session()
    gc.collect()
    sessionConfig = K.tf.ConfigProto()
    sessionConfig.gpu_options.allow_growth = True
    freshSession = K.tf.Session(config=sessionConfig)
    K.set_session(freshSession)

def trainModel(modelName = 'current_model', datasetName = 'current_dataset', 
               dataDir = DATA_DIR_C,
               modelsDir = MODELS_DIR_C):
    """Build, train, evaluate and save a one-convolution text classifier.

    The dataset splits and the '<datasetName>_word_index' dictionary are read
    from dataDir. The network is: trainable Embedding initialised from
    embedding_matrix_load -> SpatialDropout1D -> Conv1D(256, 3) -> LeakyReLU
    -> SpatialDropout1D -> MaxPooling1D over MAX_SEQUENCE_LENGTH steps ->
    Flatten -> softmax Dense with one unit per column of y_val.

    After training, the test-set evaluation is printed and the model is saved
    under modelName. If training, evaluation or saving raises, the exception is
    printed and the model is saved under modelName + '_temp' instead. The
    Keras session is reset before returning.

    Args:
        modelName: name used when saving the trained model.
        datasetName: name of the dataset to load.
        dataDir: directory containing the dataset and word index.
        modelsDir: directory where the model is saved.
    """
    x_train, y_train, x_val, y_val, x_test, y_test = dataset_load(datasetName, sDir=dataDir)
    vocabulary = dict_load(datasetName + '_word_index', sDir=dataDir)

    # Alternative source noted by the author:
    # sName='GoogleNews-vectors-negative300.bin', binary=True
    pretrainedWeights = embedding_matrix_load(vocabulary, sDir=dataDir)

    embedder = Embedding(len(vocabulary) + 1,
                         EMBEDDING_DIM,
                         weights=[pretrainedWeights],
                         input_length=MAX_SEQUENCE_LENGTH,
                         trainable=True)

    # Reference result recorded by the author for a similar configuration:
    #   'model': 'model_opt_py_dataset_exact10_4000_100d_64_0.1_3_256_15_nadam_local_same_LeakyRelu_0.2',
    #   'params': ['64', '0.1', '3', '256', '15', 'nadam', 'local', 'same', 'LeakyRelu', '0.2'],
    #   'results': [0.554242658161845, 0.8583016476854777]

    tokenIds = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    net = embedder(tokenIds)
    net = SpatialDropout1D(0.15)(net)
    net = Conv1D(256, 3, padding='same')(net)
    net = LeakyReLU(0.2)(net)
    net = SpatialDropout1D(0.15)(net)
    net = MaxPooling1D(MAX_SEQUENCE_LENGTH)(net)
    net = Flatten()(net)
    preds = Dense(y_val.shape[1], activation='softmax')(net)

    model = Model(tokenIds, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='nadam',
                  metrics=['acc'])
    model.summary()

    try:
        earlyStop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.01,
                                                  patience=2, verbose=1, mode='auto')
        model.fit(x_train, y_train,
                  validation_data=(x_val, y_val),
                  epochs=5,
                  batch_size=32,
                  callbacks=[earlyStop])
        testScores = model.evaluate(x_test, y_test)
        print(testScores)
        model_save(model, modelName, sDir=modelsDir)
    except Exception as exc:
        print(exc)
        # Keep whatever was trained so far under a temporary name.
        model_save(model, modelName + '_temp', sDir=modelsDir)

    print('trainModel finished')
    del model
    reloadSession()
  
    
    
# Printed once the statements before it in this cell have executed.
print('ready')



The next cell downloads bug data from Bugzilla.
Since the data is provided along with the script, you can skip this step.


In [None]:
#downloadData(resolutions = ['FIXED', 'WONTFIX'], sName = 'bugDataExactComponents100_3.csv', 
#             nMax = 100000, components=BaseComponents)


The next cell defines the current dataset and model names.
It may be used to create more datasets with different parameters and to train and test models in place.


In [None]:
# Dataset and model names are suffixed with MAX_SEQUENCE_LENGTH and EMBEDDING_DIM.
CurrentDatasetName = 'dataset_global50_exv4' + '_' + str(MAX_SEQUENCE_LENGTH) + '_' + str(EMBEDDING_DIM) + 'd'
CurrentModelName = 'cnn50GlobalExv4' + '_' + str(MAX_SEQUENCE_LENGTH) + '_' + str(EMBEDDING_DIM) + 'd'

# NOTE(review): this comment used to read "maxClasses = 11 (10 + virtual class with
# other components)", but the call passes maxClasses=51 - presumably 50 classes plus
# the same virtual class. Confirm against prepareData.
product_data, product_keys = prepareData(sName='bugDataTest500.csv', maxClasses=51, 
                                         minComponentDescriptions=50,
                                         minProductDescriptions=50)

# Build the dataset under CurrentDatasetName from the prepared data.
makeDataset(datasetName=CurrentDatasetName,
            data_dict=product_data, productKeys=product_keys)

#reloadSession()
# Train on the dataset just created; trainModel saves the model under CurrentModelName.
trainModel(datasetName=CurrentDatasetName, modelName=CurrentModelName)

#testModels()


In [None]:
# temp parts

#resListAll = ["FIXED", "INVALID", "WONTFIX", "DUPLICATE", "WORKSFORME", "INCOMPLETE",
#               "SUPPORT", "EXPIRED", "MOVED"]
#downloadData(resolutions = ['FIXED', 'WONTFIX'], sName = 'bugDataTest500_2.csv', nMax = 500000)

#resultsFrame = pd.DataFrame(columns = ['dataset_exact10'])
#testModel(resultsFrame=resultsFrame, datasetName='dataset_exact10', modelName='model_opt_py')
#print()
#print(resultsFrame)
#makeDataset(datasetName='ExampleSet', data_dict=product_data, productKeys=product_keys)
#trainModel(datasetName='ExampleSet', modelName='ExampleModel')

#product_data, product_keys = prepareData(sName='bugDataTest500.csv', exactClasses=BaseComponents)

#product_data, product_keys = prepareData(sName='bugDataExactComponents100_2.csv', exactClasses=BaseComponents)


The next cell provides functionality for automatically training and saving models and for selecting the best model from the defined parameters.
This cell can interact with any model defined in a Python file that has the following functions:

def set_params(params):

def test_model(modelName, datasetName, px_test, py_test, sequenceInput = None):

The test_model function must return a float score.

A few models are provided with this notebook:

model_opt_py (CNN), model_rnn_py (RNN), model_rcnn_py (CNN + LSTM).

They can be used with the selectModel function in this cell.

In [None]:
# Disabled helper: the function below is wrapped in a string literal, so it is
# never defined. It splits 'name(a,b)' into ['name', 'a', 'b'].
'''
def getModelParamData(s):
    kStart = s.find('(')
    kEnd = s.rfind(')')
    res = []
    if kStart != -1 and kEnd != -1 and (kEnd > kStart + 1):
        res.append(s[:kStart])
        res += s[kStart + 1: kEnd].split(',')
    else:
        res.append(s)
    return res
'''

#CurrentDatasetName = 'dataset_exact10' + '_' + str(MAX_SEQUENCE_LENGTH) + '_' + str(EMBEDDING_DIM) + 'd'
#CurrentModelName = 'cnnOptExact10Local' + '_' + str(MAX_SEQUENCE_LENGTH) + '_' + str(EMBEDDING_DIM) + 'd'

#reloadSession()
# Hyperparameter grid: every key maps to a list of candidate values.
# selectModel() evaluates each combination of one value per key and passes it
# to the model module's set_params(); how each key is interpreted is up to
# that module.
ParamsVDict = {'batch_size': [32],
               'dropout': [0.15, 0.2],
               'conv_size': [3],
               'epochs': [6],
               'optimizer': ['nadam'], # other option noted by the author: rmsprop
               'conv_filters': [256],               
               'pooling': ['global'], # other option noted by the author: local
               'padding': ['same'], # other option noted by the author: valid
               'activation': ['relu', 'LeakyRelu(0.1)', 'LeakyRelu(0.2)'],
               'lstm_units': [80],
               'use_pretrained': [1], # use pretrained glove vector (1) or train from zero (0).
               'name_suffix': ['suffixCnne1'] # just a name suffix. May be used to train same models few times.
              }

def getParams(sName, index = 0, ModelParams = None):
    """Return one candidate value of a parameter, split on '_'.

    Args:
        sName: parameter name (key of ModelParams).
        index: position of the wanted value in the parameter's candidate list.
        ModelParams: dict mapping parameter names to candidate lists; when
            None, the module-level ParamsVDict is used.

    Returns:
        List of strings obtained by splitting the selected value on '_'.
        Non-string values (ints, floats) are converted with str() first, so
        e.g. 32 gives ['32'] instead of raising AttributeError.

    Raises:
        KeyError: sName is not in ModelParams.
        IndexError: index is outside the candidate list.
    """
    if ModelParams is None:
        # Resolved at call time so the grid currently defined in the notebook is used.
        ModelParams = ParamsVDict
    return str(ModelParams[sName][index]).split('_')

def selectModel(modelName = 'model_test_py', datasetName = 'dataset_exact10', modelsDir = MODELS_DIR_C):
    """Search ParamsVDict for the parameter combination with the best test score.

    modelName is the name of a python module (importable from modelsDir) that
    must provide two functions:

        def set_params(params):
        def test_model(modelName, datasetName, px_test, py_test, sequenceInput = None):

    test_model must return a float score (expected between 0 and 1); a higher
    score is better.

    Every combination of one value per key of ParamsVDict is tried, with the
    first key varying slowest and the last key fastest. For each combination
    the module is reloaded, configured through set_params and scored through
    test_model on the test split of datasetName. The best score and its
    parameters are printed at the end; a combination is only recorded when its
    score is greater than 0.

    If set_params raises, the module is assumed not to support parameters: the
    first combination is still scored (with whatever the module defaults to)
    and the search then stops.

    Args:
        modelName: module name of the model implementation.
        datasetName: dataset whose test split is used for scoring.
        modelsDir: directory added to sys.path to find the module.
    """
    # Local import: itertools is not among the notebook's top-level imports.
    import itertools

    _,_,_,_, x_test, y_test = dataset_load(datasetName)

    if modelsDir not in sys.path:
        sys.path.append(modelsDir)
    mfile = importlib.import_module(modelName)

    paramNames = list(ParamsVDict.keys())
    # itertools.product enumerates in the same order as decoding a running
    # index in a mixed-radix system: last parameter changes fastest.
    combinations = itertools.product(*(ParamsVDict[name] for name in paramNames))

    bestScore = 0
    bestParams = {}
    for index, combination in enumerate(combinations):
        currentParams = dict(zip(paramNames, combination))
        if not currentParams:
            # Empty grid: nothing to search.
            break
        print(currentParams)
        # Reload so that each run starts from the module's initial state.
        importlib.reload(mfile)
        try:
            mfile.set_params(currentParams)
        except Exception as exc:
            print('unsupported setting parameters. Finish after first iteration.')
            print(exc)
            if index > 0:
                break
        curScore = mfile.test_model(modelName, datasetName, x_test, y_test)
        if curScore > bestScore:
            bestScore = curScore
            bestParams = currentParams
            print(bestScore)
        gc.collect()

    print(bestScore)
    print(bestParams)
    
# Run the parameter search for the model_opt_py module on the dataset named by CurrentDatasetName.
selectModel(modelName = 'model_opt_py', datasetName=CurrentDatasetName)


The previous cell makes it possible to select the best parameters for almost any model.
The next cell, however, collects and saves information about the selected models, but only for Keras models, and assumes that each model's parameters are contained in its name, separated by the symbol '_'.


In [None]:

def makeModelName(params, modelName = 'model_opt_py', datasetName = CurrentDatasetName):
    """Compose a model name from the base name, the dataset name and parameter values.

    The result is '<modelName>_<datasetName>' followed by '_<value>' for every
    value of params, in the dict's iteration order.
    """
    nameParts = [modelName + '_' + datasetName]
    nameParts.extend(str(paramValue) for paramValue in params.values())
    return '_'.join(nameParts)

def calc_models_variants(x_test, y_test, sDir = MODELS_DIR_C, baseModelNames = ['model_opt_py'],
                         params=ParamsVDict, sNamePart = ''):
    """Evaluate saved model variants whose file names start with a base name.

    For every non-empty name in baseModelNames, each '*.json' file in sDir that
    starts with that name (and contains sNamePart, when given) is loaded with
    model_load, compiled and evaluated on (x_test, y_test).

    Args:
        x_test, y_test: test data passed to model.evaluate.
        sDir: directory scanned for saved models.
        baseModelNames: file-name prefixes selecting the models.
        params: not used by this function.
        sNamePart: optional substring the file name must contain.

    Returns:
        List of dicts, one per successfully evaluated model:
        {'path': sDir, 'model': file name without '.json',
         'params': the rest of the name after the prefix, split on '_',
         'results': value returned by model.evaluate}.
        Models that cannot be evaluated are reported and left out.
    """
    if len(baseModelNames) == 0:
        print('Warning: calc_model_variants: empty base models list')
        return []

    def isCandidate(fileName, prefix):
        # Regular '.json' file with the wanted prefix and, optionally, substring.
        if not os.path.isfile(os.path.join(sDir, fileName)):
            return False
        if not fileName.endswith('.json'):
            return False
        if not fileName.startswith(prefix):
            return False
        return sNamePart == '' or sNamePart in fileName

    evaluated = []
    for baseModelName in baseModelNames:
        if baseModelName == '':
            continue
        candidates = [fileName for fileName in os.listdir(sDir)
                      if isCandidate(fileName, baseModelName)]

        for processed, fileName in enumerate(candidates, start=1):
            savedName = fileName[:-len('.json')]
            model = model_load(savedName, sDir, bCompile=False)

            # Skip the separator when the prefix is followed directly by '_'.
            paramsStart = len(baseModelName)
            if fileName[paramsStart] == '_':
                paramsStart += 1
            paramTokens = savedName[paramsStart:].split('_')

            # entry layout: {'path', 'model', 'params', 'results'}
            entry = {'path': sDir, 'model': savedName, 'params': paramTokens}

            # The optimizer is inferred from the name; 'nadam' unless 'rmsprop' is listed.
            sOptimizer = 'rmsprop' if 'rmsprop' in paramTokens else 'nadam'
            model.compile(loss='categorical_crossentropy',
                          optimizer=sOptimizer,
                          metrics=['acc'])
            try:
                entry['results'] = model.evaluate(x_test, y_test)
                evaluated.append(entry)
            except Exception as exc:
                print('Cannot evaluate model with current test set: ' + savedName)
                print(exc)
            del model

            # Reset the Keras session after every tenth model.
            if processed % 10 == 0:
                reloadSession()
        reloadSession()
    return evaluated
    
def writeModelsData(modelsData, fData = 'modelsInfo.txt', sDir = LOGS_DIR_C, bHistory = True):
    """Write model evaluation records to a file in sDir.

    With bHistory true, each record of modelsData is appended to the file as
    an indented JSON object followed by a newline. Otherwise the file is
    overwritten with modelsData dumped as a single JSON document.
    """
    targetPath = os.path.join(sDir, fData)

    if not bHistory:
        with open(targetPath, 'wt') as out:
            json.dump(modelsData, out)
        return

    with open(targetPath, 'at') as out:
        for record in modelsData:
            json.dump(record, out, indent=4)
            print(file=out)
            
def loadModelsData(fData = 'modelsData.json', sDir = DATA_DIR_C):
    """Read the JSON file fData from sDir and return the parsed content."""
    sourcePath = os.path.join(sDir, fData)
    with open(sourcePath, 'rt') as src:
        loaded = json.load(src)
    return loaded
            

# Only the last two values returned by dataset_load (the test split) are kept.
_,_,_,_, x_test, y_test = dataset_load(CurrentDatasetName)

# Start from a fresh Keras session before the saved models are loaded.
reloadSession()

# Name of the file in LOGS_DIR_C that receives the models' history.
historyFile = 'modelsInfo.txt'

#BaseModelNames = ['model_rcnn_py', 'model_opt_py', 'model_rnn_py']
# Base model names whose saved variants are evaluated by the statements that follow.
BaseModelNames = ['model_opt_py']

def printBaseModelInfo():
    """Append a header with the base model names and the dataset name to the history file."""
    historyPath = os.path.join(LOGS_DIR_C, historyFile)
    with open(historyPath, 'at') as history:
        # Blank line separates this header from earlier content.
        print(file=history)
        for headerLine in ('Models based on: ', BaseModelNames):
            print(headerLine, file=history)
        print('Used dataset: ' + CurrentDatasetName, file=history)

# Disable this call for temporary test models or for models which have already been documented.
printBaseModelInfo()       

# Evaluate every saved model whose file name starts with '<base model>_<dataset>_'
# and append the results to the history file (writeModelsData appends by default).
modelsData = calc_models_variants(x_test, y_test, baseModelNames=[baseModelName + '_' + CurrentDatasetName + '_' 
                                                  for baseModelName in BaseModelNames])
writeModelsData(modelsData, historyFile, LOGS_DIR_C)
