# Lesson 05 - Basic Neural Network

In [None]:
import pandas as pd
import numpy as np

import sklearn

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.pipeline import Pipeline
from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, RobustScaler

from keras.models import Model
from keras.optimizers import Adam
from keras.layers import TimeDistributed, GlobalAveragePooling1D, Activation, Input, LSTM, GRU, Dense, Dropout, Flatten, Embedding, SpatialDropout1D, Bidirectional, CuDNNGRU
from keras.layers.convolutional import Conv1D, MaxPooling1D, AveragePooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers import Flatten, concatenate
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, activations
from keras.layers.recurrent import Recurrent
from keras.engine import InputSpec
from keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau, CSVLogger

import tensorflow as tf
from tensorflow import set_random_seed
from keras import backend as K

# GPU-only setup: build a TensorFlow session that grows GPU memory on demand
# with the 'BFC' allocator, and register it as the session Keras uses.
config = tf.ConfigProto()
gpu_options = config.gpu_options
gpu_options.allow_growth = True
gpu_options.allocator_type = 'BFC'
sess = tf.Session(config=config)
K.set_session(sess)

# Seed value shared by the NumPy / TensorFlow generators and the data split.
seed = 10293239

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current axes.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Rows are labelled as true classes and columns as
        predicted classes.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, every row is divided by its sum before plotting. Rows whose
        sum is zero are shown as all zeros instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Notes
    -----
    Nothing is returned; the function draws through the pyplot state machine.
    As a side effect it sets NumPy's global print precision to 2.
    """
    np.set_printoptions(precision=2)

    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; an empty class would
        # otherwise produce NaN cells and a NaN colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the label stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


## Load data and basic info

Let's load the same dataset as in Lesson 01.

In [None]:
# Read the training data; 'Opened' and 'Changed' are parsed as dates.
bugs = pd.read_csv(
    './data/bugs_train.csv',
    index_col=None,
    parse_dates=['Opened', 'Changed'],
)

In [None]:
# Preview the first four rows of the raw data.
bugs.head(n=4)

## The classification task (the problem to solve)

Our task remains the same for this lesson: we want to predict the resolution of a defect report (y) based on the description of the defect (X).

## Data preparation (features)

Let's quickly replicate processing of the Component and Severity features, as well as converting the decision class.

In [None]:
# Feature engineering: builds the feature frame X and the raw target y from `bugs`.
from sklearn.feature_extraction.text import CountVectorizer

# Keep only the columns used in this lesson; `bugs` itself is not modified below.
bugs_small = bugs[["Assignee", "Component", "Severity", "Status", "Priority", "Opened", "Changed", "Summary", "Resolution"]]

# Component: one-hot encoded (get_dummies returns a new frame)
bugs_small = pd.get_dummies(bugs_small, columns=['Component'], prefix="Component")

# Severity, Status, Priority: ordinal encoding.
# Note: a label that is missing from a mapping becomes NaN.
bugs_small['Severity'] = bugs_small['Severity'].map(
    {'enhancement':0, 'trivial':1, 'minor':2, 'normal':3, 'major':4, 'critical':5, 'blocker':6})

bugs_small['Status'] = bugs_small['Status'].map(
    {'VERIFIED':0, 'RESOLVED':1, 'CLOSED':2})

bugs_small['Priority'] = bugs_small['Priority'].map(
    {'P1':1, 'P2':2, 'P3':3, 'P4':4, 'P5':5})


y = bugs_small['Resolution']
X = bugs_small.drop(['Resolution'], axis=1)


# Days: whole days between opening and the last change (vectorised, no row-wise apply)
X['Days'] = (X['Changed'] - X['Opened']).dt.days
X = X.drop(["Changed", "Opened"], axis=1)

# Summary as BoW
X['Summary'] = X['Summary'].fillna('')

# create CountVectorizer class; we take only 30 most frequently appearing features
count_vect = CountVectorizer(max_features=30, stop_words="english")

# CountVectorizer fit method extracts vocabulary while transform performs the transformation. There is also
# the method fit_transform that does both. toarray() gives a plain ndarray rather than np.matrix.
bag_of_words = count_vect.fit_transform(X['Summary']).toarray()

# Column names ordered by the column index the vectorizer assigned to each term,
# so every name is guaranteed to label its own column.
colnames = ["Summary_"+term for term in sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)]

# Finally, we create a dataframe with bag of words features
summary_bow = pd.DataFrame(bag_of_words, columns=colnames)
X = pd.concat([X.reset_index(drop=True), summary_bow], axis=1)
X = X.drop(["Summary"], axis=1)

# Assignee: two 0/1 flags derived from the suffix of the assignee name.
# na=False turns a missing assignee into 0 instead of raising.
inbox = X['Assignee'].str.endswith('-inbox', na=False).astype(int)
X.insert(loc=0, column='Assignee_Inbox', value=inbox)

eclipse = X['Assignee'].str.endswith('eclipse', na=False).astype(int)
X.insert(loc=0, column='Assignee_Eclipse', value=eclipse)
X = X.drop(["Assignee"], axis=1)


In [None]:
# Preview the first four rows of the engineered features.
X.head(n=4)

Let's simplify the problem to binary classification: the resolution is either FIXED or NOT FIXED.

In [None]:
# Binary target: 1 for a "FIXED" resolution, 0 for any other value.
y_binary = np.array([int(resolution == "FIXED") for resolution in y])
y_binary

## A single-layer perceptron

In [None]:
def get_model(n_features=None):
    """
    Build and compile a network with no hidden layer: the input feeds a
    single sigmoid unit.

    Parameters
    ----------
    n_features : int
        Number of input features (width of the input layer). Must be a
        positive number; the ``None`` default exists only so the function can
        be passed as a ``build_fn`` and has to be overridden.

    Returns
    -------
    keras.models.Model
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        the accuracy metric.

    Raises
    ------
    ValueError
        If ``n_features`` is ``None`` or smaller than 1.
    """
    # Fail early with a clear message instead of an error from inside Keras.
    if n_features is None or n_features < 1:
        raise ValueError(
            "n_features must be a positive integer, got {!r}".format(n_features))

    # Re-seed NumPy and TensorFlow from the module-level `seed` on every build.
    np.random.seed(seed)
    set_random_seed(seed)

    main_input = Input(shape=(n_features,), name='input')
    output = Dense(1, activation="sigmoid")(main_input)

    model = Model([main_input], output)
    algorithm = Adam(lr=0.005, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=algorithm, loss='binary_crossentropy', metrics=['accuracy'])

    return model

def get_model_1l(n_features=None):
    """
    Build and compile a network with one hidden layer: 10 ReLU units,
    dropout of 0.2, then a single sigmoid output unit.

    Parameters
    ----------
    n_features : int
        Number of input features (width of the input layer). Must be a
        positive number; the ``None`` default exists only so the function can
        be passed as a ``build_fn`` and has to be overridden.

    Returns
    -------
    keras.models.Model
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        the accuracy metric.

    Raises
    ------
    ValueError
        If ``n_features`` is ``None`` or smaller than 1.
    """
    # Fail early with a clear message instead of an error from inside Keras.
    if n_features is None or n_features < 1:
        raise ValueError(
            "n_features must be a positive integer, got {!r}".format(n_features))

    # Re-seed NumPy and TensorFlow from the module-level `seed` on every build.
    np.random.seed(seed)
    set_random_seed(seed)

    main_input = Input(shape=(n_features,), name='input')
    hidden = Dense(10, activation="relu")(main_input)
    hidden = Dropout(0.2)(hidden)
    output = Dense(1, activation="sigmoid")(hidden)

    model = Model([main_input], output)
    algorithm = Adam(lr=0.005, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=algorithm, loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Seed both random generators before the pipeline is assembled.
np.random.seed(seed)
set_random_seed(seed)

# Learning-rate schedule: monitors 'loss' with factor 0.5, patience 10
# and a floor of 0.001.
lr_on_plateau = ReduceLROnPlateau(
    monitor='loss',
    min_lr=0.001,
    factor=0.5,
    verbose=1,
    patience=10,
)
callbacks_list = [lr_on_plateau]

# Keras model wrapped as a scikit-learn classifier; n_features is forwarded to get_model.
classifier = KerasClassifier(
    build_fn=get_model,
    epochs=20,
    batch_size=128,
    verbose=2,
    callbacks=callbacks_list,
    n_features=X.shape[1],
)

# Scale the features with MinMaxScaler, then classify.
pipeline = [
    ('minmax', MinMaxScaler()),
    ('classifier', classifier),
]

model = Pipeline(pipeline)

Let's perform a simple evaluation using a train-test split (it is much faster than waiting for cross-validation).

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing, stratified on the binary target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    stratify=y_binary,
    random_state=seed,
    test_size=0.33,
)

In [None]:
# Fit the whole pipeline (scaler + classifier) on the training split.
model.fit(X_train, y=y_train)

Let's predict the output.

In [None]:
# Predictions of the fitted pipeline for the held-out rows.
y_pred = model.predict(X=X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

acc = accuracy_score(y_test, y_pred)

# Precision, recall and F1 are all computed with average='macro'.
prec, rec, f1 = [
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
]

"Accuracy = {:.3f}, Precision = {:.3f}, Recall = {:.3f}, F1-score = {:.3f}".format(acc, prec, rec, f1)

In [None]:
from sklearn.metrics import confusion_matrix

cnf_matrix = confusion_matrix(y_test, y_pred)
class_names = ("NOT FIXED", "FIXED")

# Two panels side by side: raw counts on the left, row-normalized on the right.
panels = [
    (False, 'Confusion matrix, without normalization'),
    (True, 'Normalized confusion matrix'),
]

plt.figure(figsize=(15,6))
for position, (normalized, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=normalized,
                          title=panel_title)