In [1]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Input, Embedding, Dense

np.random.seed(7)
%matplotlib inline 

In [23]:
# PeriodicLogger — helper logger class at the end of each epoch

class PeriodicLogger(Callback):
    """
    A helper callback class that only prints the losses once in 'display' epochs
    """    
    def __init__(self, display=100):
        self.display = display
    def on_train_begin(self, logs={}):
        self.epochs = 0
    def on_epoch_end(self, batch, logs={}):
        self.epochs += 1
        if self.epochs % self.display == 0:
            print("Epoch: %d - loss: %f - val_loss: %f" % (self.epochs, logs['loss'], logs['val_loss']))

periodic_logger_250 = PeriodicLogger(250)

### loading, cleaning data & features engineering

In [2]:
orig_data = pd.read_csv('../SiP_dataset-master/Sip-task-info.csv', encoding='cp1252') 
orig_data.head()

Unnamed: 0,TaskNumber,Summary,Priority,RaisedByID,AssignedToID,AuthorisedByID,StatusCode,ProjectCode,ProjectBreakdownCode,Category,SubCategory,HoursEstimate,HoursActual,DeveloperID,DeveloperHoursActual,TaskPerformance,DeveloperPerformance
0,1735,Flag RI on SCM Message Summary screen using me...,1,58,58,6.0,FINISHED,PC2,PBC42,Development,Enhancement,14.0,1.75,58,1.75,12.25,12.25
1,1742,Allow RI Policies to be marked as Exhausted,1,58,42,6.0,FINISHED,PC2,PBC21,Development,Enhancement,7.0,7.0,42,7.0,0.0,0.0
2,1971,Fix Invalid UWREF Line DX402L99A1N,2,7,58,6.0,FINISHED,PC2,PBC75,Operational,In House Support,0.7,0.7,58,0.7,0.0,0.0
3,2134,New rows in the diary event for the SCM are re...,5,50,42,6.0,FINISHED,PC2,PBC42,Development,Bug,0.7,0.7,42,0.7,0.0,0.0
4,2251,Application Screen Size - Need to set Min Size...,10,46,13,6.0,FINISHED,PC2,PBC21,Development,Bug,3.5,3.5,13,3.5,0.0,0.0


In [3]:
orig_data.describe()

Unnamed: 0,TaskNumber,Priority,RaisedByID,AssignedToID,AuthorisedByID,HoursEstimate,HoursActual,DeveloperID,DeveloperHoursActual,TaskPerformance,DeveloperPerformance
count,12299.0,12299.0,12299.0,12299.0,4265.0,12299.0,12299.0,12299.0,12299.0,12299.0,10200.0
mean,8932.838686,2.176356,37.103017,37.296772,45.667526,10.152034,13.175468,36.627449,7.724366,-3.023434,0.467541
std,3271.407366,1.720049,19.85953,18.823674,18.773525,28.841783,68.724667,18.958718,36.015641,67.219867,38.666102
min,1735.0,1.0,1.0,1.0,6.0,0.01,0.01,1.0,0.01,-2469.16,-2079.5
25%,6188.5,1.0,16.0,22.0,43.0,1.0,1.0,13.0,1.0,-1.75,0.0
50%,9090.0,1.0,42.0,42.0,58.0,3.0,3.0,42.0,2.0,0.0,0.0
75%,11766.5,3.0,58.0,58.0,58.0,7.0,8.5,58.0,5.25,0.5,1.0
max,14377.0,10.0,67.0,65.0,58.0,910.0,2490.16,65.0,2100.5,701.14,709.84


In [4]:
relevent_attributes=["Priority","RaisedByID","AssignedToID","AuthorisedByID",\
                     "StatusCode","ProjectCode","Category","SubCategory","HoursEstimate",\
                     "HoursActual"]

relevent_data = orig_data[relevent_attributes]
relevent_data.head()

Unnamed: 0,Priority,RaisedByID,AssignedToID,AuthorisedByID,StatusCode,ProjectCode,Category,SubCategory,HoursEstimate,HoursActual
0,1,58,58,6.0,FINISHED,PC2,Development,Enhancement,14.0,1.75
1,1,58,42,6.0,FINISHED,PC2,Development,Enhancement,7.0,7.0
2,2,7,58,6.0,FINISHED,PC2,Operational,In House Support,0.7,0.7
3,5,50,42,6.0,FINISHED,PC2,Development,Bug,0.7,0.7
4,10,46,13,6.0,FINISHED,PC2,Development,Bug,3.5,3.5


In [5]:
# filling emtpy cells with 0
relevent_data = relevent_data.fillna(0)
relevent_data.describe()

Unnamed: 0,Priority,RaisedByID,AssignedToID,AuthorisedByID,HoursEstimate,HoursActual
count,12299.0,12299.0,12299.0,12299.0,12299.0,12299.0
mean,2.176356,37.103017,37.296772,15.836409,10.152034,13.175468
std,1.720049,19.85953,18.823674,24.385592,28.841783,68.724667
min,1.0,1.0,1.0,0.0,0.01,0.01
25%,1.0,16.0,22.0,0.0,1.0,1.0
50%,1.0,42.0,42.0,0.0,3.0,3.0
75%,3.0,58.0,58.0,43.0,7.0,8.5
max,10.0,67.0,65.0,58.0,910.0,2490.16


In [6]:
# split df to train & val dfs
msk = np.random.rand(len(relevent_data)) < 0.8
train = relevent_data[msk]
val = relevent_data[~msk]

In [7]:
train.describe()

Unnamed: 0,Priority,RaisedByID,AssignedToID,AuthorisedByID,HoursEstimate,HoursActual
count,9820.0,9820.0,9820.0,9820.0,9820.0,9820.0
mean,2.177291,37.169552,37.408248,15.888798,9.979885,12.619523
std,1.726613,19.93165,18.848608,24.425895,28.810684,59.671232
min,1.0,1.0,1.0,0.0,0.01,0.01
25%,1.0,16.0,22.0,0.0,1.0,1.0
50%,1.0,42.0,42.0,0.0,3.0,3.0
75%,3.0,58.0,58.0,43.0,7.0,8.5
max,10.0,67.0,65.0,58.0,910.0,2490.16


In [8]:
val.describe()

Unnamed: 0,Priority,RaisedByID,AssignedToID,AuthorisedByID,HoursEstimate,HoursActual
count,2479.0,2479.0,2479.0,2479.0,2479.0,2479.0
mean,2.17265,36.839451,36.855184,15.628883,10.833965,15.377717
std,1.694137,19.572985,18.721839,24.22907,28.960418,96.565929
min,1.0,1.0,1.0,0.0,0.05,0.05
25%,1.0,16.0,13.0,0.0,1.0,1.0
50%,1.0,42.0,42.0,0.0,3.0,3.0
75%,3.0,58.0,58.0,43.0,7.0,8.725
max,10.0,66.0,65.0,58.0,700.0,2490.16


In [9]:
# Define the features and the Y vectors
continuous_cols = ['HoursEstimate']
categorical_cols = ['Priority', 'RaisedByID', 'AssignedToID', 'AuthorisedByID',
                    'StatusCode', 'ProjectCode', 'Category', 'SubCategory']
y_col = ['HoursActual']

In [10]:
# separate the continuous and categorical data
X_train_continuous = train[continuous_cols]
X_train_categorical = train[categorical_cols]
y_train = train[y_col]

X_val_continuous = val[continuous_cols]
X_val_categorical = val[categorical_cols]
y_val = val[y_col]

In [11]:
# Normalizing both train and test sets to have 0 mean and std. of 1 using the train set mean and std.
# This will give each feature an equal initial importance and speed up the training time
train_mean = X_train_continuous.mean(axis=0)
train_std = X_train_continuous.std(axis=0)

X_train_continuous = X_train_continuous - train_mean
X_train_continuous /= train_std

X_val_continuous = X_val_continuous - train_mean
X_val_continuous /= train_std

In [13]:
# Build a model using a categorical variable
class EmbeddingMapping():
    """
        Helper class for handling categorical variables
        An instance of this class should be defined for each categorical variable we want to use.
    """
    def __init__(self, series):
        # get a list of unique values
        values = series.unique().tolist()

        # Set a dictionary mapping from values to integer value
        # In our example this will be {'Development': 1, 'Old Operational': 2...}
        self.embedding_dict = {value: int_value + 1 for int_value, value in enumerate(values)}

        # The num_values will be used as the input_dim when defining the embedding layer. 
        # It will also be returned for unseen values 
        self.num_values = len(values) + 1
    
    def get_mapping(self, value):        
        # If the value was seen in the training set, return its integer mapping
        if value in self.embedding_dict:
            return self.embedding_dict[value]

        # Else, return the same integer for unseen values
        else:
            return self.num_values

In [14]:
# Create an embedding column for the train/validation sets
# todo: rewrite -> run a loop on `categorical_cols`. fix issue with name in pd.assign func.
    
cat_mapping = EmbeddingMapping(X_train_categorical['Priority'])
X_train_categorical = X_train_categorical.assign(Priority_mapping=X_train_categorical[
'Priority'].apply(cat_mapping.get_mapping))
X_val_categorical = X_val_categorical.assign(Priority_mapping=X_val_categorical[
'Priority'].apply(cat_mapping.get_mapping))

cat_mapping = EmbeddingMapping(X_train_categorical['RaisedByID'])
X_train_categorical = X_train_categorical.assign(RaisedByID_mapping=X_train_categorical[
'RaisedByID'].apply(cat_mapping.get_mapping))
X_val_categorical = X_val_categorical.assign(RaisedByID_mapping=X_val_categorical[
'RaisedByID'].apply(cat_mapping.get_mapping))

cat_mapping = EmbeddingMapping(X_train_categorical['AssignedToID'])
X_train_categorical = X_train_categorical.assign(AssignedToID_mapping=X_train_categorical[
'AssignedToID'].apply(cat_mapping.get_mapping))
X_val_categorical = X_val_categorical.assign(AssignedToID_mapping=X_val_categorical[
'AssignedToID'].apply(cat_mapping.get_mapping))

cat_mapping = EmbeddingMapping(X_train_categorical['AuthorisedByID'])
X_train_categorical = X_train_categorical.assign(AuthorisedByID_mapping=X_train_categorical[
'AuthorisedByID'].apply(cat_mapping.get_mapping))
X_val_categorical = X_val_categorical.assign(AuthorisedByID_mapping=X_val_categorical[
'AuthorisedByID'].apply(cat_mapping.get_mapping))

cat_mapping = EmbeddingMapping(X_train_categorical['StatusCode'])
X_train_categorical = X_train_categorical.assign(StatusCode_mapping=X_train_categorical[
'StatusCode'].apply(cat_mapping.get_mapping))
X_val_categorical = X_val_categorical.assign(StatusCode_mapping=X_val_categorical[
'StatusCode'].apply(cat_mapping.get_mapping))

cat_mapping = EmbeddingMapping(X_train_categorical['ProjectCode'])
X_train_categorical = X_train_categorical.assign(ProjectCode_mapping=X_train_categorical[
'ProjectCode'].apply(cat_mapping.get_mapping))
X_val_categorical = X_val_categorical.assign(ProjectCode_mapping=X_val_categorical[
'ProjectCode'].apply(cat_mapping.get_mapping))

cat_mapping = EmbeddingMapping(X_train_categorical['Category'])
X_train_categorical = X_train_categorical.assign(Category_mapping=X_train_categorical[
'Category'].apply(cat_mapping.get_mapping))
X_val_categorical = X_val_categorical.assign(Category_mapping=X_val_categorical[
'Category'].apply(cat_mapping.get_mapping))

cat_mapping = EmbeddingMapping(X_train_categorical['SubCategory'])
X_train_categorical = X_train_categorical.assign(SubCategory_mapping=X_train_categorical[
'SubCategory'].apply(cat_mapping.get_mapping))
X_val_categorical = X_val_categorical.assign(SubCategory_mapping=X_val_categorical[
'SubCategory'].apply(cat_mapping.get_mapping))

In [15]:
X_train_categorical.head()

Unnamed: 0,Priority,RaisedByID,AssignedToID,AuthorisedByID,StatusCode,ProjectCode,Category,SubCategory,Priority_mapping,RaisedByID_mapping,AssignedToID_mapping,AuthorisedByID_mapping,StatusCode_mapping,ProjectCode_mapping,Category_mapping,SubCategory_mapping
0,1,58,58,6.0,FINISHED,PC2,Development,Enhancement,1,1,1,1,1,1,1,1
1,1,58,42,6.0,FINISHED,PC2,Development,Enhancement,1,1,2,1,1,1,1,1
2,2,7,58,6.0,FINISHED,PC2,Operational,In House Support,2,2,1,1,1,1,2,2
3,5,50,42,6.0,FINISHED,PC2,Development,Bug,3,3,2,1,1,1,1,3
5,1,13,13,58.0,FINISHED,PC9,Development,Enhancement,1,4,3,2,1,2,1,1


In [16]:
X_train_continuous.head()

Unnamed: 0,HoursEstimate
0,0.139536
1,-0.10343
2,-0.322099
3,-0.322099
5,-0.10343


### define model & training process

In [84]:
# Define the input layers
# Define the embedding input

area_input = Input(shape=(X_train_categorical.shape[1] // 2,), dtype='int32') 
embeddings_output = 10
area_embedings = Embedding(output_dim=embeddings_output, input_dim=cat_mapping.num_values, input_length=X_train_categorical.shape[1] // 2)(area_input)
area_embedings = keras.layers.Reshape((embeddings_output,))(area_embedings)

# Define the continuous variables input (just like before)
continuous_input = Input(shape=(X_train_continuous.shape[1], ))

# Concatenate continuous and embeddings inputs
all_input = keras.layers.concatenate([continuous_input, area_embedings])

In [85]:
# We’ll define a simple model with 2 hidden layers, with 25 neurons each.
# Define the model

units=25
dense1 = Dense(units=units, activation='relu')(all_input)
dense2 = Dense(units, activation='relu')(dense1)
predictions = Dense(1)(dense2)
# Note using the input object 'area_input' not 'area_embeddings'
model = Model(inputs=[continuous_input, area_input], outputs=predictions)

In [86]:
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, 8)]          0                                            
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 8, 10)        250         input_25[0][0]                   
__________________________________________________________________________________________________
input_26 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
reshape_11 (Reshape)            (None, 10)           0           embedding_11[0][0]               
___________________________________________________________________________________________

In [87]:
X_train_categorical.shape, X_train_continuous.shape

((9820, 16), (9820, 1))

In [88]:
epochs = 10000
model.compile(loss=
'mse', optimizer=keras.optimizers.Adam(lr=.8, beta_1=0.9, beta_2=0.999, decay=1e-03, amsgrad=True))

# Note continuous and categorical columns are inserted in the same order as defined in all_inputs
history = model.fit([X_train_continuous, X_train_categorical['Priority_mapping'],
                    X_train_categorical['RaisedByID_mapping'], X_train_categorical['AssignedToID_mapping'],
                    X_train_categorical['AuthorisedByID_mapping'], X_train_categorical['AuthorisedByID_mapping'],
                    X_train_categorical['Category_mapping'], X_train_categorical['SubCategory_mapping']],
                    y_train, epochs=epochs, batch_size=128, callbacks=[periodic_logger_250],
                    validation_data=([X_val_continuous, X_val_categorical['Priority_mapping'],
                    X_val_categorical['RaisedByID_mapping'], X_val_categorical['AssignedToID_mapping'],
                    X_val_categorical['AuthorisedByID_mapping'], X_val_categorical['AuthorisedByID_mapping'],
                    X_val_categorical['Category_mapping'], X_val_categorical['SubCategory_mapping']], y_val),
                   verbose=1)

ValueError: Error when checking input: expected input_25 to have shape (8,) but got array with shape (1,)