In [1]:
import keras
from keras.layers import Dense, Flatten
from keras.layers import MaxPooling1D, Conv1D
from keras.models import Sequential
from keras.models import load_model
import pandas as pd
import numpy as np
from df_sequence import DfSequence
import constants as const

Using TensorFlow backend.


In [2]:
# --- Input geometry (values come from the shared `constants` module) ---
alphabet_size = const.ALPHABET_SIZE  # Length of one one-hot encoded character vector
max_length = const.MAX_LENGTH  # Inputs are padded up to this many characters
input_shape = (max_length, alphabet_size)  # Shape of a single sample (batch axis not included)

# --- Network hyperparameters ---
kernel_width = 3  # Characters covered by one convolution filter (1-D filters have no height)
num_classes = 2  # Number of output categories

# --- Training hyperparameters ---
epochs = 4  # Full passes over the training set
batch_size = 100  # Samples fed to the network at once

Note: even though we are dealing with 1-dimensional data (text), the neural network receives a 3-dimensional array of shape \[batch_size, maximum_input_length, alphabet_size\] (Keras always puts the batch axis first).

During data preparation step all characters are transliterated (if needed) and encoded as numbers.

Let's imagine that our input is the string 'Аба'.

So 'Аба' turns into 'aba' and then into the list \[1, 2, 1\]. The same happens to the labels: company becomes 0 and person becomes 1.

Just before being fed to the neural network, the input data is padded to the maximum input length and one-hot encoded. One-hot encoding avoids creating false relationships: 'a' may be encoded as 1 and 'b' as 2, but in no way is 'b' two times larger than 'a'.

So, after one-hot encoding, \[1, 2, 1\] is turned into the following array (let's imagine that the maximum input length is 10 and the alphabet size is 5, even though they are actually 150 and 29):

\[ <br>
  \[1, 0, 0, 0, 0\],<br>
  \[0, 1, 0, 0, 0\],<br>
  \[1, 0, 0, 0, 0\],<br>
  \[0, 0, 0, 0, 0\],<br>
  \[0, 0, 0, 0, 0\],<br>
  \[0, 0, 0, 0, 0\],<br>
  \[0, 0, 0, 0, 0\],<br>
  \[0, 0, 0, 0, 0\],<br>
  \[0, 0, 0, 0, 0\],<br>
  \[0, 0, 0, 0, 0\]<br>
\]

The data is being fed into network in batches. The larger the batch the more memory it takes.

In [3]:
# Load the pickled training and testing sets into pandas DataFrames.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
train_df, test_df = map(
    pd.read_pickle,
    ('../prepared_data/train_df_f.pkl', '../prepared_data/test_df_f.pkl'),
)

In [4]:
# Build the character-level CNN as an ordered stack of layers.
# Sequential is the Keras model type for a plain layer-after-layer network.
# Shapes quoted in the comments leave out the batch axis and assume
# max_length == 150.
model = Sequential([
    # 1-D convolution: every filter spans `kernel_width` characters and the
    # whole alphabet axis, and slides along the text only.
    # A width-3 filter fits 148 positions in a length-150 input and emits one
    # value per position; 32 filters give 32 channels.
    # Output shape: [148, 32] (Keras works this out automatically).
    Conv1D(
        filters=32,  # Each filter produces one channel in the next layer
        kernel_size=kernel_width,
        strides=1,  # Move one character per step
        activation='relu',  # Zeroes negative values, passes positive ones through unchanged
        input_shape=input_shape,
    ),

    # Keeps the larger value of every pair of neighbouring positions,
    # halving the length. Output shape: [74, 32]
    MaxPooling1D(pool_size=2),

    # Second convolution: same idea, but with 64 filters, each of which looks
    # at all 32 incoming channels at once. Output shape: [72, 64]
    Conv1D(filters=64, kernel_size=kernel_width, activation='relu'),

    # Output shape: [36, 64]
    MaxPooling1D(pool_size=2),

    # Unrolls everything into a single vector of 36 x 64 = 2304 values.
    Flatten(),

    # Fully connected layer: every value from the previous layer feeds every
    # one of the 1000 neurons, followed by a relu activation.
    Dense(1000, activation='relu'),

    # Fully connected output layer with one neuron per class. Softmax rescales
    # the outputs so that each lies between 0 and 1 and together they sum to 1.
    Dense(num_classes, activation='softmax'),
])

# Print the layer table. `None` in the shapes is the batch size, which is not
# known at this point.
model.summary()

model.compile(
    loss=keras.losses.categorical_crossentropy,  # Loss function for classification
    optimizer=keras.optimizers.Adam(),  # Adam optimizer with its default settings
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 148, 32)           2816      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 74, 32)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 72, 64)            6208      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 36, 64)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              2305000   
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 2002      
Total para

In [5]:
# Wrap the training and testing dataframes in generators that pad and one-hot
# encode the data and hand it to the network in batches of the specified shape.
train_sequence, test_sequence = (
    DfSequence(frame, batch_size, max_length, alphabet_size, num_classes)
    for frame in (train_df, test_df)
)

In [6]:
# Train the model.
#
# How to choose the number of epochs: keep raising it experimentally for as
# long as the training accuracy (acc) and the testing accuracy (val_acc) grow
# together. Once val_acc starts to fall while acc keeps rising, the model is
# overfitting: stop there and use the model from the previous epoch.
model.fit_generator(
    train_sequence,
    validation_data=test_sequence,
    epochs=epochs,
    verbose=1,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fc6ea674dd8>

In [7]:
# Final check of the model on a data set it has never seen before.
validation_df = pd.read_pickle('../prepared_data/validation_df_f.pkl')
validation_sequence = DfSequence(
    validation_df, batch_size, max_length, alphabet_size, num_classes
)

# score[0] is the loss and score[1] is the accuracy metric requested at compile time.
score = model.evaluate_generator(validation_sequence, verbose=0)
for label, value in zip(('Test loss:', 'Test accuracy:'), score):
    print(label, value)

Test loss: 0.03253412965392206
Test accuracy: 0.988556068515164


In [9]:
# Write the trained model to an HDF5 file so it can be reloaded later.
model.save(filepath='../models/company_person_kg.h5')