This code trains a model that classifies a name as valid or invalid. Our target is a model with high precision in identifying invalid names, because we cannot take the chance of an invalid name being classified as valid.

In [2]:
import numpy as np 
import random 
import string
import functools
import pandas as pd
from sklearn.utils import shuffle
import re 
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")

In [3]:
import tensorflow as tf
from keras.models import load_model
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers import Bidirectional
from keras.layers import Dropout
from keras.layers.normalization import BatchNormalization
import functools
from keras.preprocessing.text import Tokenizer
from keras.callbacks import Callback 
from keras.utils.np_utils import to_categorical
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
def classification_and_confusion_report(actual_label,predicted_label,threshold):
    """Print the confusion matrix and the classification report.

    Scores in ``predicted_label`` that are strictly greater than
    ``threshold`` are mapped to class 1 and all others to class 0 before
    being compared with ``actual_label``.  Nothing is returned.
    """
    binarized = np.where(predicted_label > threshold, 1, 0)
    report_text = classification_report(actual_label, binarized)
    matrix = confusion_matrix(actual_label, binarized)
    # Confusion matrix first, then the per-class precision/recall table.
    for section in (matrix, report_text):
        print(section)

The following preprocessing function uses regular expressions to clean the names in our dataset.
Names are lower-cased, and all individually occurring letters (initials) are deleted together with the spaces or dots around them; digits, dots and commas are removed as well.


In [5]:
# Normalise the names so the network sees less variance during training.
def preprocessing(df):
    """Lower-case and clean the ``name`` column of ``df`` in place.

    Every name goes through the same ordered steps: removal of single
    letters standing alone (between whitespace, between dots, at the start
    followed by a dot or whitespace, or at the end after whitespace) together
    with the separators matched around them, removal of digits, removal of
    dots and commas, and finally trimming of surrounding whitespace.

    ``df`` itself is modified and also returned.
    """
    # Applied in this exact order; each match is replaced by the empty string.
    removal_patterns = (
        r"\s[a-z]\s",
        r"\.[a-z]\.",
        r"^[a-z]\.",
        r"^[a-z]\s",
        r"\s[a-z]$",
        r"\d",
    )

    def clean(name):
        for pattern in removal_patterns:
            name = re.sub(pattern, "", name)
        return name.replace(".", "").replace(",", "").strip()

    df["name"] = df["name"].str.lower()
    df["name"] = df["name"].apply(clean)
    return df

random_name_given_length() generates a random name of a given length from the lower-case English letters plus a space and a dot. The probability of a randomly generated name being valid is very low, so all of them are tagged as invalid. The invalid names are generated with the same distribution of lengths as the names in our valid_name dataset.

In [6]:
def random_name_given_length(length):
    """Return a random string made of ``length`` characters.

    Each character is drawn independently with ``random.choice`` from the
    lower-case ASCII letters followed by ``'.'`` and ``' '``.
    """
    alphabet = [*string.ascii_lowercase, '.', ' ']
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

Complete data is loaded with valid + invalid names.

In [7]:
def loadData(times, path="names.csv"):
    """Load the valid names and add randomly generated invalid ones.

    Parameters
    ----------
    times : int
        Number of random (invalid) names generated for every valid name.
    path : str, optional
        CSV file with the valid names; it must contain a ``name`` column.
        Defaults to ``"names.csv"``.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame holding the valid rows (``is_valid`` = 1) and the
        generated rows (``is_valid`` = 0), with ``name`` preprocessed.
    """
    valid_data = pd.read_csv(path)
    valid_data["is_valid"] = 1
    # Rows with any missing value are discarded.
    valid_data.dropna(inplace=True)

    # Create invalid names for training: one random string per valid name and
    # per pass, with the same length as that valid name.  The subsets are
    # collected and concatenated once instead of growing a frame in the loop.
    invalid_subsets = []
    for _ in range(times):
        # reset_index() turns the Series into a frame; it also carries the
        # old index along as an "index" column.
        subset = valid_data["name"].apply(
            lambda x: random_name_given_length(len(x))
        ).reset_index()
        subset["is_valid"] = 0
        invalid_subsets.append(subset)

    # Valid rows first, then the generated ones, with a fresh 0..n-1 index.
    data = pd.concat([valid_data] + invalid_subsets, ignore_index=True)

    # The generated names are cleaned exactly like the real ones.
    data = preprocessing(data)

    return shuffle(data)

In [8]:
def as_keras_metric(method):
    """Adapt a TensorFlow metric function so it can be passed to Keras.

    ``method`` must return a ``(value, update_op)`` pair when called; the
    returned wrapper hands back a single tensor instead.
    NOTE(review): presumably Keras calls the wrapper as
    ``metric(y_true, y_pred)``, so ``self``/``args`` would be the labels and
    the predictions -- confirm against the Keras version in use.
    """
    @functools.wraps(method)
    def wrapper(self, args, **kwargs):
        """ Wrapper for turning tensorflow metrics into keras metrics """
        # Both positional arguments are forwarded unchanged to `method`.
        value, update_op = method(self, args, **kwargs)
        # Initialise TensorFlow's local variables in the Keras backend session.
        K.get_session().run(tf.local_variables_initializer())
        # Make the returned tensor depend on update_op so that the update runs
        # whenever the metric value is evaluated.
        with tf.control_dependencies([update_op]):
            value = tf.identity(value)
        return value
    return wrapper

Names are padded or truncated to a fixed length of 25 characters by setting MAX_SEQUENCE_LENGTH=25; shorter names are left-padded with zeros.

In [9]:
# Every name is encoded as exactly this many character indices.
MAX_SEQUENCE_LENGTH = 25


def char2sequence(df,flag):
    """Encode names as fixed-length integer sequences.

    Uses the module-level ``tokenizer``.  When ``flag`` is ``'train'`` the
    tokenizer is first fitted on the given names; for any other value it is
    only applied.  The sequences are then padded/truncated to
    ``MAX_SEQUENCE_LENGTH``.

    Returns the padded sequences and the tokenizer's ``word_index``.
    """
    texts = df.astype(str)
    if flag == 'train':
        tokenizer.fit_on_texts(texts)

    encoded = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)
    return padded, tokenizer.word_index
    

Assuming that "Gupta Mahima" is a valid name whenever "Mahima Gupta" is valid, more valid names can be created by shuffling the words of the given ones; this augmentation is done only on the training part of the dataset.
The complete dataset is also divided into a train set and a test set.

In [10]:
def get_train_test_array(data,test_fraction,augmentation):
    """Split ``data`` into train/test parts and encode the names.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``name`` and an ``is_valid`` column.
    test_fraction : float
        NOTE: despite its name, this is the probability with which a row is
        placed in the *training* set (rows whose random draw is below it go
        to train, the rest to test).  The behaviour is kept for existing
        callers.
    augmentation : bool
        When true, a word-shuffled copy of every valid training name is
        added to the training set.

    Returns
    -------
    tuple
        ``(train_features, train_labels, test_features, test_labels,
        word_index)``.
    """

    def shuffle_words(name):
        # Same words, random order: "mahima gupta" -> "gupta mahima".
        words = name.split()
        return ' '.join(random.sample(words, len(words)))

    # Random row-wise split.
    msk = np.random.rand(len(data)) < test_fraction
    train = data[msk]
    test = data[~msk]

    if augmentation:
        # Only training data is augmented, so the test set stays untouched.
        data_augmented = pd.DataFrame()
        data_augmented["name"] = train.loc[train["is_valid"] == 1]["name"].apply(shuffle_words)
        data_augmented["is_valid"] = 1
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent and keeps the original row labels as append did.
        train = pd.concat([train, data_augmented])

    # The tokenizer is fitted on the training names only ('train' flag).
    train_features, word_index = char2sequence(train.name,'train')
    train_labels = np.array(train["is_valid"])

    test_features, word_index = char2sequence(test.name,'test')
    test_labels = np.array(test["is_valid"])

    return train_features,train_labels,test_features,test_labels,word_index

In [11]:
# Build the full dataset; two random invalid names are generated per valid name.
data = loadData(2)

word_index is the vocabulary of our train data

In [12]:
# NOTE(review): get_train_test_array sends the rows whose random draw is below
# this value to the *training* set, so 0.7 gives roughly 70% train / 30% test.
test_fraction=0.7
# Character-level tokenizer shared by char2sequence.
# NOTE(review): num_words limits encoding to the most frequent characters;
# passing MAX_SEQUENCE_LENGTH (a sequence length) as that limit looks
# unintended.  Confirm before changing: the Embedding layer is sized with
# len(word_index), which presumably would not cover the highest index if the
# limit were lifted.
tokenizer = Tokenizer(num_words=MAX_SEQUENCE_LENGTH, char_level=True)
# Third argument False: no augmentation of the training names.
train_x,train_y,test_x,test_y,word_index = get_train_test_array(data,test_fraction,False)

In [15]:
# Notebook display: the padded, integer-encoded training names.
train_x

array([[ 0,  0,  0, ...,  2, 10, 22],
       [ 0,  0,  0, ..., 10,  1,  3],
       [ 0,  0,  0, ...,  2,  1, 14],
       ...,
       [ 0,  0,  0, ..., 13,  4, 10],
       [ 0,  0,  0, ..., 22, 11, 10],
       [ 0,  0,  0, ..., 16,  3,  7]], dtype=int32)

In [29]:
# Notebook display: shapes of the encoded train and test arrays.
train_x.shape,test_x.shape

((209485, 25), (89345, 25))

Embedding matrix is initialized as a 300 dimension embedding vector for each of the characters.

In [30]:
# Width of the vector learnt for every character.
Embedding_vector_size = 300
embeddings_index = dict()
# Initial embedding weights: one row per word_index entry, uniform in [0, 1).
# NOTE(review): Keras tokenizer indices presumably start at 1 (0 being the
# padding value), so covering every entry would need len(word_index) + 1 rows
# -- confirm before relying on the full vocabulary.
embedding_matrix = np.random.random_sample((len(word_index), Embedding_vector_size))

A character level LSTM model is built with embedding layer in the start having character embeddings learnt from the train data. The general use case is to use Batch Normalization between the linear and non-linear layers in your network, because it normalizes(rescales) the input to your activation function, so that you're centered in the linear section of the activation function (such as Sigmoid).
We should not apply dropout to output layer.

In [31]:
# Wrap TensorFlow's AUC so Keras can report it while training.
auc_roc = as_keras_metric(tf.metrics.auc)
K.clear_session()

model = Sequential()
# Character embeddings, initialised from embedding_matrix and fine-tuned.  The
# layer dimensions are read from the matrix so that they always agree with the
# weights passed in.
model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=True))
# Per-character linear projection.  No input_shape here: it is only meaningful
# on the first layer, and the value given before did not match the real input.
model.add(Dense(100))
model.add(Dropout(0.4))
model.add(BatchNormalization())
# First recurrent layer keeps the whole sequence for the next LSTM.
model.add(Bidirectional(LSTM(30, return_sequences=True)))
model.add(Dropout(0.2))
model.add(BatchNormalization())
# Second recurrent layer reduces the sequence to a single vector.
model.add(Bidirectional(LSTM(30)))
model.add(BatchNormalization())
# Single sigmoid unit: score for the "valid" class (is_valid = 1).
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=[auc_roc])
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 300)           17700     
_________________________________________________________________
dense_1 (Dense)              (None, 25, 100)           30100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 25, 100)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 25, 100)           400       
_________________________________________________________________
bidirectional_1 (Bidirection (None, 25, 60)            31440     
_________________________________________________________________
dropout_2 (Dropout)          (None, 25, 60)            0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 25, 60)            240       
__________

The shuffle parameter will shuffle our entire dataset (x, y and sample_weight together) first and 
then make batches according to the batch_size argument we passed to fit. 
The 'class_weight' argument treats every instance of class 0 as 2 instances of class 1, which means that the loss function assigns a higher weight to these instances. Hence the loss becomes a weighted average, where the weight of each sample is specified by class_weight and its corresponding class.


In [32]:
# Samples of class 0 (invalid names) weigh twice as much as class 1 in the loss.
class_weight = {0: 2.0, 1: 1.0}

Auc is area under the ROC curve which is better when close to 1. An ROC curve (receiver operating characteristic curve) is a graph showing the performance of a classification model at all classification thresholds.

In [33]:
# Train for two passes over the data.  `epochs` is the Keras 2 name of the
# deprecated Keras 1 argument `nb_epoch`.  33% of the training rows are held
# out for validation, and class 0 errors are weighted through class_weight.
model.fit(train_x, train_y, validation_split=0.33, epochs=2, batch_size=64,
          class_weight=class_weight, shuffle=True)

Train on 140354 samples, validate on 69131 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f14b90f64a8>

In [34]:
# Sigmoid scores for the held-out names; higher means more likely valid (class 1).
y_pred = model.predict(test_x)

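The threshold is kept below 0.5: every score above the threshold is labelled valid (class 1), so a lower threshold flags fewer names as invalid and raises the precision of the invalid (negative) class. Note that it also lets more invalid names be classified as valid; if that is the error that must be avoided, the threshold should be raised instead.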

In [35]:
# Scores above 0.3 are counted as class 1 (valid), the rest as class 0 (invalid).
classification_and_confusion_report(test_y,y_pred,0.3)

[[59031   409]
 [  814 29091]]
             precision    recall  f1-score   support

          0       0.99      0.99      0.99     59440
          1       0.99      0.97      0.98     29905

avg / total       0.99      0.99      0.99     89345



In [36]:
model.save('my_model_1.h5')  # creates the HDF5 file 'my_model_1.h5'
# To restore the model later: model = load_model('my_model_1.h5')