In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
import tensorflow as tf 
import random
import gc
import re
import os
from datetime import datetime
# from tensorflow.python.platform import gfile
from tensorflow.io import gfile
# import tensorflow.compat.v1.gfile as gfile
from sklearn.model_selection import StratifiedKFold

try:
    import cPickle as pickle
except ImportError:
    import pickle
import collections

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Interim Data File Locations
interim_data = '../data/interim/'

In [14]:
df = pd.read_csv(interim_data + 'feature_engineering_results.csv')
df.shape

(87550, 28)

In [15]:
df.sample(10)

Unnamed: 0,a,b,target,name_a,name_b,partial,tkn_sort,tkn_set,sum_ipa,pshp_soundex_first,...,editex,saps,flexmetric,jaro,higueramico,sift4,eudex,aline,covington,phoneticeditdistance
46082,Roge,Care,0,roge,care,75,40,40,0.407258,0,...,0.375,0.0,0.275,0.5,0.266667,0.5,0.682353,0.4,0.647727,0.83871
19601,Bety,Wole,0,bety,wole,67,40,40,0.645161,0,...,0.25,0.0,0.225,0.0,0.1,0.25,0.868137,0.15,0.443182,0.850806
53813,Parmelia,Field,0,parmelia,field,44,27,27,0.728111,0,...,0.4375,0.0,0.26875,0.55,0.190476,0.25,0.780882,0.3125,0.468254,0.538306
20165,Patricia,Ricardo,0,patricia,ricardo,63,63,63,0.5,0,...,0.25,0.130435,0.3125,0.52381,0.27096,0.375,0.69951,0.4875,0.62,0.675403
19890,Onne,Fredricka,0,onne,fredricka,50,25,25,0.641129,0,...,0.166667,0.0,0.25,0.453704,0.0,0.111111,0.557843,0.127451,0.360656,0.419355
72047,Jen,Francois,0,jen,francois,57,17,17,0.858871,0,...,0.25,0.0,0.21875,0.486111,0.0,0.125,0.814706,0.240909,0.392157,0.330645
85441,Reg,Rich,0,reg,rich,57,29,29,0.919355,1,...,0.375,0.071429,0.425,0.527778,0.25,0.25,0.995098,0.4375,0.6,0.725806
78821,Phil,Lum,0,phil,lum,57,29,29,0.752688,0,...,0.125,0.0,0.175,0.0,0.016667,0.25,0.682843,0.291667,0.428571,0.596774
75534,Johannah,France,0,johannah,france,64,24,24,0.453405,0,...,0.375,0.0,0.4875,0.527778,0.232143,0.25,0.840686,0.318182,0.543478,0.637097
68161,Tulla,Slavi,0,tulla,slavi,69,50,50,0.634409,0,...,0.2,0.0,0.32,0.466667,0.238095,0.4,0.922059,0.555556,0.546296,0.735484


In [16]:
# TESTING TESTING TESTING
df = df.sample(100)

In [17]:
y = df.target
X = df.drop('target', axis=1)
print("Feature Set: ", X.shape)
print("Label Set: ", y.shape)

Feature Set:  (100, 27)
Label Set:  (100,)


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
print("Training Feature Set: ", X_train.shape)
print("Training Label Set: ", y_train.shape)
print("Testing Feature Set: ", X_test.shape)
print("Testing Label Set: ", y_test.shape)

Training Feature Set:  (80, 27)
Training Label Set:  (80,)
Testing Feature Set:  (20, 27)
Testing Label Set:  (20,)


### Base-Model 1: Exported TPOT Pipeline

In [19]:
def base_model_1(X_train, y_train, X_test, export=False):
    exported_pipeline = make_pipeline(
        MaxAbsScaler(),
        MinMaxScaler(),
        RandomForestClassifier(
            bootstrap=False,
            criterion="gini",
            max_features=0.25,
            min_samples_leaf=1,
            min_samples_split=4,
            n_estimators=100)
    )
    exported_pipeline.fit(X_train, y_train)
    if export==True:
        return exported_pipeline
    else:
        y_pred = exported_pipeline.predict_proba(X_test)
        return [p[1] for p in y_pred]

### Base-Model 2: Deep LSTM Siamese Network

In [20]:
# tensorflow based implementation of deep siamese LSTM network.
# Taken from https://github.com/dhwajraj/deep-siamese-text-similarity as of 2020-07-20
# and modified to fit hmni prediction pipeline
# deep-siamese-text-similarity original copyright:
#
# MIT License
#
# Copyright (c) 2016 Dhwaj Raj
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import tensorflow as tf


class SiameseLSTM(object):
    """
    A LSTM based deep Siamese network for text similarity.
    Uses an character embedding layer, followed by a biLSTM and Energy Loss layer.
    """

    def BiRNN(self, x, dropout, scope, hidden_units):
        n_hidden = hidden_units
        n_layers = 3

        # Prepare data shape to match `static_rnn` function requirements
        x = tf.unstack(tf.transpose(x, perm=[1, 0, 2]))

        # Define lstm cells with tensorflow
        # Forward direction cell
        with tf.name_scope('fw' + scope):
            with tf.compat.v1.variable_scope('fw' + scope):
                stacked_rnn_fw = []
                for _ in range(n_layers):
                    fw_cell = tf.compat.v1.nn.rnn_cell.LSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
                    lstm_fw_cell = \
                        tf.compat.v1.nn.rnn_cell.DropoutWrapper(fw_cell, output_keep_prob=dropout)
                    stacked_rnn_fw.append(lstm_fw_cell)
                lstm_fw_cell_m = \
                    tf.compat.v1.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_fw, state_is_tuple=True)

        with tf.name_scope('bw' + scope):
            with tf.compat.v1.variable_scope('bw' + scope):
                stacked_rnn_bw = []
                for _ in range(n_layers):
                    bw_cell = tf.compat.v1.nn.rnn_cell.LSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
                    lstm_bw_cell = \
                        tf.compat.v1.nn.rnn_cell.DropoutWrapper(bw_cell, output_keep_prob=dropout)
                    stacked_rnn_bw.append(lstm_bw_cell)
                lstm_bw_cell_m = \
                    tf.compat.v1.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_bw, state_is_tuple=True)

        # Get lstm cell output
        with tf.name_scope('bw' + scope):
            with tf.compat.v1.variable_scope('bw' + scope):
                (outputs, _, _) = \
                    tf.compat.v1.nn.static_bidirectional_rnn(lstm_fw_cell_m,
                                                   lstm_bw_cell_m, x, dtype=tf.float32)
        return outputs[-1]

    def contrastive_loss(self, y, d, batch_size):
        tmp = y * tf.square(d)
        tmp2 = (1 - y) * tf.square(tf.maximum(1 - d, 0))
        return tf.reduce_sum(tmp + tmp2) / batch_size / 2

    def __init__(self, sequence_length, vocab_size, embedding_size, hidden_units, batch_size):

        # Placeholders for input, output and dropout
        self.input_x1 = tf.compat.v1.placeholder(tf.int32, [None, sequence_length], name='input_x1')
        self.input_x2 = tf.compat.v1.placeholder(tf.int32, [None, sequence_length], name='input_x2')
        self.input_y = tf.compat.v1.placeholder(tf.float32, [None], name='input_y')
        self.dropout_keep_prob = tf.compat.v1.placeholder(tf.float32, name='dropout_keep_prob')

        # Embedding layer
        with tf.name_scope('embedding'):
            self.W = tf.Variable(tf.compat.v1.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                                 trainable=True, name='W')
            self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)

            self.embedded_chars2 = tf.nn.embedding_lookup(self.W, self.input_x2)

        # Create a convolution + maxpool layer for each filter size
        with tf.name_scope('output'):
            self.out1 = self.BiRNN(
                self.embedded_chars1,
                self.dropout_keep_prob,
                'side1',
                hidden_units
            )
            self.out2 = self.BiRNN(
                self.embedded_chars2,
                self.dropout_keep_prob,
                'side2',
                hidden_units
            )
            self.distance = \
                tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.out1, self.out2)), 1, keepdims=True))
            self.distance = tf.compat.v1.div(self.distance,
                                   tf.add(tf.sqrt(tf.reduce_sum(tf.square(self.out1), 1, keepdims=True)),
                                          tf.sqrt(tf.reduce_sum(tf.square(self.out2), 1, keepdims=True))))
            self.distance = tf.reshape(self.distance, [-1], name='distance')
        with tf.name_scope('loss'):
            self.loss = self.contrastive_loss(self.input_y, self.distance, batch_size)

        # Accuracy computation is outside of this class.
        with tf.name_scope('accuracy'):
            self.temp_sim = tf.subtract(tf.ones_like(self.distance),
                                        tf.compat.v1.rint(self.distance), name='temp_sim')  # auto threshold 0.5
            correct_predictions = tf.equal(self.temp_sim, self.input_y)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'), name='accuracy')


In [21]:
# tensorflow based implementation of deep siamese LSTM network.
# Taken from https://github.com/dhwajraj/deep-siamese-text-similarity as of 2020-07-20
# and modified to fit hmni prediction pipeline
# deep-siamese-text-similarity original copyright:
#
# MIT License
#
# Copyright (c) 2016 Dhwaj Raj
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

TOKENIZER_RE = re.compile(r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+", re.UNICODE)


def tokenizer(iterator):
    """Tokenizer generator.
    Args:
      iterator: Input iterator with strings.
    Yields:
      array of tokens per each value in the input.
    """
    for value in iterator:
        yield TOKENIZER_RE.findall(value)


class CategoricalVocabulary(object):
    """Categorical variables vocabulary class.
  Accumulates and provides mapping from classes to indexes.
  Can be easily used for words.
  """
    def __init__(self, unknown_token="<UNK>", support_reverse=True):
        self._unknown_token = unknown_token
        self._mapping = {unknown_token: 0}
        self._support_reverse = support_reverse
        if support_reverse:
            self._reverse_mapping = [unknown_token]
        self._freq = collections.defaultdict(int)
        self._freeze = False

    def __len__(self):
        """Returns total count of mappings. Including unknown token."""
        return len(self._mapping)

    def freeze(self, freeze=True):
        """Freezes the vocabulary, after which new words return unknown token id.
        Args:
          freeze: True to freeze, False to unfreeze.
        """
        self._freeze = freeze

    def get(self, category):
        """Returns word's id in the vocabulary.
        If category is new, creates a new id for it.
        Args:
          category: string or integer to lookup in vocabulary.
        Returns:
          interger, id in the vocabulary.
        """
        if category not in self._mapping:
            if self._freeze:
                return 0
            self._mapping[category] = len(self._mapping)
            if self._support_reverse:
                self._reverse_mapping.append(category)
        return self._mapping[category]

    def add(self, category, count=1):
        """Adds count of the category to the frequency table.
        Args:
          category: string or integer, category to add frequency to.
          count: optional integer, how many to add.
        """
        category_id = self.get(category)
        if category_id <= 0:
            return
        self._freq[category] += count

    def trim(self, min_frequency, max_frequency=-1):
        """Trims vocabulary for minimum frequency.
        Remaps ids from 1..n in sort frequency order.
        where n - number of elements left.
        Args:
          min_frequency: minimum frequency to keep.
          max_frequency: optional, maximum frequency to keep.
            Useful to remove very frequent categories (like stop words).
        """
        # Sort by alphabet then reversed frequency.
        self._freq = sorted(
            sorted(
                self._freq.items(),
                key=lambda x: (isinstance(x[0], str), x[0])),
            key=lambda x: x[1],
            reverse=True)
        self._mapping = {self._unknown_token: 0}
        if self._support_reverse:
            self._reverse_mapping = [self._unknown_token]
        idx = 1
        for category, count in self._freq:
            if 0 < max_frequency <= count:
                continue
            if count <= min_frequency:
                break
            self._mapping[category] = idx
            idx += 1
            if self._support_reverse:
                self._reverse_mapping.append(category)
        self._freq = dict(self._freq[:idx - 1])

    def reverse(self, class_id):
        """Given class id reverse to original class name.
        Args:
          class_id: Id of the class.
        Returns:
          Class name.
        Raises:
          ValueError: if this vocabulary wasn't initialized with support_reverse.
        """
        if not self._support_reverse:
            raise ValueError("This vocabulary wasn't initialized with "
                             "support_reverse to support reverse() function.")
        return self._reverse_mapping[class_id]


class VocabularyProcessor(object):
    """Maps documents to sequences of word ids."""

    def __init__(self,
                 max_document_length,
                 min_frequency=0,
                 vocabulary=None,
                 tokenizer_fn=None):
        """Initializes a VocabularyProcessor instance.
        Args:
          max_document_length: Maximum length of documents.
            if documents are longer, they will be trimmed, if shorter - padded.
          min_frequency: Minimum frequency of words in the vocabulary.
          vocabulary: CategoricalVocabulary object.
        Attributes:
          vocabulary_: CategoricalVocabulary object.
        """
        self.max_document_length = max_document_length
        self.min_frequency = min_frequency
        if vocabulary:
            self.vocabulary_ = vocabulary
        else:
            self.vocabulary_ = CategoricalVocabulary()
        if tokenizer_fn:
            self._tokenizer = tokenizer_fn
        else:
            self._tokenizer = tokenizer

    def fit(self, raw_documents):
        """Learn a vocabulary dictionary of all tokens in the raw documents.
        Args:
          raw_documents: An iterable which yield either str or unicode.
        Returns:
          self
        """
        for tokens in self._tokenizer(raw_documents):
            for token in tokens:
                self.vocabulary_.add(token)
        if self.min_frequency > 0:
            self.vocabulary_.trim(self.min_frequency)
        self.vocabulary_.freeze()
        return self

    def fit_transform(self, raw_documents):
        """Learn the vocabulary dictionary and return indexies of words.
        Args:
          raw_documents: An iterable which yield either str or unicode.
        Returns:
          x: iterable, [n_samples, max_document_length]. Word-id matrix.
        """
        self.fit(raw_documents)
        return self.transform(raw_documents)

    def transform(self, raw_documents):
        """Transform documents to word-id matrix.
        Convert words to ids with vocabulary fitted with fit or the one
        provided in the constructor.
        Args:
          raw_documents: An iterable which yield either str or unicode.
        Yields:
          x: iterable, [n_samples, max_document_length]. Word-id matrix.
        """
        for tokens in self._tokenizer(raw_documents):
            word_ids = np.zeros(self.max_document_length, np.int64)
            for idx, token in enumerate(tokens):
                if idx >= self.max_document_length:
                    break
                word_ids[idx] = self.vocabulary_.get(token)
            yield word_ids

    def save(self, filename):
        """Saves vocabulary processor into given file.
        Args:
          filename: Path to output file.
        """
        with gfile.Open(filename, 'wb') as f:
            f.write(pickle.dumps(self))

    @classmethod
    def restore(cls, filename):
        """Restores vocabulary processor from given file.
        Args:
          filename: Path to file to load from.
        Returns:
          VocabularyProcessor object.
        """
        with gfile.Open(filename, 'rb') as f:
            return pickle.loads(f.read())


def tokenizer_char(iterator):
    for value in iterator:
        yield list(value)


class MyVocabularyProcessor(VocabularyProcessor):
    def __init__(self, max_document_length, min_frequency=0, vocabulary=None):
        super().__init__(max_document_length, min_frequency, vocabulary)
        sup = super(MyVocabularyProcessor, self)
        sup.__init__(max_document_length, min_frequency, vocabulary,
                     tokenizer_char)

    def transform(self, raw_documents):
        """Transform documents to word-id matrix.
        Convert words to ids with vocabulary fitted with fit or the one
        provided in the constructor.
        Args:
          raw_documents: An iterable which yield either str or unicode.
        Yields:
          x: iterable, [n_samples, max_document_length]. Word-id matrix.
        """
        for tokens in self._tokenizer(raw_documents):
            word_ids = np.zeros(self.max_document_length, np.int64)
            for (idx, token) in enumerate(tokens):
                if idx >= self.max_document_length:
                    break
                word_ids[idx] = self.vocabulary_.get(token)
            yield word_ids


In [22]:
def base_model_2(X_train, y_train, X_test, export=False):
    
    # Train Model
    embedding_dim = 300  # Dimensionality of character embedding
    dropout_keep_prob = 0.8  # Dropout keep probability
    hidden_units = 50
    batch_size = 64
    num_epochs = 300  # Number of training epochs
    evaluate_every = 1000  # Evaluate model on dev set after this many steps
    max_document_length = 15
    out_dir = os.getcwd()+'\\'  # where to save exported models

    inpH = InputHelper()
    train_set, dev_set, vocab_processor, sum_no_of_batches = \
        inpH.get_datasets(
        X_train[['name_a', 'name_b']],
        y_train,
        max_document_length=max_document_length,
        percent_dev=10,
        batch_size=64)


    # print('starting graph def')
    graph = tf.Graph()
    with tf.Graph().as_default():
        session_conf = tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.compat.v1.Session(config=session_conf)
        # print('started session')
        with sess.as_default():
            siameseModel = SiameseLSTM(
                sequence_length=max_document_length,
                vocab_size=len(vocab_processor.vocabulary_),
                embedding_size=embedding_dim,
                hidden_units=hidden_units,
                batch_size=batch_size,
            )

            # Define Training procedure
            global_step = tf.Variable(0, name='global_step', trainable=False)
            # optimizer = tf.optimizers.Adam(1e-3)
            # optimizer = Adam(1e-3)
            optimizer = tf.compat.v1.train.AdamOptimizer(1e-3)
            # print('initialized siameseModel object')

        grads_and_vars = optimizer.compute_gradients(siameseModel.loss)
        tr_op_set = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        # print('defined training_ops')
        
        if export==True:
            saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(), max_to_keep=100)
            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, 'vocab'))

        # Initialize all variables
        sess.run(tf.compat.v1.global_variables_initializer())

        def train_step(x1_batch, x2_batch, y_batch):
            # A single training step
            if random.random() > 0.5:
                feed_dict = {
                    siameseModel.input_x1: x1_batch,
                    siameseModel.input_x2: x2_batch,
                    siameseModel.input_y: y_batch,
                    siameseModel.dropout_keep_prob: dropout_keep_prob,
                }
            else:
                feed_dict = {
                    siameseModel.input_x1: x2_batch,
                    siameseModel.input_x2: x1_batch,
                    siameseModel.input_y: y_batch,
                    siameseModel.dropout_keep_prob: dropout_keep_prob,
                }
            (_, step, loss, accuracy, dist, sim) = \
                sess.run([tr_op_set, global_step, siameseModel.loss, siameseModel.accuracy,
                          siameseModel.distance, siameseModel.temp_sim], feed_dict)

        def dev_step(x1_batch, x2_batch, y_batch):
            # A single training step
            if random.random() > 0.5:
                feed_dict = {
                    siameseModel.input_x1: x1_batch,
                    siameseModel.input_x2: x2_batch,
                    siameseModel.input_y: y_batch,
                    siameseModel.dropout_keep_prob: 1.0,
                }
            else:
                feed_dict = {
                    siameseModel.input_x1: x2_batch,
                    siameseModel.input_x2: x1_batch,
                    siameseModel.input_y: y_batch,
                    siameseModel.dropout_keep_prob: 1.0,
                }
            (step, loss, accuracy, sim) = \
                sess.run([global_step, siameseModel.loss, siameseModel.accuracy,
                          siameseModel.temp_sim], feed_dict)
            return accuracy

        # Generate batches
        batches = inpH.batch_iter(list(zip(train_set[0], train_set[1],
                                           train_set[2])), batch_size, num_epochs)
        max_validation_acc = 0.0
        for nn in range(sum_no_of_batches * num_epochs):
            batch = next(batches)
            if len(batch) < 1:
                continue
            (x1_batch, x2_batch, y_batch) = zip(*batch)
            if len(y_batch) < 1:
                continue
            train_step(x1_batch, x2_batch, y_batch)
            current_step = tf.compat.v1.train.global_step(sess, global_step)
            sum_acc = 0.0
            if current_step % evaluate_every == 0:
                dev_batches = inpH.batch_iter(list(zip(dev_set[0], dev_set[1], dev_set[2])), batch_size, 1)
                for db in dev_batches:
                    if len(db) < 1:
                        continue
                    (x1_dev_b, x2_dev_b, y_dev_b) = zip(*db)
                    if len(y_dev_b) < 1:
                        continue
                    acc = dev_step(x1_dev_b, x2_dev_b, y_dev_b)
                    sum_acc = sum_acc + acc
            if sum_acc > max_validation_acc:
                max_validation_acc = sum_acc
            
                if export==True:
                    # save model
                    saver.save(sess, out_dir, global_step=current_step)
                    tf.train.write_graph(sess.graph.as_graph_def(), out_dir, 'siamese_network.pb', as_text=False)
                
                # print('model {} with sum_accuracy={}'.format(nn, max_validation_acc))     
        if export==True:
            return
        
        # RUN OOF INFERENCE
        x1_temp= np.asarray(X_test['name_a'].tolist())
        x2_temp= np.asarray(X_test['name_b'].tolist())
        
        x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(vocab_processor.transform(x2_temp)))

        (predictions, sim) = sess.run([siameseModel.distance, siameseModel.temp_sim], {
                siameseModel.input_x1: x1,
                siameseModel.input_x2: x2,
                siameseModel.dropout_keep_prob: 1.0,
            })
                
        sim = predictions.tolist()
        sim = [1-x for x in sim]
        # print(sim)
        return sim

In [23]:
# tensorflow based implementation of deep siamese LSTM network.
# Taken from https://github.com/dhwajraj/deep-siamese-text-similarity as of 2020-07-20
# and modified to fit hmni prediction pipeline
# deep-siamese-text-similarity original copyright:
#
# MIT License
#
# Copyright (c) 2016 Dhwaj Raj
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

class InputHelper(object):
    vocab_processor = None

    def batch_iter(
            self,
            data,
            batch_size,
            num_epochs,
            shuffle=True,
    ):

        # Generates a batch iterator for a dataset.
        data = np.asarray(data)
        data_size = len(data)
        num_batches_per_epoch = int(len(data) / batch_size) + 1
        for epoch in range(num_epochs):

            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = \
                    np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    # Data Preparation
    def get_datasets(
            self,
            X_train,
            y_train,
            max_document_length,
            percent_dev,
            batch_size,
    ):
        (x1_text, x2_text, y) = \
            np.asarray(X_train.iloc[:, 0].str.lower()), np.asarray(X_train.iloc[:, 1].str.lower()), np.asarray(y_train)

        # Build vocabulary
        # print('Building vocabulary')
        vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
        vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
        # print('Length of loaded vocabulary ={}'.format(len(vocab_processor.vocabulary_)))

        sum_no_of_batches = 0
        x1 = np.asarray(list(vocab_processor.transform(x1_text)))
        x2 = np.asarray(list(vocab_processor.transform(x2_text)))

        # Randomly shuffle data
        np.random.seed(131)
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x1_shuffled = x1[shuffle_indices]
        x2_shuffled = x2[shuffle_indices]
        y_shuffled = y[shuffle_indices]
        dev_idx = -1 * len(y_shuffled) * percent_dev // 100
        del x1
        del x2

        # TODO: This is very crude, should use cross-validation
        (x1_train, x1_dev) = (x1_shuffled[:dev_idx], x1_shuffled[dev_idx:])
        (x2_train, x2_dev) = (x2_shuffled[:dev_idx], x2_shuffled[dev_idx:])
        (y_train, y_dev) = (y_shuffled[:dev_idx], y_shuffled[dev_idx:])
        # print('Train/Dev split for data: {:d}/{:d}'.format(len(y_train), len(y_dev)))

        sum_no_of_batches = sum_no_of_batches + len(y_train) // batch_size
        train_set = (x1_train, x2_train, y_train)
        dev_set = (x1_dev, x2_dev, y_dev)
        gc.collect()
        return train_set, dev_set, vocab_processor, sum_no_of_batches

    def getTestDataSet(
            self,
            X_test,
            y_test,
            vocab,
            max_document_length,
    ):
        (x1_temp, x2_temp, y) = np.asarray(X_test.iloc[:, 0].str.lower()), np.asarray(
            X_test.iloc[:, 1].str.lower()), np.asarray(y_test)

        # Build vocabulary
        vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
        vocab_processor = vocab

        x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(vocab_processor.transform(x2_temp)))

        # Randomly shuffle data
        del vocab_processor
        gc.collect()
        return x1, x2, y


In [None]:
# Stratified K-Folds cross-validator
meta_training = pd.DataFrame()

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

fold = 1
for train_index, test_index in stratified_kfold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    oof_pred = X_test[['name_a', 'name_b']]
    
    oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
                                      y_train,
                                      X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))

    oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
                                      y_train,
                                      X_test[['name_a', 'name_b']])
    
    oof_pred['target'] = y_test.tolist()
    
    print('completed fold {} of 10'.format(fold))
    fold += 1

    meta_training = meta_training.append(oof_pred)

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell, unroll=True))`, which is equivalent to this API


  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
  X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),


Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


  data = np.asarray(data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['target'] = y_test.tolist()
  meta_training = meta_training.append(oof_pred)
  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
  X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

completed fold 1 of 10


  data = np.asarray(data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['target'] = y_test.tolist()
  meta_training = meta_training.append(oof_pred)
  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
  X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

completed fold 2 of 10


  data = np.asarray(data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['target'] = y_test.tolist()
  meta_training = meta_training.append(oof_pred)
  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
  X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

completed fold 3 of 10


  data = np.asarray(data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['target'] = y_test.tolist()
  meta_training = meta_training.append(oof_pred)
  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
  X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

completed fold 4 of 10


  data = np.asarray(data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['target'] = y_test.tolist()
  meta_training = meta_training.append(oof_pred)
  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
  X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

completed fold 5 of 10


  data = np.asarray(data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['target'] = y_test.tolist()
  meta_training = meta_training.append(oof_pred)
  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
  X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

completed fold 6 of 10


  data = np.asarray(data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['target'] = y_test.tolist()
  meta_training = meta_training.append(oof_pred)
  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
  X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

completed fold 7 of 10


  data = np.asarray(data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['target'] = y_test.tolist()
  meta_training = meta_training.append(oof_pred)
  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
  X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

completed fold 8 of 10


  data = np.asarray(data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oof_pred['target'] = y_test.tolist()
  meta_training = meta_training.append(oof_pred)
  oof_pred['predict_proba'] = base_model_1(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1),
  X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

completed fold 9 of 10


In [13]:
meta_training.sample(10)

Unnamed: 0,name_a,name_b,predict_proba,siamese_sim,target
44178,jacob,toncka,0.01,0.011047,0
44402,david,josefina,0.0,0.150563,0
9221,ljuba,ljubisa,0.99,0.842422,1
61881,augustina,sigridur,0.021667,0.525438,0
40396,ruby,wen,0.06,0.238432,0
53560,migue,fran,0.0,0.008486,0
2465,catherine,katherina,0.973333,0.960835,1
59614,ance,titia,0.091667,0.088846,0
8270,kori,chucky,0.043333,0.767349,1
20973,kori,yarik,0.208333,0.04735,0


In [14]:
meta_training[meta_training.name_a=='lavinia']

Unnamed: 0,name_a,name_b,predict_proba,siamese_sim,target
8482,lavinia,ina,0.57,0.080234,1
8483,lavinia,lavina,1.0,0.547983,1
8485,lavinia,louvinia,0.991667,0.77187,1
8486,lavinia,vina,0.973333,0.085932,1
8487,lavinia,vine,0.738333,0.09236,1
8484,lavinia,louvina,0.966667,0.826876,1
8488,lavinia,vinne,0.838333,0.716403,1
8489,lavinia,wyncha,0.0,0.197308,1
25631,lavinia,jonna,0.033333,0.073134,0
55876,lavinia,patricia,0.071667,0.195781,0


In [7]:
import hmni
matcher = hmni.Matcher(model='latin')

AttributeError: module 'scipy' has no attribute '_lib'

In [10]:
matcher.similarity('kizza', 'wally')

0

In [11]:
matcher.similarity('lavinia', 'lavina')

AttributeError: 'MinMaxScaler' object has no attribute 'clip'

In [12]:
matcher.similarity('bourque', 'bork')

AttributeError: 'MinMaxScaler' object has no attribute 'clip'