In [1]:
import tensorflow as tf

2024-08-19 17:52:32.178118: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd
import numpy as np

### Loading dataset

In [3]:
training_dataset = pd.read_csv('preprocessing/training_dataset_final.csv')

In [4]:
training_dataset

Unnamed: 0,Question,Similar/Dissimilar,Output
0,Where Do You Get Disability Insurance?,Is Long Term Disability Insurance The Same As ...,0
1,Does Each Tenant Need Renters Insurance?,Does Your Spouse Have To Be Your Beneficiary F...,0
2,Can Life Insurance Refuse To Pay?,How Much Is Enough Auto Insurance Coverage?,0
3,Does Long Term Care Insurance Cover Assisted L...,Is Long Term Care Insurance Regulated?,0
4,How Do I Choose A Medigap Plan?,Can You Change From Medicare Advantage To Medi...,0
...,...,...,...
195,What To Look For In A Good Health Insurance Plan?,What Qualifies As A Quality Health Insurance P...,1
196,Does Suze Orman Hate Whole Life Insurance?,Is Whole Life Insurance Something Suze Orman H...,1
197,How Much Does It Cost To Add A Named Driver To...,What Is The Price Of Adding A Named Driver To ...,1
198,Is Disability Insurance The Same As Workers Co...,Is Workers Compensation and Disability Insuran...,1


### Reshuffling the dataset

In [5]:
training_dataset = training_dataset.sample(frac=1, random_state=6).reset_index(drop=True)

In [6]:
training_dataset

Unnamed: 0,Question,Similar/Dissimilar,Output
0,How Much Can You Borrow From Your Life Insuran...,How much of your life insurance policy may you...,1
1,Can I Get Life Insurance If I Have Lung Cancer?,Can The IRS Go After Life Insurance Proceeds?,0
2,Does Medicaid Pay Part B Of Medicare?,Does Medicaid Cover Medicare Part B?,1
3,Does Health Insurance Increase With Age?,Does Age Affect the Cost of Health Insurance?,1
4,How Much Life Insurance Should You Carry?,What Level of Life Insurance Is Adequate?,1
...,...,...,...
195,Is Disability Insurance The Same As Workers Co...,Is Workers Compensation and Disability Insuran...,1
196,How Much To Budget For Homeowners Insurance?,What Is Good Renters Insurance?,0
197,Why Is Boys Car Insurance Higher?,Why Does Boy's Auto Insurance Cost More?,1
198,How Much Does Life Insurance Typically Cost?,How Much Is The Average Cost of Life Insurance?,1


### Text preprocess

In [7]:
from gensim.utils import simple_preprocess # preprocess to tokenize and remove punctuation

In [8]:
def preprocess_text(text):
    return simple_preprocess(text, deacc=True)  # tokenize and remove punctuation

In [9]:
training_dataset['Question_processed'] = training_dataset['Question'].apply(preprocess_text)
training_dataset['Similar/Dissimilar_processed'] = training_dataset['Similar/Dissimilar'].apply(preprocess_text)

In [10]:
training_dataset

Unnamed: 0,Question,Similar/Dissimilar,Output,Question_processed,Similar/Dissimilar_processed
0,How Much Can You Borrow From Your Life Insuran...,How much of your life insurance policy may you...,1,"[how, much, can, you, borrow, from, your, life...","[how, much, of, your, life, insurance, policy,..."
1,Can I Get Life Insurance If I Have Lung Cancer?,Can The IRS Go After Life Insurance Proceeds?,0,"[can, get, life, insurance, if, have, lung, ca...","[can, the, irs, go, after, life, insurance, pr..."
2,Does Medicaid Pay Part B Of Medicare?,Does Medicaid Cover Medicare Part B?,1,"[does, medicaid, pay, part, of, medicare]","[does, medicaid, cover, medicare, part]"
3,Does Health Insurance Increase With Age?,Does Age Affect the Cost of Health Insurance?,1,"[does, health, insurance, increase, with, age]","[does, age, affect, the, cost, of, health, ins..."
4,How Much Life Insurance Should You Carry?,What Level of Life Insurance Is Adequate?,1,"[how, much, life, insurance, should, you, carry]","[what, level, of, life, insurance, is, adequate]"
...,...,...,...,...,...
195,Is Disability Insurance The Same As Workers Co...,Is Workers Compensation and Disability Insuran...,1,"[is, disability, insurance, the, same, as, wor...","[is, workers, compensation, and, disability, i..."
196,How Much To Budget For Homeowners Insurance?,What Is Good Renters Insurance?,0,"[how, much, to, budget, for, homeowners, insur...","[what, is, good, renters, insurance]"
197,Why Is Boys Car Insurance Higher?,Why Does Boy's Auto Insurance Cost More?,1,"[why, is, boys, car, insurance, higher]","[why, does, boy, auto, insurance, cost, more]"
198,How Much Does Life Insurance Typically Cost?,How Much Is The Average Cost of Life Insurance?,1,"[how, much, does, life, insurance, typically, ...","[how, much, is, the, average, cost, of, life, ..."


### Load a Word2Vec Model

In [11]:
import torch
import torchtext

In [12]:
# For our project we are using pretrained GloVe 6B embeddings to convert text to vectors. Additionally, we are using torchtext as
# it enables automatic model download (there is no need for user to first download the embedding model and then import it -> 
# as everything is done through torchtext.)
# For more information please look at: https://nlp.stanford.edu/projects/glove/ 
glove = torchtext.vocab.GloVe(name="6B", dim=50) # load glove embeddings

### Transform Text to Vectors

In [13]:
def get_average_vector(tokens, glove): # function to vectorize words
    vectors = [glove[token] for token in tokens if token in glove.stoi]
    if len(vectors) == 0:
        return torch.zeros(glove.dim)
    return torch.mean(torch.stack(vectors), dim=0)

In [14]:
training_dataset['Question_vector'] = training_dataset['Question_processed'].apply(lambda x: get_average_vector(x, glove).numpy()) # apply function to training_dataset
training_dataset['Similar/Dissimilar_vector'] = training_dataset['Similar/Dissimilar_processed'].apply(lambda x: get_average_vector(x, glove).numpy()) # apply function to training_dataset

In [15]:
def vectors_to_dataframe(vectors): # convert vectors to dataframe
    return pd.DataFrame(vectors.tolist())

In [16]:
question_vectors_df = vectors_to_dataframe(training_dataset['Question_vector'])
similar_dissimilar_vectors_df = vectors_to_dataframe(training_dataset['Similar/Dissimilar_vector'])

training_dataset_with_vectors = pd.concat([training_dataset, question_vectors_df, similar_dissimilar_vectors_df], axis=1) # concat with training_dataset

training_dataset_with_vectors.drop(['Question_processed', 'Similar/Dissimilar_processed'], axis=1, inplace=True) # drop unnecessary columns

training_dataset_for_siamese = training_dataset_with_vectors[['Question_vector', 'Similar/Dissimilar_vector', 'Output']] # finalize

training_dataset_for_siamese # display result

Unnamed: 0,Question_vector,Similar/Dissimilar_vector,Output
0,"[0.28812653, 0.2274698, 0.19684939, -0.545717,...","[0.30790251, 0.2583558, 0.0973037, -0.49393648...",1
1,"[0.67015374, 0.26997185, 0.31989124, -0.147379...","[0.5123538, 0.0009334907, 0.13336125, -0.23812...",0
2,"[0.6226233, 0.16583666, 0.3080945, -0.54152125...","[0.44956, 0.064194, 0.31548738, -0.69372547, -...",1
3,"[0.12134666, 0.39845085, 0.18691649, -0.242108...","[0.34949002, 0.32901913, 0.05321388, -0.104091...",1
4,"[0.38764402, 0.14816228, 0.18771395, -0.411626...","[0.33797142, 0.42630515, -0.14697286, -0.19512...",1
...,...,...,...
195,"[0.26646873, 0.22621801, -0.04992786, -0.14544...","[0.24260385, 0.22555597, -0.07302366, -0.09749...",1
196,"[0.3980972, -0.031579573, 0.38514715, -0.37014...","[0.231132, 0.21420917, 0.17862001, -0.286443, ...",0
197,"[0.032586653, 0.24190617, 0.31103167, -0.26572...","[0.2854986, 0.06919169, 0.40158528, -0.1103902...",1
198,"[0.47669145, 0.19837439, 0.18423386, -0.328741...","[0.35991, 0.31689233, 0.10101666, -0.1893761, ...",1


In [17]:
training_dataset_for_siamese.iloc[150][1].shape # we explore one item in our vectorized dataset

  training_dataset_for_siamese.iloc[150][1].shape # we explore one item in our vectorized dataset


(50,)

In [18]:
training_dataset_for_siamese.iloc[150][1]

  training_dataset_for_siamese.iloc[150][1]


array([-0.17933333, -0.10147534,  0.18743402, -0.21102583,  0.01232166,
        0.6412217 , -0.47169337,  0.05470333, -0.08384169, -0.09460516,
        0.07728234, -0.09917001, -0.20543166, -0.55879587,  0.29825333,
       -0.16700651, -0.13086666, -0.38785684, -0.23403831, -0.15059249,
        0.08428668,  0.03945167, -0.22466166, -0.3223285 , -0.3710867 ,
       -1.678175  , -0.06977499, -0.6348116 , -0.0467625 ,  0.16582318,
        2.9077165 ,  0.5382917 ,  0.13232799, -0.195625  , -0.00819833,
       -0.21665515, -0.213857  , -0.04207051,  0.43313834, -0.56421834,
       -0.32458502, -0.01141934,  0.37367332,  0.5844733 , -0.10996483,
       -0.42139414, -0.44689813,  0.14413601,  0.23335999, -0.04724183],
      dtype=float32)

### Modeling

In [19]:
from tensorflow.keras.layers import Input, Dense, Subtract, Multiply, Lambda
from tensorflow.keras.models import Model, Sequential

### Define the Base Network

In [20]:
def create_base_network(input_shape):
    model = Sequential()
    model.add(Input(shape=input_shape))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    return model

### Define the Siamese Network

In [21]:
# Siamese neural network for text similiarity inspired by: https://medium.com/@prabhnoor0212/siamese-network-keras-31a3a8f37d04
# Layers diff and prod added for complexity (helps neural network to determine whether two given vectors are similiar)
def create_siamese_network(input_shape):
    input_a = Input(shape=input_shape) # tensor for the input vector a
    input_b = Input(shape=input_shape) # tensor for the input vector b
     
    base_network = create_base_network(input_shape) # base network which shares weights
    
    processed_a = base_network(input_a) # processing of input a through the base network
    processed_b = base_network(input_b) # processing of input b through the base network

    # now we need to capture the relationship between the two input vectors
    diff = Subtract()([processed_a, processed_b]) # element wise absolute difference between the processed vectors
    prod = Multiply()([processed_a, processed_b]) # element wise product of the processed vectors
    concat = tf.keras.layers.Concatenate()([diff, prod]) # concatenating the difference and product (to consider them both)
    
    final_network = Sequential() # defining the final part of the network
    final_network.add(Dense(128, activation='relu', input_shape=(concat.shape[-1],)))  # dense layer
    final_network.add(Dense(128, activation='relu'))  # dense layer
    final_network.add(Dense(1, activation='sigmoid'))  # final sigmoid layer for output

    output = final_network(concat) # output layer

    siamese_network = Model(inputs=[input_a, input_b], outputs=output) # creating the siamese network model
    return siamese_network

### Compile the Model

In [22]:
input_shape = (50,) # vector length of 50 (since we are using torchtext.vocab.GloVe(name="6B", dim=50))
siamese_network = create_siamese_network(input_shape)
siamese_network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
siamese_network.summary()

2024-08-19 17:52:48.100013: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 128)          39552       ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 subtract (Subtract)            (None, 128)          0           ['sequential[0][0]',         

### Prepare dataset for training

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
question_vectors = np.array(training_dataset_for_siamese['Question_vector'].tolist()) # extract question vectors
similar_vectors = np.array(training_dataset_for_siamese['Similar/Dissimilar_vector'].tolist()) # extract similar/dissimilar vectors
labels = np.array(training_dataset_for_siamese['Output'].tolist()) # extract labels

X_train_q, X_val_q, X_train_s, X_val_s, y_train, y_val = train_test_split(
    question_vectors, similar_vectors, labels, test_size=0.2, random_state=33) # split the dataset into training and validation sets

X_train_1 = np.array(X_train_q)
X_train_2 = np.array(X_train_s)
X_val_1 = np.array(X_val_q)
X_val_2 = np.array(X_val_s)

### Train the model

In [25]:
siamese_network.fit([X_train_1, X_train_2], y_train, 
                    validation_data=([X_val_1, X_val_2], y_val), 
                    epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x15f4fb130>

### Evaluate the Model

In [26]:
loss, accuracy = siamese_network.evaluate([X_val_1, X_val_2], y_val)
print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')

Validation Loss: 0.8169072866439819
Validation Accuracy: 0.875


### Save model weights

In [27]:
siamese_network.save('model/siamese_model.h5')

### Using the model

In [28]:
def preprocess_question(question, glove): # first we need to preprocess the questions
    tokens = question.lower().split()
    vectors = [glove[token] for token in tokens if token in glove.stoi]
    if len(vectors) == 0:
        return torch.zeros(glove.dim)  # return zero vector if no tokens are in the vocabulary
    return torch.mean(torch.stack(vectors), dim=0)  # average the vectors

new_question_1 = "Why would I need insurance such as life?" # example question 1
new_question_2 = "What is the purpose of insuring life?" # example question 2

new_question_vector_1 = preprocess_question(new_question_1, glove)
new_question_vector_2 = preprocess_question(new_question_2, glove)

new_question_vector_1 = np.expand_dims(new_question_vector_1, axis=0) # reshape to match the input expected by the model
new_question_vector_2 = np.expand_dims(new_question_vector_2, axis=0) # reshape to match the input expected by the model

similarity_score = siamese_network.predict([new_question_vector_1, new_question_vector_2]) # make prediction

if similarity_score > 0.5:
    print("Questions are similar.")
else:
    print("Questions are dissimilar.")

Questions are similar.


In [29]:
similarity_score

array([[0.7041172]], dtype=float32)