In [1]:
import tensorflow as tf

2024-08-11 16:09:53.662708: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd
import numpy as np

### Loading dataset

In [3]:
training_dataset = pd.read_csv('preprocessing_output/training_dataset_final.csv')

In [4]:
training_dataset

Unnamed: 0,Question,Similar/Dissimilar,Output
0,Where Do You Get Disability Insurance?,Is Long Term Disability Insurance The Same As ...,0
1,Does Each Tenant Need Renters Insurance?,Does Your Spouse Have To Be Your Beneficiary F...,0
2,Can Life Insurance Refuse To Pay?,How Much Is Enough Auto Insurance Coverage?,0
3,Does Long Term Care Insurance Cover Assisted L...,Is Long Term Care Insurance Regulated?,0
4,How Do I Choose A Medigap Plan?,Can You Change From Medicare Advantage To Medi...,0
...,...,...,...
195,What To Look For In A Good Health Insurance Plan?,What Qualifies As A Quality Health Insurance P...,1
196,Does Suze Orman Hate Whole Life Insurance?,Is Whole Life Insurance Something Suze Orman H...,1
197,How Much Does It Cost To Add A Named Driver To...,What Is The Price Of Adding A Named Driver To ...,1
198,Is Disability Insurance The Same As Workers Co...,Is Workers Compensation and Disability Insuran...,1


### Reshuffling the dataset

In [5]:
training_dataset = training_dataset.sample(frac=1, random_state=6).reset_index(drop=True)

In [6]:
training_dataset

Unnamed: 0,Question,Similar/Dissimilar,Output
0,How Much Can You Borrow From Your Life Insuran...,How much of your life insurance policy may you...,1
1,Can I Get Life Insurance If I Have Lung Cancer?,Can The IRS Go After Life Insurance Proceeds?,0
2,Does Medicaid Pay Part B Of Medicare?,Does Medicaid Cover Medicare Part B?,1
3,Does Health Insurance Increase With Age?,Does Age Affect the Cost of Health Insurance?,1
4,How Much Life Insurance Should You Carry?,What Level of Life Insurance Is Adequate?,1
...,...,...,...
195,Is Disability Insurance The Same As Workers Co...,Is Workers Compensation and Disability Insuran...,1
196,How Much To Budget For Homeowners Insurance?,What Is Good Renters Insurance?,0
197,Why Is Boys Car Insurance Higher?,Why Does Boy's Auto Insurance Cost More?,1
198,How Much Does Life Insurance Typically Cost?,How Much Is The Average Cost of Life Insurance?,1


### Text preprocess

In [7]:
from gensim.models import Word2Vec # we will use word2vec to vectorize words
from gensim.utils import simple_preprocess # and simple preprocess before that to tokenize and remove punctuation

In [8]:
def preprocess_text(text):
    return simple_preprocess(text, deacc=True)  # tokenize and remove punctuation

In [9]:
training_dataset['Question_processed'] = training_dataset['Question'].apply(preprocess_text)
training_dataset['Similar/Dissimilar_processed'] = training_dataset['Similar/Dissimilar'].apply(preprocess_text)

In [10]:
training_dataset

Unnamed: 0,Question,Similar/Dissimilar,Output,Question_processed,Similar/Dissimilar_processed
0,How Much Can You Borrow From Your Life Insuran...,How much of your life insurance policy may you...,1,"[how, much, can, you, borrow, from, your, life...","[how, much, of, your, life, insurance, policy,..."
1,Can I Get Life Insurance If I Have Lung Cancer?,Can The IRS Go After Life Insurance Proceeds?,0,"[can, get, life, insurance, if, have, lung, ca...","[can, the, irs, go, after, life, insurance, pr..."
2,Does Medicaid Pay Part B Of Medicare?,Does Medicaid Cover Medicare Part B?,1,"[does, medicaid, pay, part, of, medicare]","[does, medicaid, cover, medicare, part]"
3,Does Health Insurance Increase With Age?,Does Age Affect the Cost of Health Insurance?,1,"[does, health, insurance, increase, with, age]","[does, age, affect, the, cost, of, health, ins..."
4,How Much Life Insurance Should You Carry?,What Level of Life Insurance Is Adequate?,1,"[how, much, life, insurance, should, you, carry]","[what, level, of, life, insurance, is, adequate]"
...,...,...,...,...,...
195,Is Disability Insurance The Same As Workers Co...,Is Workers Compensation and Disability Insuran...,1,"[is, disability, insurance, the, same, as, wor...","[is, workers, compensation, and, disability, i..."
196,How Much To Budget For Homeowners Insurance?,What Is Good Renters Insurance?,0,"[how, much, to, budget, for, homeowners, insur...","[what, is, good, renters, insurance]"
197,Why Is Boys Car Insurance Higher?,Why Does Boy's Auto Insurance Cost More?,1,"[why, is, boys, car, insurance, higher]","[why, does, boy, auto, insurance, cost, more]"
198,How Much Does Life Insurance Typically Cost?,How Much Is The Average Cost of Life Insurance?,1,"[how, much, does, life, insurance, typically, ...","[how, much, is, the, average, cost, of, life, ..."


### Load a Word2Vec Model

In [11]:
from gensim.models import KeyedVectors

In [12]:
# load pre-trained Word2Vec model -> download from https://code.google.com/archive/p/word2vec/
word2vec_model = KeyedVectors.load_word2vec_format('~/DEV/envs/pmf-projekat/GoogleNews-vectors-negative300.bin', binary=True)

### Transform Text to Vectors

In [13]:
def get_average_vector(tokens, model): # function to vectorize words
    vectors = [model[token] for token in tokens if token in model]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [14]:
training_dataset['Question_vector'] = training_dataset['Question_processed'].apply(lambda x: get_average_vector(x, word2vec_model))
training_dataset['Similar/Dissimilar_vector'] = training_dataset['Similar/Dissimilar_processed'].apply(lambda x: get_average_vector(x, word2vec_model))

def vectors_to_dataframe(vectors): # we convert vectors to a pandas df
    return pd.DataFrame(vectors.tolist())

question_vectors_df = vectors_to_dataframe(training_dataset['Question_vector'])
similar_dissimilar_vectors_df = vectors_to_dataframe(training_dataset['Similar/Dissimilar_vector'])

training_dataset_with_vectors = pd.concat([training_dataset, question_vectors_df, similar_dissimilar_vectors_df], axis=1)

training_dataset_with_vectors.drop(['Question', 'Similar/Dissimilar', 'Question_processed', 'Similar/Dissimilar_processed'], axis=1, inplace=True)
training_dataset_for_siamese = training_dataset_with_vectors[['Question_vector', 'Similar/Dissimilar_vector', 'Output']]
training_dataset_for_siamese

Unnamed: 0,Question_vector,Similar/Dissimilar_vector,Output
0,"[0.11574707, 0.03118372, 0.04538574, 0.2177246...","[0.11794705, 0.04127333, 0.039252385, 0.254340...",1
1,"[-0.014434814, 0.047546387, 0.011581421, 0.126...","[0.075164795, 0.028930664, 0.0839386, 0.129486...",0
2,"[0.044830322, -0.022033691, 0.13494873, 0.2827...","[0.04151001, 0.0076171877, 0.15115967, 0.23105...",1
3,"[0.03894043, 0.052286785, -0.020385742, 0.1126...","[0.06964983, 0.05988421, 0.031354632, 0.053048...",1
4,"[0.09109933, 0.050598145, 0.0584586, 0.1593191...","[-0.051274616, 0.0046564736, 0.055216473, 0.07...",1
...,...,...,...
195,"[0.07375717, -0.015579224, 0.008071899, 0.1252...","[0.09893417, -0.017501831, 0.02583313, 0.13865...",1
196,"[0.175649, 0.037826538, 0.022176107, 0.1197102...","[0.10663452, -0.0136352535, 0.028833007, 0.076...",0
197,"[0.028518677, 0.016662598, 0.069234215, 0.0493...","[0.14561245, 0.024239676, 0.07048689, 0.083722...",1
198,"[0.13026646, 0.046805244, 0.008475168, 0.12974...","[0.08953476, 0.021514893, 0.022491455, 0.10862...",1


In [15]:
training_dataset_for_siamese.iloc[150][1].shape # we explore one item in our vectorized dataset

  training_dataset_for_siamese.iloc[150][1].shape # we explore one item in our vectorized dataset


(300,)

In [16]:
training_dataset_for_siamese.iloc[150][1]

  training_dataset_for_siamese.iloc[150][1]


array([ 6.79524764e-02,  5.03859520e-02,  2.80965175e-02,  4.95198555e-02,
        6.84102401e-02, -3.18400078e-02, -4.67936210e-02, -4.32942696e-02,
        1.38346359e-01,  7.63549805e-02,  9.47672501e-02,  5.49316406e-04,
       -1.12406410e-01,  7.46459961e-02, -8.94368514e-02, -1.74357090e-02,
        1.79819748e-01,  9.49401855e-02, -2.67130528e-02, -9.79668275e-02,
       -3.20383720e-02,  4.94384766e-02,  4.59187813e-02,  1.34684248e-02,
        9.81623307e-02,  5.12695312e-03, -4.60713692e-02,  6.81864396e-02,
       -9.19189453e-02, -8.04901123e-04, -1.27766924e-02,  9.01285838e-03,
       -1.92260742e-02, -8.63707885e-02, -3.01920567e-02,  2.47802734e-02,
        8.78601074e-02,  6.87662745e-03, -3.70279960e-02,  5.10253906e-02,
       -4.18294258e-02,  2.44954433e-02,  1.14176430e-01,  7.74943009e-02,
       -1.38468429e-01, -1.32573441e-01, -1.24944048e-02, -2.86661778e-02,
        3.83809395e-02, -6.25000000e-02,  7.95288086e-02, -3.14432792e-02,
       -1.10646568e-01,  

### Modeling

In [17]:
from tensorflow.keras.layers import Input, Dense, Subtract, Multiply, Lambda
from tensorflow.keras.models import Model, Sequential

### Define the Base Network

In [18]:
def create_base_network(input_shape):
    model = Sequential()
    model.add(Input(shape=input_shape))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    return model

### Define the Siamese Network

In [19]:
def create_siamese_network(input_shape):
    input_a = Input(shape=input_shape) # tensor for the input vector a
    input_b = Input(shape=input_shape) # tensor for the input vector b
     
    base_network = create_base_network(input_shape) # base network which shares weights
    
    processed_a = base_network(input_a) # processing of input a through the base network
    processed_b = base_network(input_b) # processing of input b through the base network

    # now we need to capture the relationship between the two input vectors
    diff = Subtract()([processed_a, processed_b]) # element wise absolute difference between the processed vectors
    prod = Multiply()([processed_a, processed_b]) # element wise product of the processed vectors
    concat = tf.keras.layers.Concatenate()([diff, prod]) # concatenating the difference and product (to consider them both)
    
    final_network = Sequential() # defining the final part of the network
    final_network.add(Dense(128, activation='relu', input_shape=(concat.shape[-1],)))  # First dense layer
    final_network.add(Dense(128, activation='relu'))  # Second dense layer
    final_network.add(Dense(1, activation='sigmoid'))  # Final sigmoid layer for output

    output = final_network(concat) # output layer

    siamese_network = Model(inputs=[input_a, input_b], outputs=output) # creating the siamese network model
    return siamese_network

### Compile the Model

In [20]:
input_shape = (300,) # vector length of 300
siamese_network = create_siamese_network(input_shape)
siamese_network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
siamese_network.summary()

2024-08-11 16:10:32.032574: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 sequential (Sequential)        (None, 128)          71552       ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 subtract (Subtract)            (None, 128)          0           ['sequential[0][0]',         

### Prepare dataset for training

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
question_vectors = np.array(training_dataset_for_siamese['Question_vector'].tolist()) # extract question vectors
similar_vectors = np.array(training_dataset_for_siamese['Similar/Dissimilar_vector'].tolist()) # extract similar/dissimilar vectors
labels = np.array(training_dataset_for_siamese['Output'].tolist()) # extract labels

X_train_q, X_val_q, X_train_s, X_val_s, y_train, y_val = train_test_split(
    question_vectors, similar_vectors, labels, test_size=0.2, random_state=33) # split the dataset into training and validation sets

X_train_1 = np.array(X_train_q)
X_train_2 = np.array(X_train_s)
X_val_1 = np.array(X_val_q)
X_val_2 = np.array(X_val_s)

### Train the model

In [23]:
siamese_network.fit([X_train_1, X_train_2], y_train, 
                    validation_data=([X_val_1, X_val_2], y_val), 
                    epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x177d59eb0>

### Evaluate the Model

In [24]:
loss, accuracy = siamese_network.evaluate([X_val_1, X_val_2], y_val)
print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')

Validation Loss: 1.3609721660614014
Validation Accuracy: 0.75


### Save model weights

In [25]:
siamese_network.save('model/siamese_model.h5')

### Using the model

In [26]:
def preprocess_question(question, word2vec_model): # first we need to preprocess the questions
    tokens = question.lower().split()
    vector = np.mean([word2vec_model[token] for token in tokens if token in word2vec_model], axis=0)
    return vector

new_question_1 = "Why would I need insurance such as life?" # example question 1
new_question_2 = "What is the purpose of insuring life?" # example question 2

new_question_vector_1 = preprocess_question(new_question_1, word2vec_model)
new_question_vector_2 = preprocess_question(new_question_2, word2vec_model)

new_question_vector_1 = np.expand_dims(new_question_vector_1, axis=0) # reshape to match the input expected by the model
new_question_vector_2 = np.expand_dims(new_question_vector_2, axis=0) # reshape to match the input expected by the model

similarity_score = siamese_network.predict([new_question_vector_1, new_question_vector_2]) # make prediction

if similarity_score > 0.5:
    print("Questions are similar.")
else:
    print("Questions are dissimilar.")

Questions are similar.


In [27]:
similarity_score

array([[0.9907325]], dtype=float32)