In [1]:
import tensorflow as tf

2024-08-11 13:01:26.564161: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd
import numpy as np

### Loading dataset

In [3]:
training_dataset = pd.read_csv('output/training_dataset_final.csv')

In [4]:
training_dataset

Unnamed: 0,Question,Similar/Dissimilar,Output
0,Where Do You Get Disability Insurance?,Is Long Term Disability Insurance The Same As ...,0
1,Does Each Tenant Need Renters Insurance?,Does Your Spouse Have To Be Your Beneficiary F...,0
2,Can Life Insurance Refuse To Pay?,How Much Is Enough Auto Insurance Coverage?,0
3,Does Long Term Care Insurance Cover Assisted L...,Is Long Term Care Insurance Regulated?,0
4,How Do I Choose A Medigap Plan?,Can You Change From Medicare Advantage To Medi...,0
...,...,...,...
195,What To Look For In A Good Health Insurance Plan?,What Qualifies As A Quality Health Insurance P...,1
196,Does Suze Orman Hate Whole Life Insurance?,Is Whole Life Insurance Something Suze Orman H...,1
197,How Much Does It Cost To Add A Named Driver To...,What Is The Price Of Adding A Named Driver To ...,1
198,Is Disability Insurance The Same As Workers Co...,Is Workers Compensation and Disability Insuran...,1


### Reshuffling our dataset

In [5]:
training_dataset = training_dataset.sample(frac=1, random_state=6).reset_index(drop=True)

In [6]:
training_dataset

Unnamed: 0,Question,Similar/Dissimilar,Output
0,How Much Can You Borrow From Your Life Insuran...,How much of your life insurance policy may you...,1
1,Can I Get Life Insurance If I Have Lung Cancer?,Can The IRS Go After Life Insurance Proceeds?,0
2,Does Medicaid Pay Part B Of Medicare?,Does Medicaid Cover Medicare Part B?,1
3,Does Health Insurance Increase With Age?,Does Age Affect the Cost of Health Insurance?,1
4,How Much Life Insurance Should You Carry?,What Level of Life Insurance Is Adequate?,1
...,...,...,...
195,Is Disability Insurance The Same As Workers Co...,Is Workers Compensation and Disability Insuran...,1
196,How Much To Budget For Homeowners Insurance?,What Is Good Renters Insurance?,0
197,Why Is Boys Car Insurance Higher?,Why Does Boy's Auto Insurance Cost More?,1
198,How Much Does Life Insurance Typically Cost?,How Much Is The Average Cost of Life Insurance?,1


### Text preprocess

In [7]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [8]:
# Example preprocessing function
def preprocess_text(text):
    return simple_preprocess(text, deacc=True)  # Tokenize and remove punctuation

In [9]:
# Apply preprocessing to your dataset
training_dataset['Question_processed'] = training_dataset['Question'].apply(preprocess_text)
training_dataset['Similar/Dissimilar_processed'] = training_dataset['Similar/Dissimilar'].apply(preprocess_text)

In [10]:
training_dataset

Unnamed: 0,Question,Similar/Dissimilar,Output,Question_processed,Similar/Dissimilar_processed
0,How Much Can You Borrow From Your Life Insuran...,How much of your life insurance policy may you...,1,"[how, much, can, you, borrow, from, your, life...","[how, much, of, your, life, insurance, policy,..."
1,Can I Get Life Insurance If I Have Lung Cancer?,Can The IRS Go After Life Insurance Proceeds?,0,"[can, get, life, insurance, if, have, lung, ca...","[can, the, irs, go, after, life, insurance, pr..."
2,Does Medicaid Pay Part B Of Medicare?,Does Medicaid Cover Medicare Part B?,1,"[does, medicaid, pay, part, of, medicare]","[does, medicaid, cover, medicare, part]"
3,Does Health Insurance Increase With Age?,Does Age Affect the Cost of Health Insurance?,1,"[does, health, insurance, increase, with, age]","[does, age, affect, the, cost, of, health, ins..."
4,How Much Life Insurance Should You Carry?,What Level of Life Insurance Is Adequate?,1,"[how, much, life, insurance, should, you, carry]","[what, level, of, life, insurance, is, adequate]"
...,...,...,...,...,...
195,Is Disability Insurance The Same As Workers Co...,Is Workers Compensation and Disability Insuran...,1,"[is, disability, insurance, the, same, as, wor...","[is, workers, compensation, and, disability, i..."
196,How Much To Budget For Homeowners Insurance?,What Is Good Renters Insurance?,0,"[how, much, to, budget, for, homeowners, insur...","[what, is, good, renters, insurance]"
197,Why Is Boys Car Insurance Higher?,Why Does Boy's Auto Insurance Cost More?,1,"[why, is, boys, car, insurance, higher]","[why, does, boy, auto, insurance, cost, more]"
198,How Much Does Life Insurance Typically Cost?,How Much Is The Average Cost of Life Insurance?,1,"[how, much, does, life, insurance, typically, ...","[how, much, is, the, average, cost, of, life, ..."


### Load a Word2Vec Model

In [11]:
from gensim.models import KeyedVectors

In [12]:
# Load pre-trained Word2Vec model (Google's model for example)
# Download the model from https://code.google.com/archive/p/word2vec/
word2vec_model = KeyedVectors.load_word2vec_format('~/Desktop/pmf-projekat/GoogleNews-vectors-negative300.bin', binary=True)

### Transform Text to Vectors

In [13]:
def get_average_vector(tokens, model):
    # Get the average vector for the tokens
    vectors = [model[token] for token in tokens if token in model]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [14]:
# Apply to your dataset
training_dataset['Question_vector'] = training_dataset['Question_processed'].apply(lambda x: get_average_vector(x, word2vec_model))
training_dataset['Similar/Dissimilar_vector'] = training_dataset['Similar/Dissimilar_processed'].apply(lambda x: get_average_vector(x, word2vec_model))

# Convert vectors to a DataFrame
def vectors_to_dataframe(vectors):
    return pd.DataFrame(vectors.tolist())

# Add vector columns to the original DataFrame
question_vectors_df = vectors_to_dataframe(training_dataset['Question_vector'])
similar_dissimilar_vectors_df = vectors_to_dataframe(training_dataset['Similar/Dissimilar_vector'])

# Concatenate the original DataFrame with the vector columns
training_dataset_with_vectors = pd.concat([training_dataset, question_vectors_df, similar_dissimilar_vectors_df], axis=1)

# Drop the original text columns if no longer needed
training_dataset_with_vectors.drop(['Question', 'Similar/Dissimilar', 'Question_processed', 'Similar/Dissimilar_processed'], axis=1, inplace=True)

In [15]:
from sklearn.model_selection import StratifiedShuffleSplit

In [16]:
training_dataset_with_vectors

Unnamed: 0,Output,Question_vector,Similar/Dissimilar_vector,0,1,2,3,4,5,6,...,290,291,292,293,294,295,296,297,298,299
0,1,"[0.11574707, 0.03118372, 0.04538574, 0.2177246...","[0.11794705, 0.04127333, 0.039252385, 0.254340...",0.115747,0.031184,0.045386,0.217725,-0.043091,0.004211,0.106978,...,-0.149631,0.115262,-0.080648,-0.074626,-0.099013,0.033732,0.089410,-0.049642,-0.048876,-0.005995
1,0,"[-0.014434814, 0.047546387, 0.011581421, 0.126...","[0.075164795, 0.028930664, 0.0839386, 0.129486...",-0.014435,0.047546,0.011581,0.126465,-0.049133,0.015884,0.088886,...,-0.051834,0.132748,-0.114693,-0.035271,-0.047287,0.064919,0.055443,-0.091148,-0.023445,-0.080170
2,1,"[0.044830322, -0.022033691, 0.13494873, 0.2827...","[0.04151001, 0.0076171877, 0.15115967, 0.23105...",0.044830,-0.022034,0.134949,0.282715,0.049219,0.039331,0.105225,...,-0.067221,-0.009961,-0.105713,0.106250,-0.040649,0.139941,-0.039624,0.072021,0.010937,0.045605
3,1,"[0.03894043, 0.052286785, -0.020385742, 0.1126...","[0.06964983, 0.05988421, 0.031354632, 0.053048...",0.038940,0.052287,-0.020386,0.112630,-0.018921,-0.017415,0.085103,...,-0.011108,0.096592,-0.041112,-0.039830,-0.050947,0.181178,0.026079,-0.025321,0.069083,-0.032819
4,1,"[0.09109933, 0.050598145, 0.0584586, 0.1593191...","[-0.051274616, 0.0046564736, 0.055216473, 0.07...",0.091099,0.050598,0.058459,0.159319,-0.024030,0.011475,0.118931,...,-0.148356,0.043030,-0.100871,-0.006795,-0.118693,0.171387,-0.009450,0.043294,-0.051158,-0.037476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1,"[0.07375717, -0.015579224, 0.008071899, 0.1252...","[0.09893417, -0.017501831, 0.02583313, 0.13865...",0.073757,-0.015579,0.008072,0.125252,0.039368,-0.026642,0.108185,...,-0.017151,0.088608,-0.079964,-0.032604,-0.081200,0.099742,-0.021505,0.065735,0.054850,-0.043747
196,0,"[0.175649, 0.037826538, 0.022176107, 0.1197102...","[0.10663452, -0.0136352535, 0.028833007, 0.076...",0.175649,0.037827,0.022176,0.119710,0.003438,-0.044271,0.145574,...,-0.047180,0.095654,-0.113672,-0.007666,-0.022070,0.115186,0.033636,-0.010937,0.112337,-0.116406
197,1,"[0.028518677, 0.016662598, 0.069234215, 0.0493...","[0.14561245, 0.024239676, 0.07048689, 0.083722...",0.028519,0.016663,0.069234,0.049316,-0.010813,-0.071086,0.046305,...,-0.083008,0.058027,-0.017473,-0.006522,-0.077488,0.137870,0.070112,0.025966,0.008624,-0.074428
198,1,"[0.13026646, 0.046805244, 0.008475168, 0.12974...","[0.08953476, 0.021514893, 0.022491455, 0.10862...",0.130266,0.046805,0.008475,0.129743,-0.022452,0.038413,0.160505,...,-0.174866,0.006378,-0.096291,-0.018188,-0.102364,0.142830,0.066513,0.004547,0.007454,-0.059052


In [17]:
# Keep only the relevant columns
training_dataset_for_siamese = training_dataset_with_vectors[['Question_vector', 'Similar/Dissimilar_vector', 'Output']]

# Now you have a simplified DataFrame with only the necessary columns
training_dataset_for_siamese

Unnamed: 0,Question_vector,Similar/Dissimilar_vector,Output
0,"[0.11574707, 0.03118372, 0.04538574, 0.2177246...","[0.11794705, 0.04127333, 0.039252385, 0.254340...",1
1,"[-0.014434814, 0.047546387, 0.011581421, 0.126...","[0.075164795, 0.028930664, 0.0839386, 0.129486...",0
2,"[0.044830322, -0.022033691, 0.13494873, 0.2827...","[0.04151001, 0.0076171877, 0.15115967, 0.23105...",1
3,"[0.03894043, 0.052286785, -0.020385742, 0.1126...","[0.06964983, 0.05988421, 0.031354632, 0.053048...",1
4,"[0.09109933, 0.050598145, 0.0584586, 0.1593191...","[-0.051274616, 0.0046564736, 0.055216473, 0.07...",1
...,...,...,...
195,"[0.07375717, -0.015579224, 0.008071899, 0.1252...","[0.09893417, -0.017501831, 0.02583313, 0.13865...",1
196,"[0.175649, 0.037826538, 0.022176107, 0.1197102...","[0.10663452, -0.0136352535, 0.028833007, 0.076...",0
197,"[0.028518677, 0.016662598, 0.069234215, 0.0493...","[0.14561245, 0.024239676, 0.07048689, 0.083722...",1
198,"[0.13026646, 0.046805244, 0.008475168, 0.12974...","[0.08953476, 0.021514893, 0.022491455, 0.10862...",1


In [18]:
training_dataset_for_siamese.iloc[150][1].shape

  training_dataset_for_siamese.iloc[150][1].shape


(300,)

### Modeling

In [19]:
from tensorflow.keras.layers import Input, Dense, Subtract, Multiply, Lambda
from tensorflow.keras.models import Model

### Define the Base Network

In [20]:
def create_base_network(input_shape):
    input = Input(shape=input_shape)
    x = Dense(128, activation='relu')(input)
    x = Dense(128, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    return Model(input, x)

### Define the Siamese Network

In [21]:
def create_siamese_network(input_shape):
    # Define the tensors for the two input vectors
    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape)
    
    # Create the base network to share weights
    base_network = create_base_network(input_shape)
    
    # Process the inputs through the base network
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    
    # Calculate the absolute difference between the processed vectors
    diff = Subtract()([processed_a, processed_b])
    
    # Calculate the product of the processed vectors
    prod = Multiply()([processed_a, processed_b])
    
    # Concatenate the difference and product
    concat = tf.keras.layers.Concatenate()([diff, prod])
    
    # Pass the concatenated vector through a dense layer
    x = Dense(128, activation='relu')(concat)
    x = Dense(128, activation='relu')(x)
    
    # Add a final sigmoid layer for the output
    output = Dense(1, activation='sigmoid')(x)
    
    # Create the Siamese network model
    siamese_network = Model(inputs=[input_a, input_b], outputs=output)
    return siamese_network

### Compile the Model

In [22]:
input_shape = (300,)  # Assuming the vectors are of length 300
siamese_network = create_siamese_network(input_shape)
siamese_network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
siamese_network.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 model (Functional)             (None, 128)          71552       ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 subtract (Subtract)            (None, 128)          0           ['model[0][0]',            

2024-08-11 13:06:40.977352: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Prepare dataset for training

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Extract the input vectors and labels
question_vectors = np.array(training_dataset_for_siamese['Question_vector'].tolist())
similar_vectors = np.array(training_dataset_for_siamese['Similar/Dissimilar_vector'].tolist())
labels = np.array(training_dataset_for_siamese['Output'].tolist())

# Split the data into training and validation sets
X_train_q, X_val_q, X_train_s, X_val_s, y_train, y_val = train_test_split(
    question_vectors, similar_vectors, labels, test_size=0.2, random_state=42)

# Prepare the inputs for the model
X_train_1 = np.array(X_train_q)
X_train_2 = np.array(X_train_s)
X_val_1 = np.array(X_val_q)
X_val_2 = np.array(X_val_s)

### Train the model

In [27]:
siamese_network.fit([X_train_1, X_train_2], y_train, 
                    validation_data=([X_val_1, X_val_2], y_val), 
                    epochs=100, batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x175826d30>

### Evaluate the Model

In [28]:
loss, accuracy = siamese_network.evaluate([X_val_1, X_val_2], y_val)
print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')

Validation Loss: 1.9857566356658936
Validation Accuracy: 0.7749999761581421


### Using the model

In [29]:
def preprocess_question(question, word2vec_model):
    # Tokenize and preprocess the question (this assumes you used the same preprocessing during training)
    tokens = question.lower().split()  # Adjust this to match your preprocessing
    vector = np.mean([word2vec_model[token] for token in tokens if token in word2vec_model], axis=0)
    return vector

# Example questions
new_question_1 = "What is the capital of France?"
new_question_2 = "Which city is the capital of France?"

# Preprocess the questions to obtain their vector representations
new_question_vector_1 = preprocess_question(new_question_1, word2vec_model)
new_question_vector_2 = preprocess_question(new_question_2, word2vec_model)

# Reshape vectors to match the input shape expected by the model
new_question_vector_1 = np.expand_dims(new_question_vector_1, axis=0)
new_question_vector_2 = np.expand_dims(new_question_vector_2, axis=0)

# Make prediction
similarity_score = siamese_network.predict([new_question_vector_1, new_question_vector_2])

# Interpret the result
if similarity_score > 0.5:
    print("The questions are similar.")
else:
    print("The questions are dissimilar.")

The questions are similar.


In [30]:
similarity_score

array([[0.69653696]], dtype=float32)