In [2]:
 ! pip install faiss-cpu
 ! pip install sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0
Defaulting to user installation because normal site-packages is not writeable


# Cleaning the titles of a csv file

In [3]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import faiss
import numpy as np

# Load the reference dataset from a CSV file
reference_data = pd.read_csv('all_titles.csv')

# Create a DataFrame with specific columns from the reference dataset
# This DataFrame contains original titles and their cleaned versions
reference_df = pd.DataFrame(reference_data, columns=['Title', 'Cleaned_Title'])

# Load the dirty dataset from a CSV file
# This dataset contains titles that may need cleaning or standardization
dirty_data = pd.read_csv('Insurance.csv')

# Convert the 'Title' column to a list of strings from the dirty dataset
# These titles will be processed to find their cleaned versions
dirty_titles = dirty_data['Title'].astype(str).tolist()

# Initialize the sentence transformer model for encoding text
# This model converts titles into numerical vectors for comparison
encoder = SentenceTransformer("paraphrase-mpnet-base-v2")

# Encode the titles from the reference dataset into vectors
# These vectors represent the semantic meaning of the titles
vectors_reference = encoder.encode(reference_df['Title'].tolist())

# Get the dimensionality of the vectors
# This is needed to create a FAISS index with the correct dimensions
vector_dimension = vectors_reference.shape[1]

# Create a FAISS index for L2 distance (Euclidean distance) for the reference dataset
# This index allows efficient similarity searches between vectors
index_reference = faiss.IndexFlatL2(vector_dimension)

# Convert vectors to float32 for compatibility with FAISS, then normalize them
# Normalization ensures that the distance calculations are scale-invariant
vectors_reference = vectors_reference.astype(np.float32)
faiss.normalize_L2(vectors_reference)

# Add the normalized vectors to the FAISS index
# This step prepares the index for querying
index_reference.add(vectors_reference)

# Encode the titles from the dirty dataset into vectors
# Similar to the reference dataset, these vectors will be used for searching
vectors_dirty = encoder.encode(dirty_titles)

# Ensure the dirty vectors are of type float32 and normalize them
# This ensures compatibility and effectiveness of distance calculations
vectors_dirty = vectors_dirty.astype(np.float32)
faiss.normalize_L2(vectors_dirty)

# Number of nearest neighbors to find
# k=1 means we are looking for the closest match in the reference dataset
k = 1

# Perform the search for each title in the dirty dataset against the reference dataset
# This loop finds the closest reference title for each dirty title
ann_results = []
for vector in vectors_dirty:
    distances, indices = index_reference.search(vector.reshape(1, -1), k=k)
    ann_results.append((indices[0][0], distances[0][0]))

# Create a DataFrame with the ANN results
# This DataFrame contains indices of the closest matches and their distances
ann_df = pd.DataFrame(ann_results, columns=['ann', 'distance'])

# Add the 'Cleaned_Title' from the reference dataset to the ANN results
# This step retrieves the cleaned title for each dirty title based on the closest match
ann_df['Cleaned_Title'] = ann_df['ann'].apply(lambda idx: reference_df.iloc[idx]['Cleaned_Title'])

# Merge the ANN results with the original dirty DataFrame to get the titles
# This allows us to compare the original and cleaned titles side by side
merged_df = pd.merge(ann_df, dirty_data, left_index=True, right_index=True)

# Display the top results
# This shows the original titles, their cleaned versions, and the distance of the match
merged_df[['Title', 'Cleaned_Title', 'distance']].value_counts()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Title                                                  Cleaned_Title  distance    
Chief Executive Officer                                CEO            2.192104e-13    82
                                                                      2.098326e-13    60
CEO                                                    CEO            2.877439e-13    56
Chief Operating Officer                                COO            2.683821e-13    37
Founder                                                Founder        3.178171e-13    32
                                                                                      ..
Co-Founder and CEO                                     Co-Founder     2.483859e-02     1
Co-Founder and CFO                                     Co-Founder     6.762239e-02     1
Co-Founder and COO                                     Co-Founder     1.254324e-01     1
Co-Founder and CUO at Amphitrite Underwriting Limited  Co-Founder     0.000000e+00     1
☂️ Technical Co-Founder    

# Cleaning a Single Title with IndexFlatL2

In [4]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import faiss
import numpy as np

# Load dataset from a CSV file
data = pd.read_csv('all_titles.csv')

In [19]:
# Create a DataFrame with specific columns from the dataset
df = pd.DataFrame(data, columns=['Title', 'Cleaned_Title'])

# Convert the 'Title' column to a list of strings
title = df['Title'].astype(str).tolist()

# Initialize the sentence transformer model for encoding text
# This model is used to convert text into numerical vectors
encoder = SentenceTransformer("paraphrase-mpnet-base-v2")

# Encode the titles into vectors
# Each title is converted into a high-dimensional vector
vectors = encoder.encode(title)

# Get the dimensionality of the vectors
# This is needed to create the FAISS index with the correct dimensions
vector_dimension = vectors.shape[1]

In [18]:

# Create a FAISS index for L2 distance (Euclidean distance)
# This index will allow us to perform efficient similarity searches
index = faiss.IndexFlatL2(vector_dimension)

# Convert vectors to float32 for compatibility with FAISS, then normalize them
# Normalization is important for distance calculations to be meaningful
vectors = vectors.astype(np.float32)
faiss.normalize_L2(vectors)

# Add the normalized vectors to the FAISS index
# This step is necessary before performing any searches
index.add(vectors)

# Define the search query
# This is the text we want to find similar titles for
search_text = 'Dimention director at XYZ'

# Encode the search query into a vector
# The query is also converted into a vector to perform the search
search_vector = encoder.encode([search_text])

# Ensure the search vector is of type float32 and normalize it
# Similar to the title vectors, the query vector is also normalized
search_vector = search_vector.astype(np.float32)
faiss.normalize_L2(search_vector.reshape(1, -1))

# Number of nearest neighbors to find
# This determines how many similar titles we want to retrieve
k = 3

# Perform the search
# This returns the distances and indices of the k nearest neighbors
distances, ann = index.search(search_vector, k=k)

# Create a DataFrame with the distances and indices of the nearest neighbors
# This DataFrame is used to display the results in a readable format
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})

# Merge the results with the original DataFrame to get the titles of the nearest neighbors
# This allows us to see the actual titles corresponding to the indices
merge = pd.merge(results, df, left_on='ann', right_index=True)

# Display the top results
# The results are sorted by distance, with the closest matches first
merge.head()

Unnamed: 0,distances,ann,Title,Cleaned_Title
0,1.081455,1592,"Senior Vice President, Creator Peripherals",Sr. VP of Creator Peripherals
1,1.119331,209,Division Operations Director,Division Operations Director
2,1.122915,1348,Director of DTC Operations,Director of DTC Operations


# Measuring and Comparing Speed 

## Measuring Speed with IndexFlatL2

In [7]:
%%time
D, I = index.search(search_vector, k)  # search
print(I)

[[ 780  572 1138 1133]]
CPU times: user 1.52 ms, sys: 17 µs, total: 1.54 ms
Wall time: 747 µs


## Speed with IndexIVFFlat (Faster, ~2000 training points )

In [17]:

nlist = 50  # how many cells
quantizer = faiss.IndexFlatL2(vector_dimension)
index = faiss.IndexIVFFlat(quantizer, vector_dimension, nlist)
index.train(vectors)  # train the index before adding vectors
index.add(vectors)
index.ntotal

k = 4

# Perform the search
distances, ann = index.search(search_vector, k=k)

# Create a DataFrame with the distances and indices of the nearest neighbors
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})

# Merge the results with the original DataFrame to get the titles of the nearest neighbors
merge = pd.merge(results, df, left_on='ann', right_index=True)

# Display the top results
merge.head()

Unnamed: 0,distances,ann,Title,Cleaned_Title
0,1.200172,169,Director of Data Science,Director of Data Science
1,1.231176,167,Director of Quantum Computing,Director of Quantum Computing
2,1.249123,172,Director of Nanotechnology,Director of Nanotechnology
3,1.277314,74,Chief Nanotechnology Officer,Chief Nanotechnology Officer


In [9]:
%%time
D, I = index.search(search_vector, k)  # search
print(I)

[[ 780  572 1138 1133]]
CPU times: user 0 ns, sys: 1.63 ms, total: 1.63 ms
Wall time: 1 ms


## Speed with IndexIVFPQ (fastest, with ~10000 training points)

In [10]:
m = 8  # number of centroid IDs in final compressed vectors
bits = 8 # number of bits in each centroid

quantizer = faiss.IndexFlatL2(vector_dimension)  # we keep the same L2 distance flat index
index = faiss.IndexIVFPQ(quantizer, vector_dimension, nlist, m, bits)
index.train(vectors)
index.add(vectors)

