In [49]:
 ! pip install faiss-cpu
 ! pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


The **SentenceTransformer** models **"paraphrase-mpnet-base-v2"** and **"all-MiniLM-L6-v2"** are both pre-trained models for generating sentence embeddings, but they differ in architecture and training data:

**paraphrase-mpnet-base-v2**: This model is based on the MPNet architecture, which is designed for pre-training language models using a self-supervised learning objective that combines masked language modeling and permutation language modeling. It is fine-tuned specifically for paraphrasing tasks, which means it is trained to understand sentences that have the same meaning but are expressed differently.

**all-MiniLM-L6-v2**: This model is a smaller version of the larger MiniLM models, with only 6 layers. It is designed to be more efficient while still providing high-quality sentence embeddings. It is trained on a variety of tasks and datasets, making it a more general-purpose model.

For working with titles, which are typically short and concise, the choice between the two models may depend on the specific requirements of your task:

If you need embeddings that are more sensitive to paraphrasing and semantic similarity, "paraphrase-mpnet-base-v2" might be more suitable.
If efficiency is a concern and you want a model that is faster and lighter but still performs well on a range of tasks, "all-MiniLM-L6-v2" could be a better choice.

# Cleaning the titles of a csv file

In [50]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import faiss
import numpy as np

reference_data = pd.read_csv('final_titles.csv')
reference_df = pd.DataFrame(reference_data, columns=['Title', 'Cleaned_Title'])

dirty_data = pd.read_csv('test_data.csv')
dirty_titles = dirty_data['Title'].astype(str).tolist()

encoder = SentenceTransformer("all-MiniLM-L6-v2")
vectors_reference = encoder.encode(reference_df['Title'].tolist())
vector_dimension = vectors_reference.shape[1]
index_reference = faiss.IndexFlatL2(vector_dimension)

vectors_reference = vectors_reference.astype(np.float32)
faiss.normalize_L2(vectors_reference)
index_reference.add(vectors_reference)

vectors_dirty = encoder.encode(dirty_titles)
vectors_dirty = vectors_dirty.astype(np.float32)
faiss.normalize_L2(vectors_dirty)

k = 1
ann_results = []
for vector in vectors_dirty:
    distances, indices = index_reference.search(vector.reshape(1, -1), k=k)
    similarity_scores = 1 - distances
    ann_results.append((indices[0][0], similarity_scores[0][0]))

ann_df = pd.DataFrame(ann_results, columns=['ann', 'similarity_score'])
ann_df['Cleaned_Title'] = ann_df['ann'].apply(lambda idx: reference_df.iloc[idx]['Cleaned_Title'])

merged_df = pd.merge(ann_df, dirty_data, left_index=True, right_index=True)
merged_df[['Title', 'Cleaned_Title', 'similarity_score']]

Unnamed: 0,Title,Cleaned_Title,similarity_score
0,Business Owner,Owner,0.578457
1,Small Business Owner,Owner,0.260696
2,Owner,Owner,1.000000
3,Creative Director,Creative Director,1.000000
4,"Director, HR Planning and Operations",Director of HR Planning and Operations,1.000000
...,...,...,...
63,Event Director,Event Director,1.000000
64,Director of Project Management,Director of Project Management,1.000000
65,Director of Sales,Director of Sales,1.000000
66,Founder and Director,Founder,0.977466


# Cleaning a Single Title with IndexFlatL2

In [51]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import faiss
import numpy as np

data = pd.read_csv('final_titles.csv')
df = pd.DataFrame(data, columns=['Title', 'Cleaned_Title'])
title = df['Title'].astype(str).tolist()
encoder = SentenceTransformer("all-MiniLM-L6-v2")
vectors = encoder.encode(title)
vector_dimension = vectors.shape[1]

In [52]:

index = faiss.IndexFlatL2(vector_dimension)

vectors = vectors.astype(np.float32)
faiss.normalize_L2(vectors)

index.add(vectors)

search_text = 'Director of manager,cofounder'

search_vector = encoder.encode([search_text])

search_vector = search_vector.astype(np.float32)
faiss.normalize_L2(search_vector.reshape(1, -1))

k = 5

distances, ann = index.search(search_vector, k=k)

# Convert distances to similarity scores
similarity_scores = 1 - distances

results = pd.DataFrame({'similarity_score': similarity_scores[0], 'ann': ann[0]})

merge = pd.merge(results, df, left_on='ann', right_index=True)

merge.head()

Unnamed: 0,similarity_score,ann,Title,Cleaned_Title
0,0.569984,1378,Manager and Director,Director
1,0.484653,1167,"Managing Director, COO",Managing Director
2,0.460247,1340,"Cofounder, CEO",Co-Founder
3,0.404546,1160,managing director,Managing Director
4,0.376013,1392,Manager of HR / COO,COO


# Measuring and Comparing Speed 

## Measuring Speed with IndexFlatL2

In [53]:
%%time
D, I = index.search(search_vector, k)  # search
print(I)

[[1378 1167 1340 1160 1392]]
CPU times: user 697 µs, sys: 12 µs, total: 709 µs
Wall time: 473 µs


## Speed with IndexIVFFlat (Faster, ~2000 training points )

In [54]:

nlist = 50  # how many cells
quantizer = faiss.IndexFlatL2(vector_dimension)
index = faiss.IndexIVFFlat(quantizer, vector_dimension, nlist)
index.train(vectors)  # train the index before adding vectors
index.add(vectors)
index.ntotal

k = 4

# Perform the search
distances, ann = index.search(search_vector, k=k)

# Create a DataFrame with the distances and indices of the nearest neighbors
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})

# Merge the results with the original DataFrame to get the titles of the nearest neighbors
merge = pd.merge(results, df, left_on='ann', right_index=True)

# Display the top results
merge.head()



Unnamed: 0,distances,ann,Title,Cleaned_Title
0,0.430016,1378,Manager and Director,Director
1,0.595454,1160,managing director,Managing Director
2,0.659228,578,director of crm,Director of CRM
3,0.675534,1219,CEO / Managing Director,CEO
