In [1]:
 ! pip install faiss-cpu
 ! pip install sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0
Defaulting to user installation because normal site-packages is not writeable


In [14]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import faiss
import numpy as np

# Load the reference dataset from a CSV file
reference_data = pd.read_csv('all_titles.csv')

# Keep only the two columns used below. Selecting with [[...]] raises a
# KeyError when a column is absent, whereas pd.DataFrame(df, columns=...)
# would silently create it filled with NaN.
reference_df = reference_data[['Title', 'Cleaned_Title']].copy()

# Load the dirty dataset from a CSV file
dirty_data = pd.read_csv('test_data.csv')

# Cast both title columns to str so every item handed to the encoder is a
# string (a missing value becomes the literal 'nan' instead of a float NaN).
dirty_titles = dirty_data['Title'].astype(str).tolist()
reference_titles = reference_df['Title'].astype(str).tolist()

# Initialize the sentence transformer model for encoding text
encoder = SentenceTransformer("paraphrase-mpnet-base-v2")

# Encode the reference titles; row i of the matrix belongs to row i of
# reference_df, which is what makes the positional lookup further down valid.
vectors_reference = encoder.encode(reference_titles)

# Get the dimensionality of the vectors
vector_dimension = vectors_reference.shape[1]

# IndexFlatL2 performs an exact (brute-force) search and reports *squared*
# Euclidean distances. Both sides are L2-normalized below, so the reported
# value equals 2 - 2 * cosine_similarity (0 means identical direction).
index_reference = faiss.IndexFlatL2(vector_dimension)

# FAISS needs float32; normalize_L2 rescales the rows in place.
vectors_reference = vectors_reference.astype(np.float32)
faiss.normalize_L2(vectors_reference)

# Add the normalized vectors to the FAISS index
index_reference.add(vectors_reference)

# Encode, convert and normalize the dirty titles the same way
vectors_dirty = encoder.encode(dirty_titles)
vectors_dirty = vectors_dirty.astype(np.float32)
faiss.normalize_L2(vectors_dirty)

# Number of nearest neighbors to find
k = 1

# Search all dirty titles in one batched call instead of a Python loop over
# single vectors. Both results have shape (number of dirty titles, k).
distances, indices = index_reference.search(vectors_dirty, k)

# FAISS uses label -1 when it cannot return k neighbours; a -1 would otherwise
# be read as "last reference row" by the positional lookup below.
assert (indices >= 0).all(), "FAISS returned no neighbour for at least one title"

# (reference row position, squared distance) of the best match per dirty title
ann_results = list(zip(indices[:, 0], distances[:, 0]))

# Create a DataFrame with the ANN results
ann_df = pd.DataFrame(ann_results, columns=['ann', 'distance'])

# Vectorized positional lookup of the matched reference row's 'Cleaned_Title'
ann_df['Cleaned_Title'] = reference_df['Cleaned_Title'].to_numpy()[indices[:, 0]]

# Merge the ANN results with the original dirty DataFrame (row-aligned on the
# index) to bring the original titles alongside their matches
merged_df = pd.merge(ann_df, dirty_data, left_index=True, right_index=True)

# Count how often each (Title, Cleaned_Title, distance) combination occurs,
# most frequent first
merged_df[['Title', 'Cleaned_Title', 'distance']].value_counts()

Title                                                   Cleaned_Title                           distance    
Managing Director                                       Managing Director                       0.000000e+00    4
Director                                                Director                                0.000000e+00    4
Chief Executive Officer                                 CEO                                     2.584935e-13    3
Business Owner                                          Owner                                   4.505834e-01    2
Founder                                                 Founder                                 0.000000e+00    2
Owner                                                   Owner                                   3.107757e-13    2
Director of Sales                                       Director of Sales                       0.000000e+00    2
Sales Director                                          Sales Director                       