In [92]:
!pip install annoy

Defaulting to user installation because normal site-packages is not writeable


In [113]:
from annoy import AnnoyIndex
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
title_df = pd.read_csv('./all_titles.csv')
to_compare_df = pd.read_csv('./Insurance.csv')

# Create TF-IDF vectors for the titles in title_df
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(title_df['Title'])

# Build Annoy index
num_features = tfidf_matrix.shape[1]
annoy_index = AnnoyIndex(num_features, 'angular')
for i in range(tfidf_matrix.shape[0]):
    annoy_index.add_item(i, tfidf_matrix[i].toarray()[0])
annoy_index.build(10) # 10 trees

# Function to find the nearest title and similarity score using Annoy
def find_nearest_title_and_score(row):
    vector = tfidf_vectorizer.transform([row['Title']]).toarray()[0]
    nearest_idx, distance = annoy_index.get_nns_by_vector(vector, 1, include_distances=True)
    nearest_title = title_df.iloc[nearest_idx[0]]['Title']
    similarity_score = 1 - distance[0] # Convert distance to similarity
    return nearest_title, similarity_score

# Apply the function to the 'to_compare_df' dataframe
to_compare_df[['cleaned_titles', 'similarity_score']] = to_compare_df.apply(
    lambda row: find_nearest_title_and_score(row), axis=1, result_type='expand')

to_compare_df[['Title','cleaned_titles', 'similarity_score']]

Unnamed: 0,Title,cleaned_titles,similarity_score
0,Founder and Chairman,Founder,0.111442
1,Co CEO,Co-Founder,-0.004821
2,Chief Executive Officer,Chief Executive Officer,1.000000
3,Founding Partner,Founding Partner,1.000000
4,CEO,CEO,1.000000
...,...,...,...
762,Chief Executive Officer,Chief Executive Officer,1.000000
763,Chief Executive Officer,Chief Executive Officer,1.000000
764,"Founding Director, COO & CTO",COO,0.245964
765,Group Chief Operating Officer,Chief Operating Officer,1.000000
