In [2]:
pip install fuzzywuzzy python-Levenshtein textdistance


Collecting fuzzywuzzy
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-win_amd64.whl.metadata (3.6 kB)
Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-win_amd64.whl (100 kB)
Installing collected packages: fuzzywuzzy, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.27.1 fuzzywuzzy-0.18.0 python-Levenshtein-0.27.1
Note: you may need to restart the kernel to use updated packages.




In [3]:
import pandas as pd
from fuzzywuzzy import fuzz, process
from difflib import SequenceMatcher
import textdistance

In [4]:
# Toy job-postings table, written one posting per row and then transposed
# into the column-name -> list-of-values mapping handed to pd.DataFrame.
data = {
    column: list(values)
    for column, values in zip(
        ("Agency", "Business Title", "Job Description"),
        zip(*[
            ("Dept of Health", "Data Analyst",
             "Analyze public health data and create dashboards."),
            ("Information Tech", "Software Engineer",
             "Develop and maintain city-wide applications."),
            ("City Planning", "Urban Data Specialist",
             "Work with urban data to help city development."),
            ("Finance Dept", "Budget Analyst",
             "Analyze budget reports and assist in forecasting."),
            ("Environmental Dept", "Climate Data Scientist",
             "Study climate trends and produce reports."),
        ]),
    )
}


In [5]:
# Materialise the sample postings as a DataFrame (one row per posting).
df = pd.DataFrame(data=data)

In [6]:
# Preview the first five rows to sanity-check the three text columns.
df.head(n=5)

Unnamed: 0,Agency,Business Title,Job Description
0,Dept of Health,Data Analyst,Analyze public health data and create dashboards.
1,Information Tech,Software Engineer,Develop and maintain city-wide applications.
2,City Planning,Urban Data Specialist,Work with urban data to help city development.
3,Finance Dept,Budget Analyst,Analyze budget reports and assist in forecasting.
4,Environmental Dept,Climate Data Scientist,Study climate trends and produce reports.


In [7]:
# Create a combined corpus for matching: flatten the three text columns into
# one list, column by column (all agencies, then titles, then descriptions).
corpus = [
    text
    for column in ("Agency", "Business Title", "Job Description")
    for text in df[column].tolist()
]


In [8]:
# Search input: the free-text query that every matcher is run against.
# Note that it is all lowercase, while the corpus entries are capitalised.
search_term = "data analysis specialist"

In [9]:
def match_fuzzywuzzy(search, corpus, limit=3):
    """Rank ``corpus`` against ``search`` using the token-set-ratio scorer.

    Args:
        search: Query string to look for.
        corpus: Collection of candidate strings.
        limit: Maximum number of matches to return (default 3).

    Returns:
        Whatever ``process.extract`` returns for these arguments; in this
        notebook's run that is a list of ``(text, score)`` tuples, best
        match first.
    """
    top_matches = process.extract(
        search,
        corpus,
        scorer=fuzz.token_set_ratio,
        limit=limit,
    )
    return top_matches

def match_difflib(search, corpus, limit=3):
    """Rank ``corpus`` by ``difflib.SequenceMatcher`` similarity to ``search``.

    Args:
        search: Query string.
        corpus: Iterable of candidate strings.
        limit: Maximum number of matches to return (default 3).

    Returns:
        A list of ``(text, ratio)`` tuples sorted by descending ratio.
        Candidates with equal ratios keep their corpus order.
    """
    # autojunk=False: by default SequenceMatcher treats characters that are
    # "popular" in a second sequence of 200+ items as junk, which deflates
    # the ratio for long texts such as full job descriptions. Turning the
    # heuristic off keeps scores comparable between short titles and long
    # descriptions, at the cost of more work per comparison.
    scores = [
        (text, SequenceMatcher(None, search, text, autojunk=False).ratio())
        for text in corpus
    ]
    return sorted(scores, key=lambda pair: pair[1], reverse=True)[:limit]

def match_textdistance(search, corpus, similarity_func, limit=3):
    """Rank ``corpus`` against ``search`` with a pluggable similarity algorithm.

    Args:
        search: Query string.
        corpus: Iterable of candidate strings.
        similarity_func: Object exposing ``normalized_similarity(a, b)``;
            this notebook passes ``textdistance.jaccard`` and
            ``textdistance.cosine``.
        limit: Maximum number of matches to return (default 3).

    Returns:
        A list of ``(text, score)`` tuples ordered from highest to lowest
        score. Candidates with equal scores keep their corpus order.
    """
    def _score_of(pair):
        return pair[1]

    ranked = []
    for candidate in corpus:
        similarity = similarity_func.normalized_similarity(search, candidate)
        ranked.append((candidate, similarity))

    # list.sort is stable, so ties stay in corpus order even with reverse=True.
    ranked.sort(key=_score_of, reverse=True)
    return ranked[:limit]

In [10]:
# Run experiments: one ranked list (default top 3) per matching method,
# keyed by a display label and kept in reporting order.
results = dict([
    ("FuzzyWuzzy", match_fuzzywuzzy(search_term, corpus)),
    ("difflib", match_difflib(search_term, corpus)),
    (
        "TextDistance (Jaccard)",
        match_textdistance(search_term, corpus, textdistance.jaccard),
    ),
    (
        "TextDistance (Cosine)",
        match_textdistance(search_term, corpus, textdistance.cosine),
    ),
])

In [11]:
# Display results: a header line per method, then one match tuple per line.
for method in results:
    matches = results[method]
    print(f"\n--- {method} ---")
    for match in matches:
        print(match)


--- FuzzyWuzzy ---
('Urban Data Specialist', 83)
('Data Analyst', 67)
('Climate Data Scientist', 61)

--- difflib ---
('Urban Data Specialist', 0.5777777777777777)
('Data Analyst', 0.5555555555555556)
('Budget Analyst', 0.47368421052631576)

--- TextDistance (Jaccard) ---
('Urban Data Specialist', 0.5517241379310345)
('Climate Data Scientist', 0.4838709677419355)
('Develop and maintain city-wide applications.', 0.44680851063829785)

--- TextDistance (Cosine) ---
('Urban Data Specialist', 0.7126966450997984)
('Climate Data Scientist', 0.6527912098338668)
('Develop and maintain city-wide applications.', 0.6462303276414968)


In [12]:
import pandas as pd
from rapidfuzz import process, fuzz