Task 1

In [1]:
import pandas as pd
import numpy as np

In [3]:
df_new_queries=pd.read_csv('new_queries.csv')
print(df_new_queries.head())
df_resolved_queries=pd.read_csv('resolved_queries.csv')
print(df_resolved_queries.head())

                             Variation_Query  Matches_With_Query_ID
0           Unabel to conect to the internet                      1
1                  Can’t connect to internet                      1
2                        Intenet not working                      1
3               Payment failed while chekout                      2
4  Payment did not go through during chckout                      2
   Query_ID                    Pre_Resolved_Query
0         1     Unable to connect to the internet
1         2        Payment failed during checkout
2         3     App crashes when opening settings
3         4   Forgot password and unable to reset
4         5  Unable to upload files to the server


In [4]:
!pip install rapidfuzz scikit-learn

import pandas as pd
import re
from rapidfuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



# Preprocessing Function

def preprocess(text):
    text = str(text).lower()               # lowercase
    text = re.sub(r"[^a-z0-9\s]", " ", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

df_new_queries['clean_query'] = df_new_queries['Variation_Query'].apply(preprocess)
df_resolved_queries['clean_resolved'] = df_resolved_queries['Pre_Resolved_Query'].apply(preprocess)


# 1. FUZZY MATCHING

def fuzzy_match(query, choices, scorer, threshold=80):
    match, score, idx = process.extractOne(query, choices, scorer=scorer)
    if score >= threshold:
        return match, score
    else:
        return None, score

fuzzy_results = []
for q in df_new_queries['clean_query']:
    match, score = fuzzy_match(q,
                               df_resolved_queries['clean_resolved'],
                               scorer=fuzz.token_set_ratio,
                               threshold=80)  # adjust threshold if needed
    fuzzy_results.append((q, match, score))

df_fuzzy = pd.DataFrame(fuzzy_results, columns=['New_Query', 'Matched_Resolved', 'Fuzzy_Score'])

print("\n--- Fuzzy Matching Results ---")
print(df_fuzzy)


# 2. TF-IDF + COSINE SIMILARITY

# Combine queries for vectorization
all_queries = df_resolved_queries['clean_resolved'].tolist()

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_queries)

cosine_results = []
for q in df_new_queries['clean_query']:
    q_vec = vectorizer.transform([q])
    cosine_sim = cosine_similarity(q_vec, tfidf_matrix)[0]
    best_idx = cosine_sim.argmax()
    best_score = cosine_sim[best_idx]
    matched_query = df_resolved_queries.iloc[best_idx]['clean_resolved']
    cosine_results.append((q, matched_query, best_score))

df_cosine = pd.DataFrame(cosine_results, columns=['New_Query', 'Matched_Resolved', 'Cosine_Similarity'])

print("\n--- TF-IDF + Cosine Similarity Results ---")
print(df_cosine)


# Combine results

df_combined = pd.concat([
    df_new_queries[['Variation_Query']],
    df_fuzzy[['Matched_Resolved', 'Fuzzy_Score']],
    df_cosine[['Matched_Resolved', 'Cosine_Similarity']]
], axis=1)

print("\n Combined Matching Results :")
print(df_combined)


Collecting rapidfuzz
  Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.0

--- Fuzzy Matching Results ---
                                    New_Query  \
0            unabel to conect to the internet   
1                   can t connect to internet   
2                         intenet not working   
3                payment failed while chekout   
4   payment did not go through during chckout   
5                  payment issue at check out   
6    application crashes when opening setings   
7            app crash when going to settings   
8            settings cause the app to chrash   
9               forgot passwrd and cant reset   

In [2]:
# Install dependencies
!pip install rapidfuzz scikit-learn nltk

import pandas as pd
import re
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from rapidfuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

# --------------------
# Load Data
# --------------------
df_new_queries = pd.read_csv('new_queries.csv')
df_resolved_queries = pd.read_csv('resolved_queries.csv')

# --------------------
# Preprocessing with Lemmatization
# --------------------
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmatized_tokens)

df_new_queries['clean_query'] = df_new_queries['Variation_Query'].apply(preprocess)
df_resolved_queries['clean_resolved'] = df_resolved_queries['Pre_Resolved_Query'].apply(preprocess)

# --------------------
# 1. FUZZY MATCHING (always return best match)
# --------------------
def fuzzy_best_match(query, choices, scorer, threshold=65):
    match, score, idx = process.extractOne(query, choices, scorer=scorer)
    flag = "yes" if score >= threshold else "no"
    return match, score, flag

fuzzy_results = []
for q in df_new_queries['clean_query']:
    match, score, flag = fuzzy_best_match(q,
                                          df_resolved_queries['clean_resolved'],
                                          scorer=fuzz.token_set_ratio,
                                          threshold=65)
    fuzzy_results.append((q, match, score, flag))

df_fuzzy = pd.DataFrame(fuzzy_results, columns=['New_Query_Clean', 'Fuzzy_Match', 'Fuzzy_Score', 'Fuzzy_Flag'])

# --------------------
# 2. TF-IDF + COSINE SIMILARITY
# --------------------
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_resolved_queries['clean_resolved'])

cosine_results = []
for q in df_new_queries['clean_query']:
    q_vec = vectorizer.transform([q])
    cosine_sim = cosine_similarity(q_vec, tfidf_matrix)[0]
    best_idx = cosine_sim.argmax()
    best_score = cosine_sim[best_idx]
    matched_query = df_resolved_queries.iloc[best_idx]['clean_resolved']
    cosine_results.append((q, matched_query, round(best_score, 3)))

df_cosine = pd.DataFrame(cosine_results, columns=['New_Query_Clean', 'Cosine_Match', 'Cosine_Similarity'])

# --------------------
# Merge Results
# --------------------
df_final = pd.concat([
    df_new_queries[['Variation_Query']],
    df_fuzzy[['Fuzzy_Match', 'Fuzzy_Score', 'Fuzzy_Flag']],
    df_cosine[['Cosine_Match', 'Cosine_Similarity']]
], axis=1)

print("\n--- Final Matching Results ---")
print(df_final.to_string(index=False))




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



--- Final Matching Results ---
                          Variation_Query                         Fuzzy_Match  Fuzzy_Score Fuzzy_Flag                        Cosine_Match  Cosine_Similarity
         Unabel to conect to the internet   unable to connect to the internet    94.915254        yes   unable to connect to the internet              0.839
                Can’t connect to internet   unable to connect to the internet    86.363636        yes   unable to connect to the internet              0.837
                      Intenet not working   unable to connect to the internet    48.979592         no   unable to connect to the internet              0.000
             Payment failed while chekout      payment failed during checkout    82.758621        yes      payment failed during checkout              0.707
Payment did not go through during chckout      payment failed during checkout    70.422535        yes      payment failed during checkout              0.707
               Payment iss

In [4]:
# --------------------
# Add Ground Truth from Query_ID
# --------------------
df_truth = df_new_queries.merge(
    df_resolved_queries,
    left_on="Matches_With_Query_ID",
    right_on="Query_ID",
    how="left"
)
df_truth = df_truth.rename(columns={"Pre_Resolved_Query": "Ground_Truth"})

# --------------------
# Accuracy Calculation
# --------------------
# Fuzzy Accuracy
fuzzy_correct = (df_fuzzy["Fuzzy_Match"].str.strip().str.lower() ==
                 df_truth["Ground_Truth"].apply(preprocess).str.strip().str.lower()).sum()
fuzzy_accuracy = fuzzy_correct / len(df_truth) * 100

# Cosine Accuracy
cosine_correct = (df_cosine["Cosine_Match"].str.strip().str.lower() ==
                  df_truth["Ground_Truth"].apply(preprocess).str.strip().str.lower()).sum()
cosine_accuracy = cosine_correct / len(df_truth) * 100

print("\n--- Accuracy Scores ---")
print(f"Fuzzy Matching Accuracy: {fuzzy_accuracy:.2f}% ({fuzzy_correct}/{len(df_truth)})")
print(f"Cosine Similarity Accuracy: {cosine_accuracy:.2f}% ({cosine_correct}/{len(df_truth)})")



--- Accuracy Scores ---
Fuzzy Matching Accuracy: 100.00% (20/20)
Cosine Similarity Accuracy: 100.00% (20/20)


Task 2

In [5]:
df_base_names=pd.read_csv('base_names.csv')
print(df_base_names.head())
df_name_variations=pd.read_csv('name_variations.csv')
print(df_name_variations.head())

   Base_Name_ID         Base_Name
0             1        John Smith
1             2    Jennifer Brown
2             3  Michael O'Connor
3             4      Maria Garcia
4             5        Robert Lee
      Variation Matches_With_Base_Name
0  Thomas  King            Thomas King
1    ThomasKing            Thomas King
2  Maria Garcia           Maria Garcia
3     MaryLewis             Mary Lewis
4      Nancy W.           Nancy Wright


In [6]:

!pip install rapidfuzz nltk

import pandas as pd
import re
import nltk
from rapidfuzz import fuzz, process

nltk.download('punkt')

# Load Data

df_base_names = pd.read_csv('base_names.csv')
df_name_variations = pd.read_csv('name_variations.csv')


# Preprocessing Function for Names

def preprocess_name(name):
    name = str(name).lower().strip()
    name = re.sub(r"[^a-z\s]", " ", name)  # remove punctuation/numbers
    tokens = nltk.word_tokenize(name)
    tokens = sorted(tokens)  # sort tokens so 'Smith John' == 'John Smith'
    return " ".join(tokens)

df_base_names['clean_base'] = df_base_names['Base_Name'].apply(preprocess_name)
df_name_variations['clean_variation'] = df_name_variations['Variation'].apply(preprocess_name)


# Fuzzy Matching Function

def fuzzy_best_match(query, choices, scorer, threshold=80):
    match, score, idx = process.extractOne(query, choices, scorer=scorer)
    flag = "yes" if score >= threshold else "no"
    return match, score, flag

fuzzy_results = []
for q in df_name_variations['clean_variation']:
    match, score, flag = fuzzy_best_match(q,
                                          df_base_names['clean_base'],
                                          scorer=fuzz.token_sort_ratio,
                                          threshold=80)
    fuzzy_results.append((q, match, score, flag))

df_fuzzy = pd.DataFrame(fuzzy_results, columns=['Variation_Clean', 'Fuzzy_Match_Clean', 'Fuzzy_Score', 'Fuzzy_Flag'])


# Merge Matches Back to Original Names

# Map clean matches back to original base names
clean_to_original = dict(zip(df_base_names['clean_base'], df_base_names['Base_Name']))

df_fuzzy['Fuzzy_Match'] = df_fuzzy['Fuzzy_Match_Clean'].map(clean_to_original)
df_final = pd.concat([
    df_name_variations[['Variation', 'Matches_With_Base_Name']],
    df_fuzzy[['Fuzzy_Match', 'Fuzzy_Score', 'Fuzzy_Flag']]
], axis=1)

print("\n--- Final Name Matching Results ---")
print(df_final.to_string(index=False))

# Accuracy Calculation

correct = (df_final['Fuzzy_Match'].str.lower().str.strip() ==
           df_final['Matches_With_Base_Name'].str.lower().str.strip()).sum()
accuracy = correct / len(df_final) * 100

print(f"\nFuzzy Matching Accuracy: {accuracy:.2f}% ({correct}/{len(df_final)})")



--- Final Name Matching Results ---
         Variation Matches_With_Base_Name       Fuzzy_Match  Fuzzy_Score Fuzzy_Flag
      Thomas  King            Thomas King       Thomas King   100.000000        yes
        ThomasKing            Thomas King       Thomas King    57.142857         no
      Maria Garcia           Maria Garcia      Maria Garcia   100.000000        yes
         MaryLewis             Mary Lewis        Mary Lewis    52.631579         no
          Nancy W.           Nancy Wright      Nancy Wright    73.684211         no
      Dani3l Scott           Daniel Scott      Daniel Scott    91.666667        yes
       JOHN  smith             John Smith        John Smith   100.000000        yes
     linda johnson          Linda Johnson     Linda Johnson   100.000000        yes
      N@ncy Wright           Nancy Wright      Nancy Wright    91.666667        yes
     William Davis          William Davis     William Davis   100.000000        yes
      Susan  Clark            Susan Cla

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
