In [51]:
!pip install fuzzywuzzy



In [52]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import fuzz, process
from sklearn.metrics.pairwise import cosine_similarity

In [53]:
# Fetch the NLTK resources used further down: 'punkt' for word_tokenize and
# 'stopwords' for stopwords.words('english').
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Loading Resolved and New Queries Dataset

In [54]:
# Load the already-resolved reference queries and the incoming query variations.
# NOTE(review): the absolute /content/ paths look Colab-specific — confirm
# before running this notebook in another environment.
resolved_queries = pd.read_csv('/content/resolved_queries.csv')
new_queries = pd.read_csv('/content/new_queries.csv')

In [55]:
# Preview the reference data: one Query_ID and its Pre_Resolved_Query text per row.
resolved_queries.head()

Unnamed: 0,Query_ID,Pre_Resolved_Query
0,1,Unable to connect to the internet
1,2,Payment failed during checkout
2,3,App crashes when opening settings
3,4,Forgot password and unable to reset
4,5,Unable to upload files to the server


In [56]:
# Preview the incoming data: each Variation_Query carries a Matches_With_Query_ID
# label (presumably the Query_ID it is expected to match — verify with the data owner).
new_queries.head()

Unnamed: 0,Variation_Query,Matches_With_Query_ID
0,Unabel to conect to the internet,1
1,Can’t connect to internet,1
2,Intenet not working,1
3,Payment failed while chekout,2
4,Payment did not go through during chckout,2


In [57]:
# Sanity-check the sizes: (rows, columns) of the resolved and the new frame.
resolved_queries.shape, new_queries.shape

((5, 2), (20, 2))

#### Preprocessing Datasets

In [58]:
# Deletes ASCII punctuation plus the curly single/double quotes that
# string.punctuation does not contain, so a typographic apostrophe is
# treated exactly like a plain ' character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + '\u2018\u2019\u201c\u201d')


def _english_stop_words():
  """Return the NLTK English stop-word set, built on first use and then reused."""
  cached = getattr(_english_stop_words, '_cache', None)
  if cached is None:
    cached = frozenset(stopwords.words('english'))
    _english_stop_words._cache = cached
  return cached


def preprocess_text(text):
  """Normalise a raw query string for matching.

  Steps: lower-case, delete punctuation (ASCII and curly quotes), tokenize
  with NLTK's word_tokenize and drop English stop words.

  Args:
    text: The raw query. None and float NaN (how pandas represents a missing
      CSV cell) are treated as an empty query; any other non-string value is
      converted with str().

  Returns:
    The remaining tokens joined by single spaces ('' when nothing is left).
  """
  if not isinstance(text, str):
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
      return ''
    text = str(text)

  text = text.lower().translate(_PUNCT_TABLE)
  stop_words = _english_stop_words()
  tokens = [word for word in word_tokenize(text) if word not in stop_words]

  return ' '.join(tokens)

In [59]:
# Add a normalised 'cleaned_queries' column to both frames, reading from the
# raw text column each frame provides.
for frame, text_column in ((resolved_queries, 'Pre_Resolved_Query'),
                           (new_queries, 'Variation_Query')):
  frame['cleaned_queries'] = frame[text_column].apply(preprocess_text)

#### Fuzzy Matching

In [60]:
def fuzzy_match(query, choices, threshold = 80):
  """Find the choice most similar to `query` using token-sort fuzzy matching.

  Args:
    query: The (pre-processed) query string to look up.
    choices: Candidate strings to compare against. May be empty or None.
    threshold: Minimum token_sort_ratio score for a match to be accepted.

  Returns:
    A (best_choice, score) tuple when the best score is >= threshold,
    otherwise (None, 0). An empty or missing candidate list also yields
    (None, 0) instead of raising.
  """
  try:
    if choices is None or len(choices) == 0:
      return None, 0
  except TypeError:
    # Unsized iterable (e.g. a generator): let extractOne consume it.
    pass

  best_match = process.extractOne(query, choices, scorer = fuzz.token_sort_ratio)

  # extractOne returns None when there was nothing to score.
  if best_match is None or best_match[1] < threshold:
    return None, 0
  return best_match[0], best_match[1]

In [61]:
# Build the candidate list once instead of re-creating it for every row.
resolved_choices = resolved_queries['cleaned_queries'].tolist()
# Each cell of 'fuzzy_match' holds the (matched_text, score) tuple returned by fuzzy_match.
new_queries['fuzzy_match'] = new_queries['cleaned_queries'].apply(lambda x: fuzzy_match(x, resolved_choices))

In [62]:
# Split the (matched_text, score) tuples into two columns. Building the columns
# from comprehensions also works when new_queries has no rows, where
# `zip(*...)` would fail to unpack into two names.
fuzzy_pairs = new_queries['fuzzy_match'].tolist()
new_queries['fuzzy_match_query'] = [pair[0] for pair in fuzzy_pairs]
new_queries['fuzzy_match_score'] = [pair[1] for pair in fuzzy_pairs]

#### TF-IDF and Cosine Similarity

In [63]:
# Vectoriser with default settings.
# NOTE(review): with word-level terms a misspelled word shares nothing with its
# correct spelling, which may explain the low scores on typo-heavy queries —
# consider character n-grams (e.g. analyzer='char_wb') and confirm on the data.
tfidf_vec = TfidfVectorizer()

In [64]:
# Stack the cleaned new queries first and the cleaned resolved queries after
# them so both are vectorised with one shared vocabulary; the later slicing of
# the TF-IDF matrix relies on this order.
combined_queries = pd.concat([new_queries['cleaned_queries'], resolved_queries['cleaned_queries']])

In [65]:
# Fit the vectoriser on the combined corpus and transform every query in one
# pass; row order matches combined_queries.
tfidf_combined = tfidf_vec.fit_transform(combined_queries)

In [66]:
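# Superseded first attempt, kept for reference: combined_queries lists the new
# queries first, so taking the resolved rows from len(resolved_queries) onward
# selected the wrong rows. The next cell holds the corrected slicing.
# resolved_tfidf_matrix = tfidf_combined[len(resolved_queries):]
# new_tfidf_matrix = tfidf_combined[:len(new_queries)]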

In [68]:
# combined_queries was built as [new queries, resolved queries], so cut the
# TF-IDF rows back apart at the boundary between the two groups.
n_new = len(new_queries)
n_resolved = len(resolved_queries)
new_tfidf_matrix = tfidf_combined[:n_new]  # rows of the new (unresolved) queries
resolved_tfidf_matrix = tfidf_combined[n_new:n_new + n_resolved]  # rows of the resolved queries

In [69]:
# Pairwise cosine similarity: one row per new query, one column per resolved query.
cosine_sim = cosine_similarity(new_tfidf_matrix, resolved_tfidf_matrix)

In [70]:
# For every new query (row), the position of the resolved query with the
# highest similarity. argmax returns the first position on ties, so a row of
# all zeros still reports position 0 as its "best" match.
best_matches = cosine_sim.argmax(axis = 1)

In [71]:
# Record, per new query: the row position of the best resolved query, that
# query's original text, and the similarity score it achieved.
best_resolved_text = resolved_queries['Pre_Resolved_Query'].iloc[best_matches].to_numpy()
best_scores = np.max(cosine_sim, axis=1)

new_queries['tfidf_best_match'] = best_matches
new_queries['tfidf_best_match_query'] = best_resolved_text
new_queries['tfidf_best_match_score'] = best_scores

#### Fuzzy Matching and TF-IDF + Cosine Similarity Results

In [74]:
# fuzzy_match_query shows the *cleaned* resolved text (the choices handed to
# fuzzy_match); an empty match with score 0 means no candidate reached the threshold.
new_queries[['Variation_Query', 'fuzzy_match_query', 'fuzzy_match_score']].head()

Unnamed: 0,Variation_Query,fuzzy_match_query,fuzzy_match_score
0,Unabel to conect to the internet,unable connect internet,93
1,Can’t connect to internet,unable connect internet,82
2,Intenet not working,,0
3,Payment failed while chekout,payment failed checkout,98
4,Payment did not go through during chckout,,0


In [73]:
# tfidf_best_match_query shows the original resolved text. No threshold is
# applied on this path, so a match is listed even when the score is 0.0.
new_queries[['Variation_Query', 'tfidf_best_match_query', 'tfidf_best_match_score']].head()

Unnamed: 0,Variation_Query,tfidf_best_match_query,tfidf_best_match_score
0,Unabel to conect to the internet,Unable to connect to the internet,0.25641
1,Can’t connect to internet,Unable to connect to the internet,0.858168
2,Intenet not working,Unable to connect to the internet,0.0
3,Payment failed while chekout,Payment failed during checkout,0.558907
4,Payment did not go through during chckout,Payment failed during checkout,0.219802
