# Step 1: Data preprocessing and importing (Getting started)

In [23]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

we'll start off by the following steps:
- importing necessary libraries
- The data is messy, contain a lot of typos, unnecessary words, punctuations, etc, so we'll clean it using regex

In [24]:
!pip install rapidfuzz



In [25]:
import pandas as pd
import numpy as np
#nlp toolkit library helps us in giving us a list of stopwords that we use in cleaning data.
import nltk
from nltk.corpus import stopwords
import re
import random
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import fuzz
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pickle
from sklearn.metrics import classification_report
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

In [26]:
df = pd.read_excel('/kaggle/input/product-matching-dataset/Product Matching Dataset.xlsx', sheet_name='Dataset')
df_master_file = pd.read_excel('/kaggle/input/product-matching-dataset/Product Matching Dataset.xlsx', sheet_name='Master File')
df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5


In [27]:
nltk.download('stopwords')
stop_words = stopwords.words('arabic')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
stop_words.append('سج')
stop_words.append('سق')
stop_words.append('سعر')
stop_words.append('جديد')
stop_words.append('قديم')
stop_words.append('هام')
stop_words.append('العامريه')
stop_words.append('ادويا')
stop_words.append('ادويه')
stop_words.append('ركز')

In [29]:
def data_preprocessing(text):
  text = text.replace("أ", "ا")
  text = text.replace("آ", "ا")
  text = text.replace("إ", "ا")
  text = text.replace("ؤ", "و")
  text = text.replace("ئ", "ي")
  text = text.replace("ى", "ي")
  text = text.replace("ة", "ه")

  text = re.sub(r'[^\w\s]', ' ', text)
  text = re.sub(r'(\d*|\s*)جنيه$', ' ', text)
  text = re.sub(r'س\d*ج', " ", text)
  text = re.sub(r'س\d*ق', " ", text)
  text = re.sub(r"(\w)\1+", r'\1', text)
  text = re.sub(r"[ء-ي]*(ركيرل|ركريل)[ء-ي]*", " ", text)
  text = re.sub(r"\d+$", " ", text)
  text = re.sub(r"(\s)\1+", r"\1", text)
  text = text.strip()

  text = ' '.join([word for word in text.split() if word not in stop_words])
  return text



In [30]:
df['marketplace_product_name_ar'] = df['marketplace_product_name_ar'].apply(data_preprocessing)
df['seller_item_name'] = df['seller_item_name'].apply(data_preprocessing)
df

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14,56.5
...,...,...,...,...
83557,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35,39.0
83558,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35 مل اقراص,39.0
83559,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35 مجم 30 قرص,39.0
83560,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35 مجم ام ار اقراص,39.0


In [31]:
df_unique = df.drop_duplicates()
df_unique

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14,56.5
7,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 1 شريط 14 كبسوله,56.5
8,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم,56.5
9,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 كبسول,56.5
...,...,...,...,...
83554,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35مج 30قرص,39.0
83555,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35م,39.0
83556,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35ام ار 30قرص,39.0
83558,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35 مل اقراص,39.0


In [32]:
df_unique['label'] = 1
df_unique

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['label'] = 1


Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price,label
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5,1
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14,56.5,1
7,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 1 شريط 14 كبسوله,56.5,1
8,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم,56.5,1
9,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 كبسول,56.5,1
...,...,...,...,...,...
83554,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35مج 30قرص,39.0,1
83555,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35م,39.0,1
83556,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35ام ار 30قرص,39.0,1
83558,1603,سبانيلا ام ار 35 مجم 30 قرص,سبانيلا 35 مل اقراص,39.0,1


we'll put label 1 for the correctly matched drugs, and 0 for the unmatched, the data doesn't contain unmatched drugs, so we'll generate 5 unmatched examples for each drug.
we'll do this so the model can learn correctly the examples that indicate unmatch, and the examples that indicate a match.

In [33]:
def generate_negative_pairs(master, DF):
    #iterate over dataset df
    #for each sku (medicine), generate negative pairs for it (negative matches)
    arr = defaultdict(list)
    for idx in DF['sku']:
        #generate for example 5 negative matches, being put after the positive pairs,
        #the negative pairs can be (different medicine from master file: different sku, different sku but has high similarity)
        i = 0
        while i < 5:
            choice = random.choice(DF['sku'].unique())
            while choice == idx:
                choice = random.choice(DF['sku'].unique())
            drug_b = random.choice(DF.loc[DF['sku']==choice, 'seller_item_name'].to_list())
            arr[idx].append(drug_b)
            i += 1
    return arr

neg_pairs = generate_negative_pairs(df_master_file, df_unique)
len(neg_pairs)

500

In [34]:
new_rows = []

# Iterate over negative_pairs to create new rows
for sku, negatives in neg_pairs.items():
    # Get the correct marketplace product name for this SKU
    correct_name = df.loc[df['sku'] == sku, 'marketplace_product_name_ar'].values[0]

    # Create new rows for each negative pair
    for negative_drug in negatives:
        new_rows.append({'sku': sku, 
                         'marketplace_product_name_ar': correct_name, 
                         'seller_item_name': negative_drug, 
                         'label': 0})

# Convert new_rows into a DataFrame
negative_df = pd.DataFrame(new_rows)
negative_df

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,label
0,1322,استوهالت 40 مجم 14 كبسول,ديلاترول 25م,0
1,1322,استوهالت 40 مجم 14 كبسول,تكلو,0
2,1322,استوهالت 40 مجم 14 كبسول,OSTEOCARE SYRUP 1 120 ML,0
3,1322,استوهالت 40 مجم 14 كبسول,اماريل ام 2 50 مجم 30 قرص,0
4,1322,استوهالت 40 مجم 14 كبسول,ايمباكوزا 10 مج,0
...,...,...,...,...
139670,1603,سبانيلا ام ار 35 مجم 30 قرص,فليكس بخاخ لانف 16 مجم,0
139671,1603,سبانيلا ام ار 35 مجم 30 قرص,كاندلكان 4مجم 14قرص 2شريط,0
139672,1603,سبانيلا ام ار 35 مجم 30 قرص,دولفن 12 5 لبوس شريطين,0
139673,1603,سبانيلا ام ار 35 مجم 30 قرص,بانتولوك 20 مجم 14,0


In [35]:
final_df = pd.concat([df_unique, negative_df], ignore_index=True)
final_df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price,label
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5,1
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14,56.5,1
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 1 شريط 14 كبسوله,56.5,1
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم,56.5,1
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 كبسول,56.5,1
...,...,...,...,...,...
167605,1603,سبانيلا ام ار 35 مجم 30 قرص,فليكس بخاخ لانف 16 مجم,,0
167606,1603,سبانيلا ام ار 35 مجم 30 قرص,كاندلكان 4مجم 14قرص 2شريط,,0
167607,1603,سبانيلا ام ار 35 مجم 30 قرص,دولفن 12 5 لبوس شريطين,,0
167608,1603,سبانيلا ام ار 35 مجم 30 قرص,بانتولوك 20 مجم 14,,0


# Step 2: feature extraction and model training
we'll do the following in this step:
- convert the text data into numerical data using tf-idf vectorizer
- create the similarity features that we'll pass to the model as training
- splitting the data and train and evaluate the model

In [36]:
# TF-IDF vectorizer
tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
all_names = pd.concat([final_df['marketplace_product_name_ar'], final_df['seller_item_name']])
tfidf.fit(all_names)

def compute_features(row):
    name1, name2 = row['marketplace_product_name_ar'], row['seller_item_name']
    vec1 = tfidf.transform([name1])
    vec2 = tfidf.transform([name2])
    
    cosine_sim = cosine_similarity(vec1, vec2)[0][0]  
    levenshtein = fuzz.ratio(name1, name2) / 100  
    jaro_winkler = fuzz.WRatio(name1, name2) / 100

    return pd.Series([cosine_sim, levenshtein, jaro_winkler])

final_df[['cosine_sim', 'levenshtein', 'jaro_winkler']] = final_df.apply(compute_features, axis=1)
final_df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price,label,cosine_sim,levenshtein,jaro_winkler
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5,1,0.075436,0.355556,0.365385
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14,56.5,1,0.931288,0.857143,0.950000
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 1 شريط 14 كبسوله,56.5,1,0.865271,0.857143,0.857143
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم,56.5,1,0.896021,0.769231,0.900000
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 كبسول,56.5,1,0.954837,0.829268,0.950000
...,...,...,...,...,...,...,...,...
167605,1603,سبانيلا ام ار 35 مجم 30 قرص,فليكس بخاخ لانف 16 مجم,,0,0.042908,0.408163,0.408163
167606,1603,سبانيلا ام ار 35 مجم 30 قرص,كاندلكان 4مجم 14قرص 2شريط,,0,0.045227,0.461538,0.461538
167607,1603,سبانيلا ام ار 35 مجم 30 قرص,دولفن 12 5 لبوس شريطين,,0,0.006187,0.285714,0.285714
167608,1603,سبانيلا ام ار 35 مجم 30 قرص,بانتولوك 20 مجم 14,,0,0.081792,0.444444,0.855000


In [37]:
X = final_df[['cosine_sim', 'levenshtein', 'jaro_winkler']]
y = final_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

In [38]:
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     35130
           1       0.98      0.95      0.96      6773

    accuracy                           0.99     41903
   macro avg       0.98      0.97      0.98     41903
weighted avg       0.99      0.99      0.99     41903



the drugs that has multiple variations (multiple concentrations), we'll consider the output of any of them will be Not sure, so a person could manually check and review them, beceause the model might easily predict a wrong variation of the drug because of how similar and close they are.

In [39]:
medicine_frequency = defaultdict(int)
for full_drug in df_master_file['product_name']:
    drug = re.split(r'\d+', full_drug)
    medicine_frequency[drug[0]]+=1
    
for full_drug in df_master_file['product_name_ar']:
    drug = re.split(r'\d+', full_drug)
    medicine_frequency[drug[0]]+=1

# medicine_frequency

In [41]:
def find_best_match(input_name, known_names, model, tfidf):
    similarities = []
    features_list = []
    if 97 <= ord(input_name[0].lower()) <= 122:
        for known_name in known_names['product_name']:
            vec1 = tfidf.transform([input_name])
            vec2 = tfidf.transform([known_name])
    
            cosine_sim = cosine_similarity(vec1, vec2)[0][0]
            levenshtein = fuzz.ratio(input_name, known_name) / 100
            jaro_winkler = fuzz.WRatio(input_name, known_name) / 100
            
            features = np.array([[cosine_sim, levenshtein, jaro_winkler]])
            prob = model.predict_proba(features)[0][1]  # Match probability
            similarities.append((known_name, prob))
            
    else:
        for known_name in known_names['product_name_ar']:
            vec1 = tfidf.transform([input_name])
            vec2 = tfidf.transform([known_name])
            
            cosine_sim = cosine_similarity(vec1, vec2)[0][0]
            levenshtein = fuzz.ratio(input_name, known_name) / 100
            jaro_winkler = fuzz.WRatio(input_name, known_name) / 100
        
            features = np.array([[cosine_sim, levenshtein, jaro_winkler]])
            prob = model.predict_proba(features)[0][1]  # Match probability
            similarities.append((known_name, prob))
    # Get the best match
    best_match = max(similarities, key=lambda x: x[1])
    if (best_match[1]) < 0.95 or medicine_frequency[re.split(r'\d+', best_match[0])[0]] > 1:
        return (best_match, 'Not sure')
    return (best_match, 'sure')

find_best_match('esthl3t MG', df_master_file, xgb_model, tfidf)


(('ESTOHALT 40 MG 14 CAP', 0.99923146), 'sure')

the "find_best_match()" function is used for finding the best match for a single medicine.

but what if we want to pass in a sheet of medicines and get the best match for every one of them?
the problem of this is that it'll take time if the sheet is long, so what can we do?
Here comes the power of "parallel programming", we'll use ThreadPools to find the best match for a number of medicines (4 for example) at the same time.

In [42]:
test_df = pd.read_excel('/kaggle/input/product-matching-dataset/Product Matching Dataset.xlsx', sheet_name='Dataset')
test_df = test_df.sample(frac=1).reset_index(drop=True)
lst = test_df.loc[:100, 'seller_item_name']
lst

0                      فلدين 6 امبول
1                 ميلجا اقراص 4 شريط
2             بلافكس اقراص سعر 236  
3                       ابيمول اقراص
4                     دايت سويت 100ق
                   ...              
96               سى فيت نقط س جديد *
97     انافرانيل75اس ار2شريط سعرجديد
98             سولو فريش قطرة س.ج 36
99              اوميز20 مجم 14 ك/س ج
100               ريمو واكس      نقط
Name: seller_item_name, Length: 101, dtype: object

In [None]:
def find_best_matches(medicines, master, model, tfidf):
    results = []
    for medicine in medicines:
        match = (medicine, find_best_match(medicine, master, model, tfidf))
        print(match)
        results.append(match)
    return results

def find_best_matches_parallel(full_medicines, master, model, tfidf, n_jobs=4):
    chunks = np.array_split(full_medicines, n_jobs)
    
    with ThreadPoolExecutor(max_workers=n_jobs) as executor:
        try:
            results = list(executor.map(lambda chunk: find_best_matches(chunk, master, model, tfidf), chunks))
        except KeyboardInterrupt:
            print("Interrupted! Shutting down threads...")
            executor.shutdown(wait=False, cancel_futures=True)
        return [item for sublist in results for item in sublist]

results = find_best_matches_parallel(lst, df_master_file, xgb_model, tfidf)
results

# Step 3: Model Deployment
- we'll build a class that contains all the steps we did, just for the case of retraining on a new dataset or anything.
- we also saved the class object, which contains the model and other important attributes like: tfidf, medicine_frequency dictionary, which both of them are used in the find_best_match function.

In [9]:
class MedicineMatcher():
    def __init__(self, model, tfidf, data_for_training, master_file, medicine_frequency):
        self.model = model
        self.tfidf = tfidf
        self.data = data_for_training
        self.master = master_file
        self.m_freq = medicine_frequency

    def __generate_stop_words(self):
        nltk.download('stopwords')
        stop_words = stopwords.words('arabic')
        stop_words.append('سج')
        stop_words.append('سق')
        stop_words.append('سعر')
        stop_words.append('جديد')
        stop_words.append('قديم')
        stop_words.append('هام')
        stop_words.append('العامريه')
        stop_words.append('ادويا')
        stop_words.append('ادويه')
        stop_words.append('ركز')
        return stop_words

    def __data_cleaning(self, text):
        text = text.replace("أ", "ا")
        text = text.replace("آ", "ا")
        text = text.replace("إ", "ا")
        text = text.replace("ؤ", "و")
        text = text.replace("ئ", "ي")
        text = text.replace("ى", "ي")
        text = text.replace("ة", "ه")
        
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'(\d*|\s*)جنيه$', ' ', text)
        text = re.sub(r'س\d*ج', " ", text)
        text = re.sub(r'س\d*ق', " ", text)
        text = re.sub(r"(\w)\1+", r'\1', text)
        text = re.sub(r"[ء-ي]*(ركيرل|ركريل)[ء-ي]*", " ", text)
        text = re.sub(r"\d+$", " ", text)
        text = re.sub(r"(\s)\1+", r"\1", text)
        text = text.strip()

        text = ' '.join([word for word in text.split() if word not in stop_words])
        return text


    def __generate_negative_pairs(self, DF):
        arr = defaultdict(list)
        for idx in DF['sku']:
            #generate for example 5 negative matches, being put after the positive pairs,
            #the negative pairs can be (different medicine from master file: different sku, different sku but has high similarity)
            i = 0
            while i < 5:
                choice = random.choice(DF['sku'].unique())
                while choice == idx:
                    choice = random.choice(DF['sku'].unique())
                drug_b = random.choice(DF.loc[DF['sku']==choice, 'seller_item_name'].to_list())
                arr[idx].append(drug_b)
                i += 1
                
        new_rows = []

        # Iterate over negative_pairs to create new rows
        for sku, negatives in arr.items():
            # Get the correct marketplace product name for this SKU
            correct_name = self.data.loc[self.data['sku'] == sku, 'marketplace_product_name_ar'].values[0]
        
            # Create new rows for each negative pair
            for negative_drug in negatives:
                new_rows.append({'sku': sku, 
                                 'marketplace_product_name_ar': correct_name, 
                                 'seller_item_name': negative_drug, 
                                 'label': 0})
        
        # Convert new_rows into a DataFrame
        negative_df = pd.DataFrame(new_rows)
        return negative_df

    def data_preprocessing(self):
        stop_words = self.__generate_stop_words()
        self.data['marketplace_product_name_ar'] = self.data['marketplace_product_name_ar'].apply(self.__data_cleaning)
        self.data['seller_item_name'] = self.data['seller_item_name'].apply(self.__data_cleaning)
        df_unique = self.data.drop_duplicates()
        df_unique['label'] = 1
        negative_df = self.__generate_negative_pairs(df_unique)
        self.data = pd.concat([df_unique, negative_df], ignore_index=True)



    def __compute_similarities(self, row):
        name1, name2 = row['marketplace_product_name_ar'], row['seller_item_name']
        vec1 = self.tfidf.transform([name1])
        vec2 = self.tfidf.transform([name2])

        cosine_sim = cosine_similarity(vec1, vec2)[0][0]
        levenshtein = fuzz.ratio(name1, name2) / 100
        jaro_winkler = fuzz.WRatio(name1, name2) / 100

        return pd.Series([cosine_sim, levenshtein, jaro_winkler])
        
    def __medicineFrequency(self):
        for full_drug in self.master['product_name']:
            drug = re.split(r'\d+', full_drug)
            self.m_freq[drug[0]] += 1

        for full_drug in self.master['product_name_ar']:
            drug = re.split(r'\d+', full_drug)
            self.m_freq[drug[0]] += 1

    def Fit(self):
        # data preparation step
        self.tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
        all_names = pd.concat([self.data['marketplace_product_name_ar'], self.data['seller_item_name']])
        self.tfidf.fit(all_names)
        self.data[['cosine_sim', 'levenshtein', 'jaro_winkler']] = self.data.apply(self.__compute_similarities, axis=1)
        self.__medicineFrequency()
        
        #training
        X = self.data[['cosine_sim', 'levenshtein', 'jaro_winkler']]
        y = self.data['label']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        self.model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        self.model.fit(X_train, y_train)

    def evaluate(self):
        y_pred = xgb_model.predict(X_test)
        print(classification_report(y_test, y_pred))

    def find_best_match(self, input_name):
        """
        input_name: medicine name to be matched
        """
        similarities = []
        features_list = []
        if 97 <= ord(input_name[0].lower()) <= 122:
            for known_name in self.master['product_name']:
                vec1 = self.tfidf.transform([input_name])
                vec2 = self.tfidf.transform([known_name])
        
                cosine_sim = cosine_similarity(vec1, vec2)[0][0]
                levenshtein = fuzz.ratio(input_name, known_name) / 100
                jaro_winkler = fuzz.WRatio(input_name, known_name) / 100
                
                features = np.array([[cosine_sim, levenshtein, jaro_winkler]])
                prob = self.model.predict_proba(features)[0][1]  # Match probability
                similarities.append((known_name, prob))
                
        else:
            for known_name in self.master['product_name_ar']:
                vec1 = self.tfidf.transform([input_name])
                vec2 = self.tfidf.transform([known_name])
                
                cosine_sim = cosine_similarity(vec1, vec2)[0][0]
                levenshtein = fuzz.ratio(input_name, known_name) / 100
                jaro_winkler = fuzz.WRatio(input_name, known_name) / 100
            
                features = np.array([[cosine_sim, levenshtein, jaro_winkler]])
                prob = self.model.predict_proba(features)[0][1]  # Match probability
                similarities.append((known_name, prob))
        # Get the best match
        best_match = max(similarities, key=lambda x: x[1])
        if (best_match[1]) < 0.95 or self.m_freq[re.split(r'\d+', best_match[0])[0]] > 1:
            return (best_match, 'Not sure')
        return (best_match, 'sure')

    
    def __find_best_matches(self, medicines):
        results = []
        for medicine in medicines:
            match = (medicine, self.find_best_match(medicine))
            print(match)
            results.append(match)
        return results
    
    def find_best_matches_parallel(self, full_medicines, n_jobs=4):
        """
        full_medicines: a list of medicine names that need to be matched
        n_jobs: number of jobs to be done in parallel using ThreadPooling, used in splitting
        the list into chunks so each core will handle a chunk
        """
        chunks = np.array_split(full_medicines, n_jobs)
        
        with ThreadPoolExecutor(max_workers=n_jobs) as executor:
            try:
                results = list(executor.map(lambda chunk: self.__find_best_matches(chunk), chunks))
            except KeyboardInterrupt:
                print("Interrupted! Shutting down threads...")
                executor.shutdown(wait=False, cancel_futures=True)
            return [item for sublist in results for item in sublist]


In [43]:
medicine_matcher = MedicineMatcher(xgb_model, tfidf, final_df, df_master_file, medicine_frequency)

In [44]:
medicine_matcher.find_best_match('brofen')

(('BRUFEN 400 MG 30 TAB', 0.9966131), 'Not sure')

In [46]:
# save the current object
with open("medicine_matcher.pkl", "wb") as f:
    pickle.dump(medicine_matcher, f)

In [47]:
# Load the saved class
with open("medicine_matcher.pkl", "rb") as f:
    matcher = pickle.load(f)

In [48]:
matcher.find_best_match('estoh3lt')

(('ESTOHALT 40 MG 14 CAP', 0.9995497), 'sure')