# Step 1: Data preprocessing and importing (Getting started)

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

we'll start off by the following steps:
- importing necessary libraries
- The data is messy, contain a lot of typos, unnecessary words, punctuations, etc, so we'll clean it using regex

In [3]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [4]:
import pandas as pd
import numpy as np
#nlp toolkit library helps us in giving us a list of stopwords that we use in cleaning data.
import nltk
from nltk.corpus import stopwords
import re
import random
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import fuzz
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pickle
from sklearn.metrics import classification_report
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

In [3]:
df = pd.read_excel('/kaggle/input/product-matching-dataset/Product Matching Dataset.xlsx', sheet_name='Dataset')
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,2599,انتوكس 30 قرص,انتوكس 30قرص س ج,60.0
1,645,جابتن 100 مجم 30 كبسولة,GAPTIN CAPS 100 MG 30,81.0
2,775,دوكسيرازول 60 مجم 14 كبسول,دوكسيرازول 60مجم,72.75
3,1734,ابيكسيدون 3 مجم 30 قرص,ابيكسيدون 3مجم اقراص**,126.0
4,4743,ميكروسيرك 16 مجم 20 قرص,ميكروسيرك 16 مجم اقراص,29.0


In [4]:
nltk.download('stopwords')
stop_words = stopwords.words('arabic')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
stop_words.append('سج')
stop_words.append('سق')
stop_words.append('سعر')
stop_words.append('جديد')
stop_words.append('قديم')
stop_words.append('هام')
stop_words.append('العامريه')
stop_words.append('ادويا')
stop_words.append('ادويه')
stop_words.append('ركز')

In [6]:
def data_preprocessing(text):
  text = text.replace("أ", "ا")
  text = text.replace("آ", "ا")
  text = text.replace("إ", "ا")
  text = text.replace("ؤ", "و")
  text = text.replace("ئ", "ي")
  text = text.replace("ى", "ي")
  text = text.replace("ة", "ه")

  text = re.sub(r'[^\w\s]', ' ', text)
  text = re.sub(r'(\d*|\s*)جنيه$', ' ', text)
  text = re.sub(r'س\d*ج', " ", text)
  text = re.sub(r'س\d*ق', " ", text)
  text = re.sub(r"(\w)\1+", r'\1', text)
  text = re.sub(r"[ء-ي]*(ركيرل|ركريل)[ء-ي]*", " ", text)
  text = re.sub(r"\d+$", " ", text)
  text = re.sub(r"(\s)\1+", r"\1", text)
  text = text.strip()

  text = ' '.join([word for word in text.split() if word not in stop_words])
  return text



In [7]:
df['marketplace_product_name_ar'] = df['marketplace_product_name_ar'].apply(data_preprocessing)
df['seller_item_name'] = df['seller_item_name'].apply(data_preprocessing)
df

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,2599,انتوكس 30 قرص,انتوكس 30قرص,60.00
1,645,جابتن 10 مجم 30 كبسوله,GAPTIN CAPS 10 MG,81.00
2,775,دوكسيرازول 60 مجم 14 كبسول,دوكسيرازول 60مجم,72.75
3,1734,ابيكسيدون 3 مجم 30 قرص,ابيكسيدون 3مجم اقراص,126.00
4,4743,ميكروسيرك 16 مجم 20 قرص,ميكروسيرك 16 مجم اقراص,29.00
...,...,...,...,...
83557,4188,ويستير 4 مجم 14 كيس,ويست اير اكياس,70.00
83558,291,بيبرا 20 مجم 14 قرص,بيبرا 20 مج 14 قرص,103.00
83559,5159,نافوبروكسين 50 مجم 5 اقماع,نافوبروكسين لبوس,10.00
83560,4957,توبمود 50 مجم 30 كبسوله,توب مود 50مجم 30كبسوله,17.50


In [8]:
df_unique = df.drop_duplicates()
df_unique

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,2599,انتوكس 30 قرص,انتوكس 30قرص,60.00
1,645,جابتن 10 مجم 30 كبسوله,GAPTIN CAPS 10 MG,81.00
2,775,دوكسيرازول 60 مجم 14 كبسول,دوكسيرازول 60مجم,72.75
3,1734,ابيكسيدون 3 مجم 30 قرص,ابيكسيدون 3مجم اقراص,126.00
4,4743,ميكروسيرك 16 مجم 20 قرص,ميكروسيرك 16 مجم اقراص,29.00
...,...,...,...,...
83531,642,فاستافلام 50 مجم 20 قرص,فاستافلام اقراص 36,36.00
83533,481,جليبتس بلس 50 850 مجم 30 قرص,جليبتس بلس850 50مجم اقراص120ج,192.00
83552,4957,توبمود 50 مجم 30 كبسوله,توب مود اقراص50مجم,17.50
83554,1051,اوندالينز 4 مجم 5 فيلم سريع الذوبان بالفم,اون دالينز 4جم لزقه,160.00


In [9]:
df_unique['label'] = 1
df_unique

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['label'] = 1


Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price,label
0,2599,انتوكس 30 قرص,انتوكس 30قرص,60.00,1
1,645,جابتن 10 مجم 30 كبسوله,GAPTIN CAPS 10 MG,81.00,1
2,775,دوكسيرازول 60 مجم 14 كبسول,دوكسيرازول 60مجم,72.75,1
3,1734,ابيكسيدون 3 مجم 30 قرص,ابيكسيدون 3مجم اقراص,126.00,1
4,4743,ميكروسيرك 16 مجم 20 قرص,ميكروسيرك 16 مجم اقراص,29.00,1
...,...,...,...,...,...
83531,642,فاستافلام 50 مجم 20 قرص,فاستافلام اقراص 36,36.00,1
83533,481,جليبتس بلس 50 850 مجم 30 قرص,جليبتس بلس850 50مجم اقراص120ج,192.00,1
83552,4957,توبمود 50 مجم 30 كبسوله,توب مود اقراص50مجم,17.50,1
83554,1051,اوندالينز 4 مجم 5 فيلم سريع الذوبان بالفم,اون دالينز 4جم لزقه,160.00,1


we'll put label 1 for the correctly matched drugs, and 0 for the unmatched, the data doesn't contain unmatched drugs, so we'll generate 5 unmatched examples for each drug.
we'll do this so the model can learn correctly the examples that indicate unmatch, and the examples that indicate a match.

In [11]:
df_master_file = pd.read_excel('/kaggle/input/product-matching-dataset/Product Matching Dataset.xlsx', sheet_name='Master File')
df_master_file.head()

Unnamed: 0,sku,product_name,product_name_ar,price
0,279,ANAFRONIL 75 MG 20 TAB,انافرونيل 75 مجم اس ار 20 قرص,75.0
1,2282,LOPRECOUGH SYRUP 100 ML,لوبريكاف شراب 100 مل,28.5
2,4331,TOMEX PLUS 50 TAB,تومكس بلس 50 قرص,60.0
3,1022,TAROLIMUS 0.03% OINT. 15 GM,تاروليمس 0.03 % مرهم 15 جم,129.0
4,116,GLIPTUS PLUS 50/1000 MG 30 TAB,جليبتس بلس 50/1000 مجم 30 قرص,192.0


In [12]:
def generate_negative_pairs(master, DF):
    #iterate over dataset df
    #for each sku (medicine), generate negative pairs for it (negative matches)
    arr = defaultdict(list)
    for idx in DF['sku']:
        #generate for example 5 negative matches, being put after the positive pairs,
        #the negative pairs can be (different medicine from master file: different sku, different sku but has high similarity)
        i = 0
        while i < 5:
            choice = random.choice(DF['sku'].unique())
            while choice == idx:
                choice = random.choice(DF['sku'].unique())
            drug_b = random.choice(DF.loc[DF['sku']==choice, 'seller_item_name'].to_list())
            arr[idx].append(drug_b)
            i += 1
    return arr

neg_pairs = generate_negative_pairs(df_master_file, df_unique)

In [13]:
new_rows = []

# Iterate over negative_pairs to create new rows
for sku, negatives in neg_pairs.items():
    # Get the correct marketplace product name for this SKU
    correct_name = df.loc[df['sku'] == sku, 'marketplace_product_name_ar'].values[0]

    # Create new rows for each negative pair
    for negative_drug in negatives:
        new_rows.append({'sku': sku, 
                         'marketplace_product_name_ar': correct_name, 
                         'seller_item_name': negative_drug, 
                         'label': 0})

# Convert new_rows into a DataFrame
negative_df = pd.DataFrame(new_rows)
negative_df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,label
0,2599,انتوكس 30 قرص,نيفيلوب 2 5 مجم 14قرص,0
1,2599,انتوكس 30 قرص,كالوماك مس,0
2,2599,انتوكس 30 قرص,ميوفين 3 شريط سعرجديد,0
3,2599,انتوكس 30 قرص,اميجراويست2 5مجم,0
4,2599,انتوكس 30 قرص,ايسوبتن 240مجم ريتارد 3شريط,0


In [15]:
final_df = pd.concat([df_unique, negative_df], ignore_index=True)
final_df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price,label
0,2599,انتوكس 30 قرص,انتوكس 30قرص,60.00,1
1,645,جابتن 10 مجم 30 كبسوله,GAPTIN CAPS 10 MG,81.00,1
2,775,دوكسيرازول 60 مجم 14 كبسول,دوكسيرازول 60مجم,72.75,1
3,1734,ابيكسيدون 3 مجم 30 قرص,ابيكسيدون 3مجم اقراص,126.00,1
4,4743,ميكروسيرك 16 مجم 20 قرص,ميكروسيرك 16 مجم اقراص,29.00,1
...,...,...,...,...,...
167605,46,كتافاست 50 مجم 9 كيس,انافرانيل 25 اقراص3شريط نوفارتس,,0
167606,46,كتافاست 50 مجم 9 كيس,ايراستابكس40مجم 3شريط,,0
167607,46,كتافاست 50 مجم 9 كيس,فاكتو مرهم 30جرام,,0
167608,46,كتافاست 50 مجم 9 كيس,فلدين 6 امبول احترس,,0


# Step 2: feature extraction and model training
we'll do the following in this step:
- convert the text data into numerical data using tf-idf vectorizer
- create the similarity features that we'll pass to the model as training
- splitting the data and train and evaluate the model

In [16]:
# TF-IDF vectorizer
tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
all_names = pd.concat([final_df['marketplace_product_name_ar'], final_df['seller_item_name']])
tfidf.fit(all_names)

def compute_features(row):
    name1, name2 = row['marketplace_product_name_ar'], row['seller_item_name']
    vec1 = tfidf.transform([name1])
    vec2 = tfidf.transform([name2])
    
    cosine_sim = cosine_similarity(vec1, vec2)[0][0]  
    levenshtein = fuzz.ratio(name1, name2) / 100  
    jaro_winkler = fuzz.WRatio(name1, name2) / 100

    return pd.Series([cosine_sim, levenshtein, jaro_winkler])

final_df[['cosine_sim', 'levenshtein', 'jaro_winkler']] = final_df.apply(compute_features, axis=1)
final_df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price,label,cosine_sim,levenshtein,jaro_winkler
0,2599,انتوكس 30 قرص,انتوكس 30قرص,60.0,1,0.846046,0.96,0.96
1,645,جابتن 10 مجم 30 كبسوله,GAPTIN CAPS 10 MG,81.0,1,0.032208,0.205128,0.24359
2,775,دوكسيرازول 60 مجم 14 كبسول,دوكسيرازول 60مجم,72.75,1,0.789758,0.761905,0.855
3,1734,ابيكسيدون 3 مجم 30 قرص,ابيكسيدون 3مجم اقراص,126.0,1,0.727418,0.857143,0.857143
4,4743,ميكروسيرك 16 مجم 20 قرص,ميكروسيرك 16 مجم اقراص,29.0,1,0.90757,0.888889,0.888889


In [17]:
X = final_df[['cosine_sim', 'levenshtein', 'jaro_winkler']]
y = final_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

In [18]:
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     35130
           1       0.97      0.94      0.96      6773

    accuracy                           0.99     41903
   macro avg       0.98      0.97      0.98     41903
weighted avg       0.99      0.99      0.99     41903



the drugs that has multiple variations (multiple concentrations), we'll consider the output of any of them will be Not sure, so a person could manually check and review them, beceause the model might easily predict a wrong variation of the drug because of how similar and close they are.

In [19]:
medicine_frequency = defaultdict(int)
for full_drug in df_master_file['product_name']:
    drug = re.split(r'\d+', full_drug)
    medicine_frequency[drug[0]]+=1
    
for full_drug in df_master_file['product_name_ar']:
    drug = re.split(r'\d+', full_drug)
    medicine_frequency[drug[0]]+=1

# medicine_frequency

In [20]:
def find_best_match(input_name, known_names, model, tfidf):
    similarities = []
    features_list = []
    if 97 <= ord(input_name[0].lower()) <= 122:
        for known_name in known_names['product_name']:
            vec1 = tfidf.transform([input_name])
            vec2 = tfidf.transform([known_name])
    
            cosine_sim = cosine_similarity(vec1, vec2)[0][0]
            levenshtein = fuzz.ratio(input_name, known_name) / 100
            jaro_winkler = fuzz.WRatio(input_name, known_name) / 100
            
            features = np.array([[cosine_sim, levenshtein, jaro_winkler]])
            prob = model.predict_proba(features)[0][1]  # Match probability
            similarities.append((known_name, prob))
            
    else:
        for known_name in known_names['product_name_ar']:
            vec1 = tfidf.transform([input_name])
            vec2 = tfidf.transform([known_name])
            
            cosine_sim = cosine_similarity(vec1, vec2)[0][0]
            levenshtein = fuzz.ratio(input_name, known_name) / 100
            jaro_winkler = fuzz.WRatio(input_name, known_name) / 100
        
            features = np.array([[cosine_sim, levenshtein, jaro_winkler]])
            prob = model.predict_proba(features)[0][1]  # Match probability
            similarities.append((known_name, prob))
    # Get the best match
    best_match = max(similarities, key=lambda x: x[1])
    if (best_match[1]) < 0.95 or medicine_frequency[re.split(r'\d+', best_match[0])[0]] > 1:
        return (best_match, 'Not sure')
    return (best_match, 'sure')

find_best_match('esthl3t MG', df_master_file, xgb_model, tfidf)


(('ESTOHALT 40 MG 14 CAP', 0.9903679), 'sure')

the "find_best_match()" function is used for finding the best match for a single medicine.

but what if we want to pass in a sheet of medicines and get the best match for every one of them?
the problem of this is that it'll take time if the sheet is long, so what can we do?
Here comes the power of "parallel programming", we'll use ThreadPools to find the best match for a number of medicines (4 for example) at the same time.

In [5]:
test_df = pd.read_excel('/kaggle/input/product-matching-dataset/Product Matching Dataset.xlsx', sheet_name='Dataset')
test_df = test_df.sample(frac=1).reset_index(drop=True)
test_df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,3689,اتورستات 10 مجم 14 قرص,اتورستات 10مجم 14قرص,40.0
1,1751,برونتو بلس 20 قرص,برونتو بلس اقراص,26.0
2,533,اميبرايد 50 مجم 20 قرص,اميبريد 50مجم 20 قرص/سعر جديد,81.0
3,1303,نوستامين قطرة 15 مل,نوستامين 15 ملل قطرة,15.0
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم كبسول سعر جديد,56.5


In [None]:
def find_best_matches(medicines, master, model, tfidf):
    results = []
    for medicine in medicines:
        match = (medicine, find_best_match(medicine, master, model, tfidf))
        print(match)
        results.append(match)
    return results

def find_best_matches_parallel(full_medicines, master, model, tfidf, n_jobs=4):
    chunks = np.array_split(full_medicines, n_jobs)
    
    with ThreadPoolExecutor(max_workers=n_jobs) as executor:
        try:
            results = list(executor.map(lambda chunk: find_best_matches(chunk, master, model, tfidf), chunks))
        except KeyboardInterrupt:
            print("Interrupted! Shutting down threads...")
            executor.shutdown(wait=False, cancel_futures=True)
        return [item for sublist in results for item in sublist]

results = find_best_matches_parallel(lst, df_master_file, xgb_model, tfidf)

# Step 3: Model Deployment
- we'll build a class that contains all the steps we did, just for the case of retraining on a new dataset or anything.
- we also saved the class object, which contains the model and other important attributes like: tfidf, medicine_frequency dictionary, which both of them are used in the find_best_match function.

In [9]:
class MedicineMatcher():
    def __init__(self, model, tfidf, data_for_training, master_file, medicine_frequency):
        self.model = model
        self.tfidf = tfidf
        self.data = data_for_training
        self.master = master_file
        self.m_freq = medicine_frequency

    def __generate_stop_words(self):
        nltk.download('stopwords')
        stop_words = stopwords.words('arabic')
        stop_words.append('سج')
        stop_words.append('سق')
        stop_words.append('سعر')
        stop_words.append('جديد')
        stop_words.append('قديم')
        stop_words.append('هام')
        stop_words.append('العامريه')
        stop_words.append('ادويا')
        stop_words.append('ادويه')
        stop_words.append('ركز')
        return stop_words

    def __data_cleaning(self, text):
        text = text.replace("أ", "ا")
        text = text.replace("آ", "ا")
        text = text.replace("إ", "ا")
        text = text.replace("ؤ", "و")
        text = text.replace("ئ", "ي")
        text = text.replace("ى", "ي")
        text = text.replace("ة", "ه")
        
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'(\d*|\s*)جنيه$', ' ', text)
        text = re.sub(r'س\d*ج', " ", text)
        text = re.sub(r'س\d*ق', " ", text)
        text = re.sub(r"(\w)\1+", r'\1', text)
        text = re.sub(r"[ء-ي]*(ركيرل|ركريل)[ء-ي]*", " ", text)
        text = re.sub(r"\d+$", " ", text)
        text = re.sub(r"(\s)\1+", r"\1", text)
        text = text.strip()

        text = ' '.join([word for word in text.split() if word not in stop_words])
        return text


    def __generate_negative_pairs(self, DF):
        arr = defaultdict(list)
        for idx in DF['sku']:
            #generate for example 5 negative matches, being put after the positive pairs,
            #the negative pairs can be (different medicine from master file: different sku, different sku but has high similarity)
            i = 0
            while i < 5:
                choice = random.choice(DF['sku'].unique())
                while choice == idx:
                    choice = random.choice(DF['sku'].unique())
                drug_b = random.choice(DF.loc[DF['sku']==choice, 'seller_item_name'].to_list())
                arr[idx].append(drug_b)
                i += 1
                
        new_rows = []

        # Iterate over negative_pairs to create new rows
        for sku, negatives in arr.items():
            # Get the correct marketplace product name for this SKU
            correct_name = self.data.loc[self.data['sku'] == sku, 'marketplace_product_name_ar'].values[0]
        
            # Create new rows for each negative pair
            for negative_drug in negatives:
                new_rows.append({'sku': sku, 
                                 'marketplace_product_name_ar': correct_name, 
                                 'seller_item_name': negative_drug, 
                                 'label': 0})
        
        # Convert new_rows into a DataFrame
        negative_df = pd.DataFrame(new_rows)
        return negative_df

    def data_preprocessing(self):
        stop_words = self.__generate_stop_words()
        self.data['marketplace_product_name_ar'] = self.data['marketplace_product_name_ar'].apply(self.__data_cleaning)
        self.data['seller_item_name'] = self.data['seller_item_name'].apply(self.__data_cleaning)
        df_unique = self.data.drop_duplicates()
        df_unique['label'] = 1
        negative_df = self.__generate_negative_pairs(df_unique)
        self.data = pd.concat([df_unique, negative_df], ignore_index=True)



    def __compute_similarities(self, row):
        name1, name2 = row['marketplace_product_name_ar'], row['seller_item_name']
        vec1 = self.tfidf.transform([name1])
        vec2 = self.tfidf.transform([name2])

        cosine_sim = cosine_similarity(vec1, vec2)[0][0]
        levenshtein = fuzz.ratio(name1, name2) / 100
        jaro_winkler = fuzz.WRatio(name1, name2) / 100

        return pd.Series([cosine_sim, levenshtein, jaro_winkler])
        
    def __medicineFrequency(self):
        for full_drug in self.master['product_name']:
            drug = re.split(r'\d+', full_drug)
            self.m_freq[drug[0]] += 1

        for full_drug in self.master['product_name_ar']:
            drug = re.split(r'\d+', full_drug)
            self.m_freq[drug[0]] += 1

    def Fit(self):
        # data preparation step
        self.tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
        all_names = pd.concat([self.data['marketplace_product_name_ar'], self.data['seller_item_name']])
        self.tfidf.fit(all_names)
        self.data[['cosine_sim', 'levenshtein', 'jaro_winkler']] = self.data.apply(self.__compute_similarities, axis=1)
        self.__medicineFrequency()
        
        #training
        X = self.data[['cosine_sim', 'levenshtein', 'jaro_winkler']]
        y = self.data['label']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        self.model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        self.model.fit(X_train, y_train)

    def evaluate(self):
        y_pred = xgb_model.predict(X_test)
        print(classification_report(y_test, y_pred))

    def find_best_match(self, input_name):
        """
        input_name: medicine name to be matched
        """
        similarities = []
        features_list = []
        if 97 <= ord(input_name[0].lower()) <= 122:
            for known_name in self.master['product_name']:
                vec1 = self.tfidf.transform([input_name])
                vec2 = self.tfidf.transform([known_name])
        
                cosine_sim = cosine_similarity(vec1, vec2)[0][0]
                levenshtein = fuzz.ratio(input_name, known_name) / 100
                jaro_winkler = fuzz.WRatio(input_name, known_name) / 100
                
                features = np.array([[cosine_sim, levenshtein, jaro_winkler]])
                prob = self.model.predict_proba(features)[0][1]  # Match probability
                similarities.append((known_name, prob))
                
        else:
            for known_name in self.master['product_name_ar']:
                vec1 = self.tfidf.transform([input_name])
                vec2 = self.tfidf.transform([known_name])
                
                cosine_sim = cosine_similarity(vec1, vec2)[0][0]
                levenshtein = fuzz.ratio(input_name, known_name) / 100
                jaro_winkler = fuzz.WRatio(input_name, known_name) / 100
            
                features = np.array([[cosine_sim, levenshtein, jaro_winkler]])
                prob = self.model.predict_proba(features)[0][1]  # Match probability
                similarities.append((known_name, prob))
        # Get the best match
        best_match = max(similarities, key=lambda x: x[1])
        if (best_match[1]) < 0.95 or self.m_freq[re.split(r'\d+', best_match[0])[0]] > 1:
            return (best_match, 'Not sure')
        return (best_match, 'sure')

    
    def __find_best_matches(self, medicines):
        results = []
        for medicine in medicines:
            match = (medicine, self.find_best_match(medicine))
            print(match)
            results.append(match)
        return results
    
    def find_best_matches_parallel(self, full_medicines, n_jobs=4):
        """
        full_medicines: a list of medicine names that need to be matched
        n_jobs: number of jobs to be done in parallel using ThreadPooling, used in splitting
        the list into chunks so each core will handle a chunk
        """
        chunks = np.array_split(full_medicines, n_jobs)
        
        with ThreadPoolExecutor(max_workers=n_jobs) as executor:
            try:
                results = list(executor.map(lambda chunk: self.__find_best_matches(chunk), chunks))
            except KeyboardInterrupt:
                print("Interrupted! Shutting down threads...")
                executor.shutdown(wait=False, cancel_futures=True)
            return [item for sublist in results for item in sublist]


In [None]:
medicine_matcher = MedicineMatcher(xgb_model, tfidf, final_df, df_master_file, medicine_frequency)

In [None]:
medicine_matcher.find_best_match('brofen')

In [None]:
# save the current object
with open("medicine_matcher.pkl", "wb") as f:
    pickle.dump(medicine_matcher, f)

In [10]:
# Load the saved class
with open("/kaggle/input/medicine-matcher/medicine_matcher.pkl", "rb") as f:
    matcher = pickle.load(f)

In [11]:
matcher.find_best_match('estoh3lt')

(('ESTOHALT 40 MG 14 CAP', 0.9995497), 'sure')

In [24]:
test_df = pd.read_excel("/kaggle/input/product-matching-dataset/Product Matching Dataset.xlsx", sheet_name="Dataset")
test_df = test_df.sample(frac=1).reset_index(drop=True)

In [25]:
lst = test_df['seller_item_name'][:20].to_list()

In [26]:
lst

['اوبلكس ان شراب س.ج',
 'ايمباكوزا 25مجم 30قرص ',
 'X-TENSION PLUS 150MG/12.5 MG 28 TAB',
 'اريكتاليس 2 قرص "تادالافيل',
 'بيوتين فورت 60 كبسول',
 'فاكتو مرهم س.ج',
 'اتوريزا10/40اقراص',
 'ديبرام 20 مجم اقراص',
 'فورتيموكس قطرة اوركيديا',
 'سالبوفنت شراب س ج',
 'بيوفيت 12 ديبو 2 امبول',
 'نازوكورت سبراى 151جنيه',
 'ازموراب 40مجم 14قرص ابيكس',
 'فلورست 20قرص**',
 'افوصويا30كبسولة',
 'انتى كوكس اا 15 30ق س ج ادويه',
 'مايوديورا 10مجم 3شريط',
 'نوفيستوريك10 مجم اقراص',
 'بكتيكلور 125 شراب',
 'اتاكاند4مجم %20س ج']

In [27]:
matcher.find_best_matches_parallel(lst)

('فاكتو مرهم س.ج', (('فاكتو مرهم 30 جم', 0.9995689), 'sure'))
('انتى كوكس اا 15 30ق س ج ادويه', (('انتيكوكس 15 مجم 30 قرص', 0.99618137), 'Not sure'))
('اوبلكس ان شراب س.ج', (('اوبلكس- ان شراب 125 مل', 0.9970078), 'sure'))
('بيوفيت 12 ديبو 2 امبول', (('بيوفيت 12 ديبو 2 امبول عضل', 0.9995542), 'sure'))
('اتوريزا10/40اقراص', (('اتوريزا 10/10 مجم 28 قرص', 0.9926933), 'Not sure'))
('مايوديورا 10مجم 3شريط', (('مايوديورا 10 مجم 30 قرص', 0.9952592), 'sure'))
('ايمباكوزا 25مجم 30قرص ', (('ايمباكوزا 25 مجم 30 قرص', 0.99963176), 'Not sure'))
('نازوكورت سبراى 151جنيه', (('دوجماتيل فورت 200 مجم 10 قرص', 0.26995596), 'Not sure'))
('ديبرام 20 مجم اقراص', (('ديبرام 20 مجم 20 قرص', 0.9946924), 'sure'))
('X-TENSION PLUS 150MG/12.5 MG 28 TAB', (('X-TENSION PLUS 150MG/12.5 MG 28 TAB', 0.99921775), 'Not sure'))
('ازموراب 40مجم 14قرص ابيكس', (('ازموراب 40 مجم 14 كبسول', 0.9971348), 'sure'))
('نوفيستوريك10 مجم اقراص', (('نوفيستوريك 10 مجم 14 قرص', 0.99944514), 'Not sure'))
('فورتيموكس قطرة اوركيديا', (('فورت

[('اوبلكس ان شراب س.ج', (('اوبلكس- ان شراب 125 مل', 0.9970078), 'sure')),
 ('ايمباكوزا 25مجم 30قرص ',
  (('ايمباكوزا 25 مجم 30 قرص', 0.99963176), 'Not sure')),
 ('X-TENSION PLUS 150MG/12.5 MG 28 TAB',
  (('X-TENSION PLUS 150MG/12.5 MG 28 TAB', 0.99921775), 'Not sure')),
 ('اريكتاليس 2 قرص "تادالافيل',
  (('اريكتاليس 20 مجم 2 قرص', 0.9985758), 'sure')),
 ('بيوتين فورت 60 كبسول',
  (('بيوتين فورت 5 مجم 60 كبسولة', 0.9978046), 'sure')),
 ('فاكتو مرهم س.ج', (('فاكتو مرهم 30 جم', 0.9995689), 'sure')),
 ('اتوريزا10/40اقراص', (('اتوريزا 10/10 مجم 28 قرص', 0.9926933), 'Not sure')),
 ('ديبرام 20 مجم اقراص', (('ديبرام 20 مجم 20 قرص', 0.9946924), 'sure')),
 ('فورتيموكس قطرة اوركيديا',
  (('فورتيموكس بلس قطرة عين 5 مل', 0.9978556), 'sure')),
 ('سالبوفنت شراب س ج',
  (('سالبوفنت 2 مجم /5 مل شراب 120 مل', 0.99835134), 'sure')),
 ('بيوفيت 12 ديبو 2 امبول',
  (('بيوفيت 12 ديبو 2 امبول عضل', 0.9995542), 'sure')),
 ('نازوكورت سبراى 151جنيه',
  (('دوجماتيل فورت 200 مجم 10 قرص', 0.26995596), 'Not sure')),