# Medicine Recommender System

In [34]:
import pandas as pd 
import numpy as np

In [35]:
df = pd.read_csv('dataset/medicine.csv')

In [36]:
df.head()

Unnamed: 0,index,Drug_Name,Reason,Description
0,1,A CN Gel(Topical) 20gmA CN Soap 75gm,Acne,Mild to moderate acne (spots)
1,2,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,Acne,A RET 0.025% is a prescription medicine that i...
2,3,ACGEL CL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...
3,4,ACGEL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...
4,5,Acleen 1% Lotion 25ml,Acne,treat the most severe form of acne (nodular ac...


In [37]:
df.shape

(9720, 4)

In [38]:
df.isnull().sum()

index          0
Drug_Name      0
Reason         0
Description    0
dtype: int64

In [39]:
df.duplicated().sum()

0

In [40]:
df.describe()

Unnamed: 0,index
count,9720.0
mean,4860.5
std,2806.066642
min,1.0
25%,2430.75
50%,4860.5
75%,7290.25
max,9720.0


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9720 entries, 0 to 9719
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   index        9720 non-null   int64 
 1   Drug_Name    9720 non-null   object
 2   Reason       9720 non-null   object
 3   Description  9720 non-null   object
dtypes: int64(1), object(3)
memory usage: 303.9+ KB


In [42]:
df.Description

0                           Mild to moderate acne (spots)
1       A RET 0.025% is a prescription medicine that i...
2       It is used to treat acne vulgaris in people 12...
3       It is used to treat acne vulgaris in people 12...
4       treat the most severe form of acne (nodular ac...
                              ...                        
9715                              used for treating warts
9716                        used to soften the skin cells
9717                                       used for scars
9718                                      used for wounds
9719    used to treat and remove raised warts (usually...
Name: Description, Length: 9720, dtype: object

In [43]:
# Splitting each value in the 'Description' column into a list of words (using whitespace as a delimiter)
df['Description'] = df['Description'].apply(lambda x: x.split())

# Splitting each value in the 'Reason' column into a list of words (using whitespace as a delimiter)
df['Reason'] = df['Reason'].apply(lambda x: x.split())

In [44]:
df.head()

Unnamed: 0,index,Drug_Name,Reason,Description
0,1,A CN Gel(Topical) 20gmA CN Soap 75gm,[Acne],"[Mild, to, moderate, acne, (spots)]"
1,2,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,[Acne],"[A, RET, 0.025%, is, a, prescription, medicine..."
2,3,ACGEL CL NANO Gel 15gm,[Acne],"[It, is, used, to, treat, acne, vulgaris, in, ..."
3,4,ACGEL NANO Gel 15gm,[Acne],"[It, is, used, to, treat, acne, vulgaris, in, ..."
4,5,Acleen 1% Lotion 25ml,[Acne],"[treat, the, most, severe, form, of, acne, (no..."


In [45]:
# For each value in the 'Description' column, replace any spaces within the individual words by iterating through the list and removing spaces from each element.
df['Description'] = df['Description'].apply(lambda x: [i.replace(" ", "") for i in x])

In [46]:
# Concatenating the lists in the 'Description' and 'Reason' columns element-wise to create a new 'tags' column.
df['tags'] = df['Description'] + df['Reason']

In [47]:
df.head()

Unnamed: 0,index,Drug_Name,Reason,Description,tags
0,1,A CN Gel(Topical) 20gmA CN Soap 75gm,[Acne],"[Mild, to, moderate, acne, (spots)]","[Mild, to, moderate, acne, (spots), Acne]"
1,2,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,[Acne],"[A, RET, 0.025%, is, a, prescription, medicine...","[A, RET, 0.025%, is, a, prescription, medicine..."
2,3,ACGEL CL NANO Gel 15gm,[Acne],"[It, is, used, to, treat, acne, vulgaris, in, ...","[It, is, used, to, treat, acne, vulgaris, in, ..."
3,4,ACGEL NANO Gel 15gm,[Acne],"[It, is, used, to, treat, acne, vulgaris, in, ...","[It, is, used, to, treat, acne, vulgaris, in, ..."
4,5,Acleen 1% Lotion 25ml,[Acne],"[treat, the, most, severe, form, of, acne, (no...","[treat, the, most, severe, form, of, acne, (no..."


In [48]:
# Creating a new DataFrame 'new_df' that contains only the 'index', 'Drug_Name', and 'tags' columns from the 'data' DataFrame.
new_df = df[['index', 'Drug_Name', 'tags']]

In [49]:
new_df.head()

Unnamed: 0,index,Drug_Name,tags
0,1,A CN Gel(Topical) 20gmA CN Soap 75gm,"[Mild, to, moderate, acne, (spots), Acne]"
1,2,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,"[A, RET, 0.025%, is, a, prescription, medicine..."
2,3,ACGEL CL NANO Gel 15gm,"[It, is, used, to, treat, acne, vulgaris, in, ..."
3,4,ACGEL NANO Gel 15gm,"[It, is, used, to, treat, acne, vulgaris, in, ..."
4,5,Acleen 1% Lotion 25ml,"[treat, the, most, severe, form, of, acne, (no..."


In [50]:
# Joining the list of words in the 'tags' column into a single string separated by spaces.
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


In [51]:
# Converting all characters in the 'tags' column to lowercase.
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [52]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

# Initializing CountVectorizer with stop words set to English and limiting features to the top 5000 most frequent words
cv = CountVectorizer(stop_words="english", max_features=5000)

In [54]:
# Defining a function 'stem' that stems each word in the input text
def stem(text):
    y = []
    for i in text.split():  # Splitting the text into individual words
        y.append(ps.stem(i))  # Applying the PorterStemmer to each word and appending the result to the list
    return " ".join(y)  # Joining the stemmed words back into a single string

In [55]:
# Testing the stem function on a sample string
stem("You are so so beautiful")

'you are so so beauti'

In [56]:
# Applying the stem function to the 'tags' column of the new_data DataFrame to stem each word in the 'tags'
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [58]:
new_df.head()

Unnamed: 0,index,Drug_Name,tags
0,1,A CN Gel(Topical) 20gmA CN Soap 75gm,mild to moder acn (spots) acn
1,2,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,a ret 0.025% is a prescript medicin that is us...
2,3,ACGEL CL NANO Gel 15gm,it is use to treat acn vulgari in peopl 12 yea...
3,4,ACGEL NANO Gel 15gm,it is use to treat acn vulgari in peopl 12 yea...
4,5,Acleen 1% Lotion 25ml,treat the most sever form of acn (nodular acne...


In [59]:
# Transforming the 'tags' column in 'new_df' into a matrix of token counts using CountVectorizer, 
# and converting the result into an array (one-hot encoding representation of words).
vectors = cv.fit_transform(new_df['tags']).toarray()

In [60]:
vectors.shape

(9720, 806)

In [66]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [67]:
cv.get_feature_names_out()

array(['025', '12', '16', '18', 'abdomin', 'abl', 'ach', 'acid', 'acn',
       'acne', 'acquir', 'action', 'activ', 'acut', 'acute', 'adequ',
       'adhd', 'adjunct', 'adolesc', 'adult', 'adults', 'affect', 'ag',
       'age', 'aids', 'allerg', 'allergen', 'allergi', 'allow', 'alon',
       'alzheim', 'alzheimer', 'alzheimerâ', 'amoebiasi', 'anaemia',
       'anal', 'angina', 'angl', 'ani', 'ankylos', 'anorexia', 'anoth',
       'anti', 'antioxid', 'antipsychot', 'antiretrovir', 'anxieti',
       'anxiou', 'anxious', 'apnoea', 'appear', 'appetit', 'appetite',
       'appli', 'appropri', 'area', 'arrhythmia', 'arrhythmiasi',
       'arteri', 'arthralgia', 'arthriti', 'associ', 'atherothrombot',
       'athleteâ', 'atop', 'atrial', 'attack', 'awak', 'b1', 'b2', 'b3',
       'b5', 'b6', 'babi', 'backache', 'bacteri', 'bacteria', 'balanc',
       'balanitis', 'bandag', 'becom', 'behaviour', 'beliefs', 'benefit',
       'beta', 'biliari', 'biotin', 'bite', 'blackhead', 'blackheads',
      

In [69]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculating the cosine similarity between all the vectors in the 'vectors' array
similarity = cosine_similarity(vectors)

In [70]:
# Displaying the similarity values for the second item (index 1) in the similarity matrix
similarity[1]

array([0.25197632, 1.        , 0.25660012, ..., 0.19245009, 0.1490712 ,
       0.0860663 ])

In [80]:
# Defining a function 'recommendation' that recommends similar medicines based on cosine similarity
def recommendation(medicine):
    # Finding the index of the medicine in 'new_df' that matches the input 'medicine' name
    medicine_index = new_df[new_df['Drug_Name'] == medicine].index[0] 
    
    # Retrieving the similarity scores for the specified medicine
    distance = similarity[medicine_index]
    
    # Creating a sorted list of medicine indices and their similarity scores, sorted in descending order
    # Excluding the first item (itself), and selecting the top 6 most similar medicines
    medicine_list = sorted(list(enumerate(distance)), reverse=True, key=lambda x: x[1])[1:7]
    
    # Looping through the top 6 recommended medicines and printing their names
    for i in medicine_list:
        print(new_df.iloc[i[0]].Drug_Name)

In [81]:
recommendation("ACGEL CL NANO Gel 15gm")

ACGEL NANO Gel 15gm
Acnehit Gel 15gm
Acnelak Soap 75gm
Acnetor AD 1% Ointment 15gm
Acnetor AD Cream 15Acnetor AD Gel 15gm
Acnoff Anti Acne Bar 100gm


In [82]:
recommendation("A CN Gel(Topical) 20gmA CN Soap 75gm") 

Acnedap Gel 15gm
Acnetoin 20mg Capsule 10'SAcnetoin Gel 15gm
Acnin Pimple Care Face Pack 50gm
Adapnil Gel 15gm
Alene Gel 15gm
Atret 0.5% Cream 30gmATRET 0.025% Cream 30gm


Aclene 0.10% Gel 15gm
Acnay Gel 10gm
Acnelak Clz Cream 15gm
Acnelak Z Lotion 15gm
Acnemoist Cream 60gm
Acnewar Gel 15gm


In [91]:
recommendation("Wound Fix Solution 100ml")

Betaseptic Ointment 15gmBetaseptic Solution 500mlBetaseptic Solution 30ml
Cadress Ointment 10gm
Drez Gargle Syrup 100mlDrez 10% Solution 60mlDrez Solution 100mlDrez S Powder 10gmDrez Solution 500mlDrez Spray 40gmDrez 10% Ointment 15gm
Eusol Solution 100ml
Hydrogen Peroxide(Cymer) Solution 100ml
Intadine HS 10% Solution 100ml


In [94]:
import pickle
pickle.dump(new_df.to_dict(), open("model/medicine_dictionary.pkl", "wb"))
pickle.dump(similarity, open("model/similarity.pkl", "wb"))