In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [2]:
# Load the dataset (you can adjust the file path as needed)
medicines = pd.read_csv('/content/medicine.csv')

# Check the first few rows of the dataset
medicines.head()

Unnamed: 0,index,Drug_Name,Reason,Description
0,1,A CN Gel(Topical) 20gmA CN Soap 75gm,Acne,Mild to moderate acne (spots)
1,2,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,Acne,A RET 0.025% is a prescription medicine that i...
2,3,ACGEL CL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...
3,4,ACGEL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...
4,5,Acleen 1% Lotion 25ml,Acne,treat the most severe form of acne (nodular ac...


In [3]:
# Preprocess the Drug_Name column
medicines['Drug_Name'] = medicines['Drug_Name'].str.lower()

# Remove special characters or unnecessary symbols (optional)
medicines['Drug_Name'] = medicines['Drug_Name'].str.replace('[^a-zA-Z0-9\s]', '', regex=True)


In [4]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the Drug_Name column
tfidf_matrix = tfidf.fit_transform(medicines['Drug_Name'])

# Calculate cosine similarity
similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Save the similarity matrix to use in your Streamlit app
pickle.dump(similarity, open('similarity.pkl', 'wb'))


In [8]:
# Create a recommendation function
def recommend(medicine_name, medicines_df, similarity_matrix):
    # Find the index of the medicine in the dataframe
    try:
        idx = medicines_df[medicines_df['Drug_Name'] == medicine_name.lower()].index[0]
    except IndexError:
        return "Medicine not found in the dataset."

    # Get the similarity scores for this medicine
    sim_scores = list(enumerate(similarity_matrix[idx]))

    # Sort the medicines by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]  # Exclude the first one (it's the medicine itself)

    # Get the recommended medicines
    recommended_medicines = [medicines_df.iloc[i[0]].Drug_Name for i in sim_scores]

    return recommended_medicines

# Test the recommendation function
recommend('Acnehit Gel 15gm', medicines, similarity)

['acnelak z gel 15gm',
 'dolonex gel 15gm',
 'atm a gel 15gm',
 'hydroheal am gel 50gmhydroheal am gel 15gm',
 'hydroheal am gel 50gmhydroheal am gel 15gm']

In [9]:
# Convert the medicines DataFrame to a dictionary and save it
medicine_dict = medicines.to_dict()
pickle.dump(medicine_dict, open('medicine_dict.pkl', 'wb'))

# The similarity matrix has already been saved as 'similarity.pkl'

In [10]:
from google.colab import files

# Download similarity.pkl
files.download('similarity.pkl')

# Download medicine_dict.pkl
files.download('medicine_dict.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>