In [1]:
 import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("medlr_assignment_dataset.csv")

# Display basic information
print("Initial Dataset Info:")
df.info()

# Display missing values count
print("\nMissing Values Before Cleaning:")
print(df.isnull().sum())

# Standardize Column Names (Lowercase & Underscore Format)
df.columns = df.columns.str.lower().str.strip().str.replace(" ", "_")

# Handling Inconsistencies in Medicine Names
df["name"] = df["name"].str.lower().str.strip()  # Convert to lowercase and remove extra spaces
df["name"] = df["name"].str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)  # Remove special characters

# Standardize Manufacturer Names
df["manufacturer"] = df["manufacturer"].str.lower().str.strip()

# Standardizing Dosage and Quantity Formats
df["quantity"] = df["quantity"].str.lower().str.strip().replace({"tablet\(s\)": "tablets", "strip": "strips"}, regex=True)

# Handle Missing Data
# Fill missing numerical values with median
for col in ["retail_price", "discounted_price"]:
    df[col] = df[col].fillna(df[col].median())

# Fill missing categorical values with mode (most frequent value)
for col in ["manufacturer", "quantity", "packaging_form"]:
    df[col] = df[col].fillna(df[col].mode()[0])

# Drop duplicates
df.drop_duplicates(inplace=True)

# Display missing values count after cleaning
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# Save cleaned dataset
df.to_csv("cleaned_medicine_dataset.csv", index=False)

# Display first 5 rows of cleaned dataset
df.head()  

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1448 entries, 0 to 1447
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1448 non-null   object 
 1   name                   1448 non-null   object 
 2   source                 1448 non-null   object 
 3   prescription_required  1448 non-null   bool   
 4   retail_price           1367 non-null   float64
 5   discounted_price       1384 non-null   float64
 6   manufacturer           1446 non-null   object 
 7   quantity               1184 non-null   object 
 8   packaging_form         931 non-null    object 
 9   salts                  1389 non-null   object 
dtypes: bool(1), float64(2), object(7)
memory usage: 103.4+ KB

Missing Values Before Cleaning:
id                         0
name                       0
source                     0
prescription_required      0
retail_price              81
discou

Unnamed: 0,id,name,source,prescription_required,retail_price,discounted_price,manufacturer,quantity,packaging_form,salts
0,source_6-75716,dolo 1gm tablet,source_6,False,45.58,40.11,micro labs,10 tablets in strips,STRIP,Paracetamol / Acetaminophen(1.0 G)
1,source_2-39579,dolo 1000mg infusion,source_2,True,268.0,219.76,micro labs ltd,10 tablets in strips,STRIP,Paracetamol/Acetaminophen 1000mg
2,source_7-90343,dolo 1000mg tablet,source_7,False,45.58,35.1,micro labs ltd,10.0 tablets in 1 strips,STRIP,Paracetamol (1000mg)
3,source_1-65076,dolo 1000 mg tablet 10s,source_1,True,45.5,40.0,micro labs ltd,10,Strip | Tablet,PARACETAMOL-1000MG
4,source_1-65077,dolo 120 mg suspension 60 ml,source_1,False,40.0,35.2,micro labs ltd,60,Bottle | Suspension,PARACETAMOL-120MG


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the cleaned dataset
df = pd.read_csv("cleaned_medicine_dataset.csv")

# Combine relevant text columns to form a feature set
df["combined_features"] = df["salts"].astype(str) + " " + df["quantity"].astype(str) + " " + df["packaging_form"].astype(str)

# Convert text to numerical vectors using TF-IDF
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["combined_features"])

# Apply K-Means Clustering
num_clusters = 10  # Choose based on dataset size
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X)

# Save clustered results
df.to_csv("clustered_medicine_dataset.csv", index=False)

# Display cluster-wise samples
for cluster_num in range(num_clusters):
    print(f"\nCluster {cluster_num}:")
    print(df[df["cluster"] == cluster_num][["name", "salts", "quantity", "packaging_form"]].head(5))




Cluster 0:
                    name                                salts  \
0        dolo 1gm tablet   Paracetamol / Acetaminophen(1.0 G)   
1   dolo 1000mg infusion     Paracetamol/Acetaminophen 1000mg   
2     dolo 1000mg tablet                 Paracetamol (1000mg)   
6    dolo 120 suspension  Paracetamol/Acetaminophen 120mg/5ml   
18       dolo 500 tablet                    Paracetamol 500mg   

                    quantity packaging_form  
0       10 tablets in strips          STRIP  
1       10 tablets in strips          STRIP  
2   10.0 tablets in 1 strips          STRIP  
6       10 tablets in strips          STRIP  
18      10 tablets in strips          STRIP  

Cluster 1:
                           name  \
60                 dolot tablet   
61             dolot tablet 10s   
86  dolobak strip of 10 tablets   
87               dolobak tablet   
88           dolobak tablet 10s   

                                                salts              quantity  \
60            Trama

In [3]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv("cleaned_medicine_dataset.csv")

# Select a small sample manually (modify the condition as needed)
sample_df = df[df["name"].str.contains("Dolo|Paracetamol", case=False, na=False)].head(15)  # Example selection

# Manually assign cluster IDs based on salts, quantity, and packaging form
# Example logic: If salts and dosage match, assign same cluster
sample_df["cluster_id"] = sample_df.groupby(["salts", "quantity", "packaging_form"]).ngroup()

# Save sample clustered dataset
sample_df.to_csv("sample_clustered_dataset.csv", index=False)

# Display the sample dataset
print(sample_df[["id", "name", "salts", "quantity", "packaging_form", "cluster_id"]])


                id                          name  \
0   source_6-75716               dolo 1gm tablet   
1   source_2-39579          dolo 1000mg infusion   
2   source_7-90343            dolo 1000mg tablet   
3   source_1-65076       dolo 1000 mg tablet 10s   
4   source_1-65077  dolo 120 mg suspension 60 ml   
5   source_6-75717    dolo 120mg suspension 60ml   
6   source_2-39580           dolo 120 suspension   
7    source_3-2151           dolo 156mg syp 60ml   
8   source_7-90345       dolo 15625mg suspension   
9   source_1-65079     dolo 15625 mg syrup 60 ml   
10  source_6-75718    dolo 250mg suspension 60ml   
11   source_3-2152           dolo 250mg syp 60ml   
12  source_7-90346      dolo 250 oral suspension   
13  source_1-65080    dolo 250 suspension 100 ml   
14  source_1-65081     dolo 250 suspension 60 ml   

                                        salts                   quantity  \
0          Paracetamol / Acetaminophen(1.0 G)       10 tablets in strips   
1            Pa

In [4]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, adjusted_rand_score

# Load the manually labeled dataset (true labels)
manual_df = pd.read_csv("manual_labeled_clusters.csv")

# Load the clustered dataset (predicted clusters)
clustered_df = pd.read_csv("clustered_medicine_dataset.csv")

# Ensure both datasets have the necessary columns
if "id" in manual_df.columns and "id" in clustered_df.columns:
    merged_df = manual_df.merge(clustered_df[['id', 'cluster']], on="id", how="inner")
elif "name" in manual_df.columns and "name" in clustered_df.columns:
    merged_df = manual_df.merge(clustered_df[['name', 'cluster']], on="name", how="inner")
else:
    raise ValueError("Matching column ('id' or 'name') not found in both datasets.")

# Rename columns for clarity
merged_df.rename(columns={"true_cluster": "y_true", "cluster": "y_pred"}, inplace=True)

# Convert cluster labels to categorical type (if they are not numeric)
merged_df["y_true"] = merged_df["y_true"].astype(str)
merged_df["y_pred"] = merged_df["y_pred"].astype(str)

# Compute clustering accuracy metrics
rand_index = adjusted_rand_score(merged_df["y_true"], merged_df["y_pred"])
precision = precision_score(merged_df["y_true"], merged_df["y_pred"], average="macro", zero_division=0)
recall = recall_score(merged_df["y_true"], merged_df["y_pred"], average="macro", zero_division=0)
f1 = f1_score(merged_df["y_true"], merged_df["y_pred"], average="macro", zero_division=0)

# Display results
print("\n--- Clustering Accuracy Metrics ---")
print(f"Adjusted Rand Index (ARI): {rand_index:.2f} (Best if close to 1)")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")



--- Clustering Accuracy Metrics ---
Adjusted Rand Index (ARI): 1.00 (Best if close to 1)
Precision: nan
Recall: nan
F1 Score: nan
