In [1]:
import pandas as pd

In [2]:
file_path = "Medicine_description.csv"

In [3]:
df = pd.read_csv(file_path)

In [4]:
df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22481 entries, 0 to 22480
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Drug_Name    22481 non-null  object
 1   Reason       22481 non-null  object
 2   Description  22467 non-null  object
dtypes: object(3)
memory usage: 527.0+ KB


(None,
                                            Drug_Name Reason  \
 0               A CN Gel(Topical) 20gmA CN Soap 75gm   Acne   
 1  A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...   Acne   
 2                             ACGEL CL NANO Gel 15gm   Acne   
 3                                ACGEL NANO Gel 15gm   Acne   
 4                              Acleen 1% Lotion 25ml   Acne   
 
                                          Description  
 0                      Mild to moderate acne (spots)  
 1  A RET 0.025% is a prescription medicine that i...  
 2  It is used to treat acne vulgaris in people 12...  
 3  It is used to treat acne vulgaris in people 12...  
 4  treat the most severe form of acne (nodular ac...  )

In [5]:
# Data Cleaning
df.drop_duplicates(inplace=True)  
df.dropna(subset=["Description"], inplace=True) 

In [6]:
# Convert text to lowercase for consistency
df["Drug_Name"] = df["Drug_Name"].str.lower().str.strip()
df["Reason"] = df["Reason"].str.lower().str.strip()
df["Description"] = df["Description"].str.lower().str.strip()

In [7]:
# Checking the cleaned dataset
df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 22467 entries, 0 to 22480
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Drug_Name    22467 non-null  object
 1   Reason       22467 non-null  object
 2   Description  22467 non-null  object
dtypes: object(3)
memory usage: 702.1+ KB


(None,
                                            Drug_Name Reason  \
 0               a cn gel(topical) 20gma cn soap 75gm   acne   
 1  a ret 0.05% gel 20gma ret 0.1% gel 20gma ret 0...   acne   
 2                             acgel cl nano gel 15gm   acne   
 3                                acgel nano gel 15gm   acne   
 4                              acleen 1% lotion 25ml   acne   
 
                                          Description  
 0                      mild to moderate acne (spots)  
 1  a ret 0.025% is a prescription medicine that i...  
 2  it is used to treat acne vulgaris in people 12...  
 3  it is used to treat acne vulgaris in people 12...  
 4  treat the most severe form of acne (nodular acne)  )

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [9]:
# Encode the target variable (Reason) as it is categorical
label_encoder = LabelEncoder()
df["Reason_Encoded"] = label_encoder.fit_transform(df["Reason"])

In [10]:
# Convert text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["Description"]).toarray()
y = df["Reason_Encoded"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Split the dataset into training and testing sets (80% train, 20% test)

In [12]:
# Check the shape of the data
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17973, 910), (4494, 910), (17973,), (4494,))

In [13]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.svm import SVC

In [14]:
# Define Crayfish Optimization Algorithm (COA) for hyperparameter tuning of SVM
class CrayfishOptimization:
    def __init__(self, population_size, max_iter, X_train, y_train, X_test, y_test):
        self.population_size = population_size 
        self.max_iter = max_iter 
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.population = self.initialize_population()

    def initialize_population(self):
        # Initialize random hyperparameters: C (1-100) and gamma (0.001-1) for SVM
        return np.array([[np.random.uniform(1, 100), np.random.uniform(0.001, 1)] for _ in range(self.population_size)])

    def fitness(self, params):
        C, gamma = params
        model = SVC(C=C, gamma=gamma, kernel="rbf") 
        model.fit(self.X_train, self.y_train)
        y_pred = model.predict(self.X_test)
        return accuracy_score(self.y_test, y_pred) 

    def optimize(self):
        best_solution = None
        best_fitness = 0
        for iteration in range(self.max_iter):
            for i in range(self.population_size):
                new_solution = self.population[i] + np.random.uniform(-1, 1, 2)  
                new_solution = np.clip(new_solution, [1, 0.001], [100, 1])  
                new_fitness = self.fitness(new_solution)

                # Update the best solution if the new one is better
                if new_fitness > best_fitness:
                    best_fitness = new_fitness
                    best_solution = new_solution

            self.population += np.random.uniform(-0.5, 0.5, (self.population_size, 2))  

        return best_solution


In [50]:
# Select a sample from the dataset to improve efficiency
sample_size = 1000
if len(df) > sample_size:
    df_sample = df.sample(n=sample_size, random_state=42)
else:
    df_sample = df


In [51]:
# Prepare the sampled dataset for training
X_sample = vectorizer.transform(df_sample["Description"]).toarray()
y_sample = label_encoder.transform(df_sample["Reason"])
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

In [52]:
# Run Crayfish Optimization Algorithm on the sampled data
coa = CrayfishOptimization(population_size=5, max_iter=10, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
best_params = coa.optimize()

In [53]:
# Train the final SVM model with the best parameters
best_C, best_gamma = best_params
svm_model = SVC(C=best_C, gamma=best_gamma, kernel="rbf")
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

In [54]:
# Calculate evaluation metrics in percentage form
precision = precision_score(y_test, y_pred, average="weighted",zero_division=1) * 100
recall = recall_score(y_test, y_pred, average="weighted") * 100
f1 = f1_score(y_test, y_pred, average="weighted") * 100
accuracy = accuracy_score(y_test, y_pred) * 100

In [55]:
# Display the final evaluation metrics
metrics = {
    "Precision (%)": precision,
    "Recall (%)": recall,
    "F1-Score (%)": f1,
    "Accuracy (%)": accuracy,
    "Best C": best_C,
    "Best Gamma": best_gamma
}

print(metrics)

{'Precision (%)': 97.29166666666666, 'Recall (%)': 96.5, 'F1-Score (%)': 95.29052628649355, 'Accuracy (%)': 96.5, 'Best C': np.float64(56.469786431290785), 'Best Gamma': np.float64(0.028619401541105538)}
