In [1]:
import pandas as pd

In [2]:
# Location of the Flipkart healthcare listings export, relative to the notebook
file_path = "flipkart_healthcare.csv"
# Read the CSV into the working DataFrame used by the rest of the notebook
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Display basic information about the dataset.
# `df.info()` prints its report and returns None, so it is called as a plain
# statement; `df.head()` is left as the cell's last expression so the preview
# renders on its own instead of inside a `(None, ...)` tuple.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           211 non-null    object 
 1   Ratings        211 non-null    float64
 2   no_ratings     211 non-null    int64  
 3   Selling Price  211 non-null    object 
 4   MRP            211 non-null    object 
 5   Discount       211 non-null    int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 10.0+ KB


(None,
                                                 Name  Ratings  no_ratings  \
 0  MUSCLEBLAZE Super Gainer XXL Weight Gainers/Ma...      4.0      179795   
 1  MUSCLEBLAZE Beginner's Whey Protein, No Added ...      4.2       60216   
 2  WOW Life Science Omega-3 1000mg Capsules with ...      4.4       20868   
 3  BIGMUSCLES NUTRITION Omega-3 Fish Oil Triple S...      4.3        8602   
 4                MUSCLEBLAZE Beginner's Whey Protein      4.2       60216   
 
   Selling Price    MRP  Discount  
 0           893  1,719        48  
 1         1,693  2,279        25  
 2           749    899        16  
 3           339    615        44  
 4           749    849        11  )

In [4]:
import numpy as np

In [5]:
# Convert 'Selling Price' and 'MRP' to numeric by removing thousands separators.
# Casting to str first keeps this cell re-runnable: once the columns are already
# float, the `.str` accessor would otherwise raise AttributeError on a second run.
# `regex=False` treats the comma as a literal character on every pandas version.
for price_col in ["Selling Price", "MRP"]:
    df[price_col] = (
        df[price_col].astype(str).str.replace(",", "", regex=False).astype(float)
    )

In [6]:
# New feature: absolute markdown in currency units (MRP minus Selling Price)
df["Price_Difference"] = df["MRP"].sub(df["Selling Price"])

In [7]:
# Binary target: 1 (High) when Discount is strictly above 20, otherwise 0 (Low)
is_high_discount = df["Discount"] > 20
df["Discount_Category"] = np.where(is_high_discount, 1, 0)

In [8]:
# Selecting relevant features for classification.
# 'Discount' is deliberately excluded: the target is defined as Discount > 20,
# so keeping it as an input would leak the label straight into the model.
features = ["Ratings", "no_ratings", "Price_Difference"]
X = df[features]
y = df["Discount_Category"]

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [10]:
# Standardise every feature column (zero mean, unit variance) so that no single
# feature dominates the distance computations used downstream
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [11]:
# Splitting the dataset into training and testing sets.
# The raw features are split first and the scaler is then fitted on the training
# portion only, so test-set statistics never influence the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from training rows
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

In [12]:
# Per-class counts of the target in the train and test partitions (in that order)
tuple(labels.value_counts() for labels in (y_train, y_test))

(Discount_Category
 1    123
 0     45
 Name: count, dtype: int64,
 Discount_Category
 1    31
 0    12
 Name: count, dtype: int64)

In [13]:
# Implementing Crayfish Optimization Algorithm (COA) for SVM hyperparameter tuning
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [14]:
class CrayfishOptimization:
    """Population-based random search for the RBF-SVM hyperparameters C and gamma.

    Every candidate is a ``[C, gamma]`` pair. On each iteration every candidate
    is perturbed with uniform noise, clipped back into the search box, and
    scored by fitting an ``SVC`` and measuring accuracy. The best-scoring pair
    seen so far is remembered, and the whole population then takes a smaller
    random step (also clipped to the search box).

    NOTE(review): fitness is measured on ``X_test`` / ``y_test``, so the chosen
    hyperparameters have already seen that split; a score reported later on the
    same split is optimistically biased. Pass a separate validation split here
    if an unbiased test score is required.

    NOTE(review): the same absolute step size is used for C (range 1-50) and
    gamma (range 0.001-1), so gamma is pushed onto its clip bounds far more
    often than C.

    Parameters
    ----------
    population_size : int
        Number of candidate ``[C, gamma]`` pairs (must be >= 1).
    max_iter : int
        Number of search iterations (must be >= 1).
    X_train, y_train
        Data used to fit each candidate SVM.
    X_test, y_test
        Data used to score each candidate SVM.
    random_state : int or None, optional
        Seed for a private random generator. With the default ``None`` the
        search draws from NumPy's global generator, so ``np.random.seed`` is
        still respected.

    Attributes
    ----------
    population : numpy.ndarray of shape (population_size, 2)
        Current candidates, columns ordered as ``[C, gamma]``.
    best_fitness_ : float or None
        Fitness of the solution returned by the last ``optimize`` call;
        ``None`` before ``optimize`` has run.

    Raises
    ------
    ValueError
        If ``population_size`` or ``max_iter`` is smaller than 1.
    """

    # Search box, ordered as (C, gamma). Kept in one place so that population
    # initialisation and clipping cannot drift apart.
    LOWER_BOUNDS = (1.0, 0.001)
    UPPER_BOUNDS = (50.0, 1.0)

    def __init__(self, population_size, max_iter, X_train, y_train, X_test, y_test, random_state=None):
        if population_size < 1:
            raise ValueError("population_size must be at least 1")
        if max_iter < 1:
            raise ValueError("max_iter must be at least 1")
        self.population_size = population_size
        self.max_iter = max_iter
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        # The `np.random` module exposes the same `uniform` signature as a
        # RandomState instance, so either can serve as the random source.
        self.rng = np.random if random_state is None else np.random.RandomState(random_state)
        self.best_fitness_ = None
        self.population = self.initialize_population()

    def initialize_population(self):
        """Draw ``population_size`` random ``[C, gamma]`` pairs inside the search box."""
        return self.rng.uniform(
            self.LOWER_BOUNDS, self.UPPER_BOUNDS, size=(self.population_size, 2)
        )

    def fitness(self, params):
        """Fit an RBF ``SVC`` with ``params = (C, gamma)`` and return its accuracy.

        The model is trained on ``X_train`` / ``y_train`` and scored on
        ``X_test`` / ``y_test``.
        """
        C, gamma = params
        model = SVC(C=C, gamma=gamma, kernel="rbf")
        model.fit(self.X_train, self.y_train)
        y_pred = model.predict(self.X_test)
        return accuracy_score(self.y_test, y_pred)

    def optimize(self):
        """Run the search and return the best ``[C, gamma]`` pair that was evaluated.

        Returns
        -------
        numpy.ndarray of shape (2,)
            The highest-fitness candidate seen; ties keep the earliest one.
        """
        best_solution = None
        # Start below any reachable score so the first evaluated candidate is
        # always accepted, even when every accuracy is 0.
        best_fitness = -np.inf
        for _ in range(self.max_iter):
            for candidate in self.population:
                trial = np.clip(
                    candidate + self.rng.uniform(-1, 1, 2),
                    self.LOWER_BOUNDS,
                    self.UPPER_BOUNDS,
                )
                trial_fitness = self.fitness(trial)

                # Update the best solution if the new one is better
                if trial_fitness > best_fitness:
                    best_fitness = trial_fitness
                    best_solution = trial

            # Move the whole population, keeping it inside the search box so
            # candidates cannot wander into invalid regions (e.g. gamma <= 0).
            drift = self.rng.uniform(-0.5, 0.5, self.population.shape)
            self.population = np.clip(
                self.population + drift, self.LOWER_BOUNDS, self.UPPER_BOUNDS
            )

        self.best_fitness_ = best_fitness
        return best_solution

In [15]:
# Run Crayfish Optimization Algorithm.
# The search draws from NumPy's global random generator, so seed it first to
# make the selected hyperparameters repeatable across kernel restarts.
np.random.seed(42)
coa = CrayfishOptimization(
    population_size=5,
    max_iter=10,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
best_params = coa.optimize()

In [16]:
# Refit a single RBF SVM using the hyperparameters chosen by the search,
# then predict labels for the held-out rows
best_C, best_gamma = best_params
svm_model = SVC(kernel="rbf", C=best_C, gamma=best_gamma).fit(X_train, y_train)
y_pred = svm_model.predict(X_test)