In [1]:
import numpy as np
import pandas as pd

In [2]:
description_data = pd.read_csv("track.csv")

In [3]:
print(description_data.head())
print(description_data.info())
print(description_data.isnull().sum())

                                           Drug_Name Reason  \
0               A CN Gel(Topical) 20gmA CN Soap 75gm   Acne   
1  A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...   Acne   
2                             ACGEL CL NANO Gel 15gm   Acne   
3                                ACGEL NANO Gel 15gm   Acne   
4                              Acleen 1% Lotion 25ml   Acne   

                                         Description  
0                      Mild to moderate acne (spots)  
1  A RET 0.025% is a prescription medicine that i...  
2  It is used to treat acne vulgaris in people 12...  
3  It is used to treat acne vulgaris in people 12...  
4  treat the most severe form of acne (nodular ac...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22481 entries, 0 to 22480
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Drug_Name    22481 non-null  object
 1   Reason       22481 non-null  object
 2   Description  2

In [4]:
# Drop rows where Description is missing
description_data = description_data.dropna(subset=['Description'])

In [5]:
import re
import nltk

# Download required NLTK resources if you haven't already
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()                          # Lowercase
    text = re.sub(r'[^\w\s]', '', text)               # Remove punctuation
    tokens = text.split()                             # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]       # Lemmatize
    return ' '.join(tokens)

# Apply cleaning to the Description column
description_data['Description_Clean'] = description_data['Description'].apply(clean_text)

# Optionally, clean Drug_Name as well
description_data['Drug_Name_Clean'] = description_data['Drug_Name'].str.lower().apply(lambda x: re.sub(r'[^\w\s]', '', x))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\palak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\palak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer. You can adjust max_features as needed.
vectorizer = TfidfVectorizer(max_features=500)

# Fit and transform the cleaned Description text
tfidf_matrix = vectorizer.fit_transform(description_data['Description_Clean'])

# Convert the sparse matrix to a dense NumPy array (if needed)
X_tfidf = tfidf_matrix.toarray()

# Print the shape to verify dimensions
print("TF-IDF Matrix shape:", X_tfidf.shape)


TF-IDF Matrix shape: (22467, 500)


In [7]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_labels= encoder.fit_transform(description_data['Reason'])

In [8]:
from sklearn.preprocessing import LabelEncoder

# Encode the drug names (target variable)
drug_encoder = LabelEncoder()
y_drug = drug_encoder.fit_transform(description_data['Drug_Name'])

# Verify the target classes (drug names)
print("Unique drugs:", drug_encoder.classes_)
print("First 10 encoded drug labels:", y_drug[:10])


Unique drugs: ["1 AL M 10/5mg Tablet 10'S"
 "1 AL Plus Capsule 10'S1 AL 10mg Tablet 15'S1 AL 5mg Tablet 10'S1 AL Max Tablet 10'S1 AL Syrup 30ml"
 "1000 Para 1000mg Tablet 12'S" ...
 "Zyvana M 1mg Tablet 10'SZyvana M 2mg Tablet 10'S"
 "Zyvana M2 Forte Tablet 10'S"
 "Zyven OD 25mg Tablet 10'SZyven OD 50mg Tablet 10'SZyven OD 100mg Tablet 10'S"]
First 10 encoded drug labels: [ 39  40 102 104 483 484 498 499 500 501]


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [10]:
def objective_function(solution):
    """
    Evaluate a candidate solution (binary vector) for feature selection.
    solution: binary vector of length equal to the number of features.
    
    Returns:
    fitness: a value that combines classification error and a penalty for
             the number of features selected. Lower is better.
    """
    # Determine which features are selected
    selected_features = [i for i, bit in enumerate(solution) if bit == 1]
    
    # If no features are selected, return a high fitness value (bad solution)
    if len(selected_features) == 0:
        return 1.0  # worst case
    
    # Select features from the TF-IDF matrix
    X_selected = X_tfidf[:, selected_features]
    
    # Use a Random Forest classifier and 5-fold cross-validation to evaluate
    classifier = RandomForestClassifier(random_state=42)
    scores = cross_val_score(classifier, X_selected, y_labels, cv=4, scoring='accuracy')
    error_rate = 1 - np.mean(scores)
    
    # Penalty: encourage fewer features. For example, add the ratio of selected features.
    penalty = len(selected_features) / X_tfidf.shape[1]
    
    # Total fitness is error rate plus penalty (you can adjust the weights if needed)
    fitness = error_rate + penalty
    return fitness

# Test the objective function with a random binary vector
num_features = X_tfidf.shape[1]
test_solution = np.random.randint(0, 2, size=num_features)
print("Test fitness:", objective_function(test_solution))


Test fitness: 0.6151021324028044


In [11]:
def grey_wolf_optimization(objective_func, num_features, num_wolves=10, max_iter=7):
    """
    Perform Grey Wolf Optimization (GWO) for feature selection.
    
    Parameters:
    - objective_func: function to evaluate the fitness of a candidate solution.
    - num_features: number of features (length of binary vector).
    - num_wolves: number of candidate solutions (wolves).
    - max_iter: maximum number of iterations.
    
    Returns:
    - alpha: best feature subset (binary vector).
    - best_fitness: fitness value of the best solution.
    """
    # Initialize the population: each wolf is a binary vector of size num_features.
    positions = np.random.randint(0, 2, size=(num_wolves, num_features))
    
    # Evaluate fitness for each wolf
    fitness = np.array([objective_func(positions[i]) for i in range(num_wolves)])
    
    # Identify the best three wolves: alpha (best), beta (second best), delta (third best)
    idx_sorted = np.argsort(fitness)  # lower fitness is better
    alpha = positions[idx_sorted[0]].copy()
    beta = positions[idx_sorted[1]].copy()
    delta = positions[idx_sorted[2]].copy()
    best_fitness = fitness[idx_sorted[0]]
    
    # Main optimization loop
    for iter in range(max_iter):
        # Parameter a decreases linearly from 2 to 0
        a = 2 - iter * (2 / max_iter)
        
        # Update the position of each wolf in the population
        for i in range(num_wolves):
            for j in range(num_features):
                # Update based on alpha
                r1, r2 = np.random.random(), np.random.random()
                A1 = 2 * a * r1 - a
                C1 = 2 * r2
                D_alpha = abs(C1 * alpha[j] - positions[i, j])
                X1 = alpha[j] - A1 * D_alpha
                
                # Update based on beta
                r1, r2 = np.random.random(), np.random.random()
                A2 = 2 * a * r1 - a
                C2 = 2 * r2
                D_beta = abs(C2 * beta[j] - positions[i, j])
                X2 = beta[j] - A2 * D_beta
                
                # Update based on delta
                r1, r2 = np.random.random(), np.random.random()
                A3 = 2 * a * r1 - a
                C3 = 2 * r2
                D_delta = abs(C3 * delta[j] - positions[i, j])
                X3 = delta[j] - A3 * D_delta
                
                # Combine the three influences
                new_position = (X1 + X2 + X3) / 3
                
                # Convert continuous value to binary using a threshold (0.5)
                positions[i, j] = 1 if new_position > 0.5 else 0
        
        # Evaluate fitness of updated positions
        fitness = np.array([objective_func(positions[i]) for i in range(num_wolves)])
        idx_sorted = np.argsort(fitness)
        
        # Update alpha, beta, delta if better solutions are found
        if fitness[idx_sorted[0]] < best_fitness:
            best_fitness = fitness[idx_sorted[0]]
            alpha = positions[idx_sorted[0]].copy()
            beta = positions[idx_sorted[1]].copy()
            delta = positions[idx_sorted[2]].copy()
        
        print(f"Iteration {iter+1}/{max_iter}, Best Fitness: {best_fitness}")
    
    return alpha, best_fitness

# Run the GWO algorithm
best_solution, best_fitness = grey_wolf_optimization(objective_function, num_features,
                                                     num_wolves=10, max_iter=7)
print("Best Feature Subset (binary vector):")
print(best_solution)
print("Best Fitness Achieved:")
print(best_fitness)


Iteration 1/7, Best Fitness: 0.5168730724723025
Iteration 2/7, Best Fitness: 0.49028274780922987
Iteration 3/7, Best Fitness: 0.4548265483749728
Iteration 4/7, Best Fitness: 0.44333099135104204
Iteration 5/7, Best Fitness: 0.4220876071546136
Iteration 6/7, Best Fitness: 0.41733092002452876
Iteration 7/7, Best Fitness: 0.41733092002452876
Best Feature Subset (binary vector):
[1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0
 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 1 0
 0 0 1 0 1 0 1 1 0 0 1 0 0 0 1 1 0 0 1 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0
 1 1 0 1 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 1 1 1 0 1 0
 0 1 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0
 0 1 1 0 0 0 0 1 1 1 1 0 1 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0
 1 1 1 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 1 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0
 0 0 1 1 1 1 0 0 0 0 1 

In [12]:
# Identify the indices of the selected features from the best_solution binary vector
selected_features = [i for i, bit in enumerate(best_solution) if bit == 1]
print("Number of selected features:", len(selected_features))

# Subset the TF-IDF feature matrix using the selected features
X_selected = X_tfidf[:, selected_features]


Number of selected features: 180


In [13]:
from sklearn.model_selection import train_test_split

# Split data (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_labels, test_size=0.2, random_state=42)


In [14]:
from sklearn.ensemble import RandomForestClassifier

final_model = RandomForestClassifier(random_state=42)
final_model.fit(X_train, y_train)


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
import numpy as np

# Predict on the test set
y_pred = final_model.predict(X_test)

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("MSE: {:.4f}".format(mse))
print("RMSE: {:.4f}".format(rmse))


Accuracy: 0.9666
Precision: 0.9668
Recall: 0.9666
F1 Score: 0.9636
MSE: 11.6340
RMSE: 3.4109
