# Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

import warnings 
warnings.simplefilter(action='ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load the dataset

In [4]:
df= pd.read_csv(r"C:\Users\Naveen\Desktop\INTERSHIP\files\DATA FILE\19.09.2024\drug200.csv")
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [5]:
df.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [7]:
df.describe()

Unnamed: 0,Age,Na_to_K
count,200.0,200.0
mean,44.315,16.084485
std,16.544315,7.223956
min,15.0,6.269
25%,31.0,10.4455
50%,45.0,13.9365
75%,58.0,19.38
max,74.0,38.247


In [8]:
df.describe(include=[object])

Unnamed: 0,Sex,BP,Cholesterol,Drug
count,200,200,200,200
unique,2,3,2,5
top,M,HIGH,HIGH,drugY
freq,104,77,103,91


In [9]:
df.shape

(200, 6)

In [10]:
df.isnull().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

# Initialize label encoder

In [12]:
le = LabelEncoder()

# Encode categorical columns

In [14]:
df['Sex'] = le.fit_transform(df['Sex'])
df['BP'] = le.fit_transform(df['BP'])
df['Cholesterol'] = le.fit_transform(df['Cholesterol'])

# Split data into features and target

In [16]:
X = df.drop('Drug', axis=1)
y = df['Drug']

# Initialize SMOTE

In [18]:
sm = SMOTE(random_state=42)

NameError: name 'SMOTE' is not defined

# Apply SMOTE to features and target

In [None]:
X_res, y_res = sm.fit_resample(X, y)

# Split data into train and test sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)

# Initialize and train the decision tree

In [None]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Make predictions

In [None]:
y_pred = dt.predict(X_test)

# Evaluate the model

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Define the parameter grid

In [None]:
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV

In [None]:
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the model

In [None]:
grid_search.fit(X_train, y_train)

# Print the best parameters

In [None]:
print("Best parameters found:", grid_search.best_params_)

# Evaluate tuned model

In [None]:
best_dt = grid_search.best_estimator_
y_pred_tuned = best_dt.predict(X_test)
print("Tuned Accuracy:", accuracy_score(y_test, y_pred_tuned))
print("Tuned Classification Report:\n", classification_report(y_test, y_pred_tuned))

In [None]:
plt.figure(figsize=(8,6))
sn.countplot(x='Sex', data=df)
plt.title('Drug')

In [None]:
num_features=[]
for i in df.select_dtypes(include=['int64','float64']).columns:
    num_features.append(i)
num_features

In [None]:
obj_features=[]
for i in df.select_dtypes(include=['object']).columns:
    obj_features.append(i)
obj_features

In [None]:
df[num_features].hist(figsize=(15, 12), bins=30, edgecolor='black')
plt.suptitle('Histograms of Numerical Features')
plt.show()

In [None]:
plt.figure(figsize=(15,10))
for i, feature in enumerate(num_features):
    plt.subplot(2, 5,i+1)
    sn.boxplot(x='Drug', y=feature, data=df)
    plt.title(f'Boxplot of {feature}')
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(12,10))
correlation_matrix = df[num_features].corr()
sn.heatmap(correlation_matrix, annot=True, )
plt.title('Correlation Heatmap')

In [None]:
for column in df.select_dtypes(include='object').columns:
    print(f'{column} unique values:')
    print(df[column].value_counts())
    print('***********')
    print('***********')

In [None]:
for col in df.columns:
    print(df[col].value_counts())
    print('****************')