You are a data scientist working for a healthcare company, and you have been tasked with creating a
decision tree to help identify patients with diabetes based on a set of clinical variables. You have been
given a dataset (diabetes.csv) with the following variables:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the diabetes dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='diabetes.csv')


In [None]:
# Show column names, non-null counts and dtypes.
# DataFrame.info() writes its report directly and returns None, so it must
# not be wrapped in print() (that would emit an extra "None" line).
df.info()


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column
summary = df.describe()
print(summary)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the dataset from diabetes.csv into df
df = pd.read_csv("diabetes.csv")

# Report the number of missing values in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Visual outlier check: one box plot per column except the last one,
# laid out on a 3x3 grid of axes
fig, axs = plt.subplots(3, 3, figsize=(15, 15))
axs = axs.flatten()

plotted_columns = df.columns[:-1]
for i, col in enumerate(plotted_columns):
    panel = axs[i]
    panel.boxplot(df[col])
    panel.set_title(col)

plt.show()


In [None]:
# Remove outliers using the IQR method.
# Only the predictor columns are screened. 'Outcome' is the class label, not a
# measurement: a binary label can have an IQR of 0, in which case every row of
# the minority class would be flagged as an "outlier" and dropped.
feature_cols = [col for col in df.columns if col != 'Outcome']
features = df[feature_cols]

Q1 = features.quantile(0.25)
Q3 = features.quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: anything further than 1.5 * IQR outside the quartiles
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Drop a row if any of its predictor values lies outside the fences
is_outlier = ((features < lower_fence) | (features > upper_fence)).any(axis=1)
df = df[~is_outlier]


In [None]:
# Transform the 'Pregnancies' count into a categorical variable.
# pd.cut builds right-closed intervals, so these edges give
# (-1, 0], (0, 3], (3, 6], (6, 9], (9, inf] and every label matches the
# counts it covers (0 -> '0', 1..3 -> '1-3', ..., 10 and above -> '10+').
# Starting the edges at 0 would put a count of 1 into the '0' category.
bins = [-1, 0, 3, 6, 9, np.inf]
labels = ['0', '1-3', '4-6', '7-9', '10+']
df['Pregnancies'] = pd.cut(df['Pregnancies'], bins=bins, labels=labels)

# Convert the categorical variable into dummy (one-hot) columns named Preg_<label>
df = pd.get_dummies(df, columns=['Pregnancies'], prefix=['Preg'])


In [None]:
# Z-score the numerical columns: subtract the column mean, divide by the
# column standard deviation.
# NOTE(review): the mean and std are taken over the whole DataFrame, i.e.
# before the train/test split performed later in this file.
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
col_means = df[cols].mean()
col_stds = df[cols].std()
df[cols] = (df[cols] - col_means) / col_stds


In [None]:
# Target vector: the 'Outcome' column
y = df['Outcome']

# Feature matrix: every column except 'Outcome'
X = df.drop(columns=['Outcome'])


In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into a training set (70%) and a test set (30%).
# stratify=y keeps the class proportions of the target the same in both
# parts, so the test metrics are not skewed by an unlucky random split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Base (unfitted) decision tree classifier; random_state is fixed at 42
tree_seed = 42
tree = DecisionTreeClassifier(random_state=tree_seed)


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values explored by the search
param_grid = dict(
    max_depth=[3, 5, 7, 9, 11],
    min_samples_split=[2, 5, 10, 20, 30],
    min_samples_leaf=[1, 2, 3, 4, 5],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the training data
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best hyperparameters: {best_params}")
print(f"Best accuracy score: {best_score:.3f}")


In [None]:
# Build the final decision tree from the hyperparameters selected by the
# grid search (same fixed random_state) and fit it on the training set
tree = DecisionTreeClassifier(random_state=42, **best_params)
tree.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score

# Test-set predictions: hard class labels, and the predicted probability of
# the second class (column 1 of predict_proba), computed once and reused
y_pred = tree.predict(X_test)
positive_scores = tree.predict_proba(X_test)[:, 1]

# Threshold-based classification metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix cells; rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm

# ROC curve points and area under the curve from the probability scores
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
auc = roc_auc_score(y_test, positive_scores)

# Report all metrics
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 score: {f1:.3f}")
print(f"Confusion matrix: [[{tn} {fp}], [{fn} {tp}]]")
print(f"AUC score: {auc:.3f}")


In [None]:
import matplotlib.pyplot as plt

# ROC curve of the fitted tree, with the dashed diagonal as a visual
# reference and the AUC shown in the legend
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

curve_label = f"AUC = {auc:.3f}"
plt.plot(fpr, tpr, label=curve_label)
plt.plot([0, 1], [0, 1], 'k--')

plt.legend()
plt.show()


In [None]:
# Decision tree spelled out with explicit hyperparameters.
# The `min_impurity_split` and `presort` keywords were dropped from this call:
# current scikit-learn releases no longer accept them and raise TypeError.
# NOTE(review): max_depth=4 / min_samples_leaf=10 are not values from the
# grid searched earlier in this file — this looks like pasted cell output;
# confirm whether it is still needed.
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0,
                       random_state=None, splitter='best')
