In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
)
from sklearn.feature_selection import SelectKBest, chi2

1a: Read the dataset and display the first 5 rows


In [22]:
dp = pd.read_csv("dataR2.csv")
print(dp.head())

   Age        BMI  Glucose  Insulin      HOMA   Leptin  Adiponectin  Resistin  \
0   48  23.500000       70    2.707  0.467409   8.8071     9.702400   7.99585   
1   83  20.690495       92    3.115  0.706897   8.8438     5.429285   4.06405   
2   82  23.124670       91    4.498  1.009651  17.9393    22.432040   9.27715   
3   68  21.367521       77    3.226  0.612725   9.8827     7.169560  12.76600   
4   86  21.111111       92    3.549  0.805386   6.6994     4.819240  10.57635   

     MCP.1  Classification  
0  417.114               1  
1  468.786               1  
2  554.697               1  
3  928.220               1  
4  773.920               1  


1b: Check for and handle missing values


In [23]:
# 1b: Check for and handle missing values
print(dp.isnull().sum())
dp = dp.dropna()
print(dp.isnull().sum())

Age               0
BMI               0
Glucose           0
Insulin           0
HOMA              0
Leptin            0
Adiponectin       0
Resistin          0
MCP.1             0
Classification    0
dtype: int64
Age               0
BMI               0
Glucose           0
Insulin           0
HOMA              0
Leptin            0
Adiponectin       0
Resistin          0
MCP.1             0
Classification    0
dtype: int64


1c: Convert labels to '0' (Healthy) and '1' (Patients)


In [24]:
dp["Classification"] = dp["Classification"].map({1: "Healthy", 2: "Patients"})

2: Split the dataset into 80% training and 20% testing


In [25]:
X = dp.drop("Classification", axis=1)
Y = dp["Classification"]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

3. Train DT classifier  using built-in function on the training set  with default parameters (sklearn.tree.DecisionTreeClassifier)

In [26]:
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

4. Evaluate the train model using testset with the help of confusion matrix, Accuracy, Precision and Recall


In [27]:
y_pred = dt_classifier.predict(X_test)
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, average="weighted"))
print("Recall: ", recall_score(y_test, y_pred, average="weighted"))

Confusion Matrix: 
 [[ 4  2]
 [ 6 12]]
Accuracy:  0.6666666666666666
Precision:  0.7428571428571429
Recall:  0.6666666666666666


5. Set the criteria as entropy and log_loss and train the model and evaluate it on testset.


In [28]:
dt_classifier = DecisionTreeClassifier(criterion="entropy", random_state=42)
dt_classifier.fit(X_train, y_train)

6. Parameter Tuning: a. Try with max_depth as [10, 100] b. Min_samples_split as [4, 6,8] c. max_features {“auto”, “sqrt”, “log2”}


In [29]:
from sklearn.model_selection import GridSearchCV

# define the parameter grid
param_grid = {
    "max_depth": [10, 100],
    "min_samples_split": [4, 6, 8],
    "max_features": ["auto", "sqrt", "log2"],
}

dt_classifier = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=dt_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2
)

grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


Best parameters:  {'max_depth': 10, 'max_features': 'auto', 'min_samples_split': 8}
Best score:  0.62046783625731




7. Compare the results and find the best suitable model.


In [30]:
# create a list of all the models
models = [
    {"name": "Decision Tree (gini)", "model": DecisionTreeClassifier(random_state=42)},
    {
        "name": "Decision Tree (entropy)",
        "model": DecisionTreeClassifier(criterion="entropy", random_state=42),
    },
    {"name": "Decision Tree (tuned)", "model": grid_search.best_estimator_},
]

# train and evaluate each model
for model in models:
    # train the model
    model["model"].fit(X_train, y_train)

    # evaluate the model
    y_pred = model["model"].predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")

    # print the results
    print(model["name"])
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print()

Decision Tree (gini)
Accuracy:  0.6666666666666666
Precision:  0.7428571428571429
Recall:  0.6666666666666666

Decision Tree (entropy)
Accuracy:  0.8333333333333334
Precision:  0.8333333333333334
Recall:  0.8333333333333334

Decision Tree (tuned)
Accuracy:  0.5
Precision:  0.5857142857142857
Recall:  0.5



