# Decision Tree Classifier

Berikut ini merupakan implementasi algoritma Decision Tree Classifier menggunakan library scikit-learn.

---
## Setup Library dan Dataset

In [88]:
import pandas as pd
from sklearn import tree
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset once. With as_frame=True the returned Bunch
# exposes the features as a DataFrame (`data.data`) and the labels as a
# Series (`data.target`); `data` itself is kept because later cells read
# `data.feature_names`.
data = load_breast_cancer(as_frame=True)
full_data_X, full_data_Y = data.data, data.target

# Hold out 20% of the rows for validation. A fixed random_state makes the
# split -- and therefore every score reported below -- reproducible on
# Restart & Run All.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    full_data_X, full_data_Y,
    train_size=0.8, test_size=0.2,
    random_state=0)

# Select categorical columns: every column stored with the "object" dtype
# (no cardinality filter is applied).
categorical_cols = X_train_full.select_dtypes(include=["object"]).columns.tolist()

# Select numerical columns: int64 / float64 dtypes only.
numerical_cols = X_train_full.select_dtypes(include=["int64", "float64"]).columns.tolist()

X_train_full.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
291,14.96,19.1,97.03,687.3,0.08992,0.09823,0.0594,0.04819,0.1879,0.05852,...,16.25,26.19,109.1,809.8,0.1313,0.303,0.1804,0.1489,0.2962,0.08472
153,11.15,13.08,70.87,381.9,0.09754,0.05113,0.01982,0.01786,0.183,0.06105,...,11.99,16.3,76.25,440.8,0.1341,0.08971,0.07116,0.05506,0.2859,0.06772
490,12.25,22.44,78.18,466.5,0.08192,0.052,0.01714,0.01261,0.1544,0.05976,...,14.17,31.99,92.74,622.9,0.1256,0.1804,0.123,0.06335,0.31,0.08203
386,12.21,14.09,78.78,462.0,0.08108,0.07823,0.06839,0.02534,0.1646,0.06154,...,13.13,19.29,87.65,529.9,0.1026,0.2431,0.3076,0.0914,0.2677,0.08824
17,16.13,20.68,108.1,798.8,0.117,0.2022,0.1722,0.1028,0.2164,0.07356,...,20.96,31.48,136.8,1315.0,0.1789,0.4233,0.4784,0.2073,0.3706,0.1142


---
## Preprocessing and Pipelining

In [89]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing for numerical data: fill missing values with the column mean
# (this is SimpleImputer's default strategy, spelled out for clarity).
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data.
# - The imputer must use 'most_frequent': the default 'mean' strategy raises
#   on non-numeric (object/string) columns.
# - handle_unknown='ignore' encodes categories first seen at predict time as
#   all-zeros instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data. Each transformer
# is applied only to the columns listed for it; output columns follow the
# order of the transformers list (numerical first, then one-hot categorical).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

---
## Create model

In [90]:
from sklearn import tree
# Define model. random_state is fixed because the tree builder permutes the
# candidate features randomly at each split (even with the default "best"
# splitter), so an unseeded tree can differ between runs on the same data.
model = tree.DecisionTreeClassifier(random_state=0)

# Bundle preprocessing and modeling code in a pipeline so that fit/predict
# apply the same preprocessing before reaching the classifier.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

---
## Fitting and Prediction

In [93]:
# Run the preprocessing steps on the training data and fit the decision tree.
clf.fit(X_train_full, y_train)

# Apply the same (already fitted) preprocessing to the validation data and
# collect the predicted labels for scoring.
preds = clf.predict(X_valid_full)

# Dump the learned decision rules as indented text, labelling each split
# with the dataset's feature names.
feature_labels = list(data.feature_names)
r = tree.export_text(model, feature_names=feature_labels)
print(r)

|--- worst area <= 874.85
|   |--- worst concave points <= 0.16
|   |   |--- worst concave points <= 0.14
|   |   |   |--- area error <= 48.70
|   |   |   |   |--- worst texture <= 29.23
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- worst texture >  29.23
|   |   |   |   |   |--- fractal dimension error <= 0.00
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- fractal dimension error >  0.00
|   |   |   |   |   |   |--- worst texture <= 29.35
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- worst texture >  29.35
|   |   |   |   |   |   |   |--- perimeter error <= 1.47
|   |   |   |   |   |   |   |   |--- radius error <= 0.22
|   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- radius error >  0.22
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- perimeter error >  1.47
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |--- area error >  48.70
|   |   |   |   |--- are

---
## Accuracy and F1 Score

In [92]:
from sklearn.metrics import accuracy_score, f1_score

# Score the validation predictions against the true labels.
accuracy = accuracy_score(y_valid, preds)
f1 = f1_score(y_valid, preds)

# Collect both metrics in a one-row table (row label 'score') for display.
score_data = dict(accuracy=accuracy, f1=f1)
score = pd.DataFrame(score_data, index=['score'])
score

Unnamed: 0,accuracy,f1
score,0.894737,0.910448
