# Decision Tree Classifier

Berikut ini merupakan implementasi machine learning pada dataset playtennis menggunakan library scikit-learn.

---
## Setup Library dan Dataset

In [1]:
import pandas as pd
from sklearn import tree
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer(as_frame = True)
full_data_X, full_data_Y = load_breast_cancer(return_X_y = True, as_frame=True)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(full_data_X, full_data_Y, 
                                                                train_size=0.8, test_size=0.2)

# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

X_train_full.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
345,10.26,14.71,66.2,321.6,0.09882,0.09159,0.03581,0.02037,0.1633,0.07005,...,10.88,19.48,70.89,357.1,0.136,0.1636,0.07162,0.04074,0.2434,0.08488
144,10.75,14.97,68.26,355.3,0.07793,0.05139,0.02251,0.007875,0.1399,0.05688,...,11.95,20.72,77.79,441.2,0.1076,0.1223,0.09755,0.03413,0.23,0.06769
275,11.89,17.36,76.2,435.6,0.1225,0.0721,0.05929,0.07404,0.2015,0.05875,...,12.4,18.99,79.46,472.4,0.1359,0.08368,0.07153,0.08946,0.222,0.06033
276,11.33,14.16,71.79,396.6,0.09379,0.03872,0.001487,0.003333,0.1954,0.05821,...,12.2,18.99,77.37,458.0,0.1259,0.07348,0.004955,0.01111,0.2758,0.06386
194,14.86,23.21,100.4,671.4,0.1044,0.198,0.1697,0.08878,0.1737,0.06672,...,16.08,27.78,118.6,784.7,0.1316,0.4648,0.4589,0.1727,0.3,0.08701


---
## Preprocessing and Pipelining

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing for numerical data
numerical_transformer = SimpleImputer()

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

---
## Create model

In [3]:
from sklearn import tree
import six
import sys
sys.modules['sklearn.externals.six'] = six
from id3 import Id3Estimator
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Define model
modelDtl = tree.DecisionTreeClassifier(random_state=0)
modelID3 = Id3Estimator()
kmeans = KMeans(n_clusters=2, random_state=0)
modelLogistic = LogisticRegression(random_state=0, max_iter=100)
modelNeural = MLPClassifier(random_state=0, max_iter=300)
modelSVM = make_pipeline(StandardScaler(), SVC(gamma='auto', random_state=0))

# Bundle preprocessing and modeling code in a pipeline
dtl = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', modelDtl)
                     ])

modID3 = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', modelID3)
                     ])

modelKmeans = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', kmeans)
                     ])

modlogistic = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', modelLogistic)
                     ])

modNeural = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', modelNeural)
                     ])

modSVM = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', modelSVM)
                     ])

---
## Fitting and predict

### Decision Tree Learning

In [4]:
import graphviz
# Preprocessing of training data, fit model 
dtl.fit(X_train_full, y_train)
#r = tree.export_text(model, feature_names = [d for d in data.feature_names])
dot_data = tree.export_graphviz(modelDtl, 
                  feature_names=dtl.named_steps['preprocessor'].transformers_[1][1]\
   .named_steps['onehot'].get_feature_names(X_train_full.columns))
graph = graphviz.Source(dot_data)

# Preprocessing of validation data, get predictions
predsDtl = dtl.predict(X_valid_full)
graph

NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

### ID3 Estimator

In [None]:
from id3 import export_graphviz

# Preprocessing of training data, fit model 
modID3.fit(X_train_full, y_train)

#r = tree.export_text(model, feature_names = [d for d in data.feature_names])
dot_treeID3 = export_graphviz(modelID3.tree_, 'graph.dot', 
                  feature_names=dtl.named_steps['preprocessor'].transformers_[1][1]\
   .named_steps['onehot'].get_feature_names(X_train_full.columns), 
                  )
with open("graph.dot") as f:
    dot_graphID3 = f.read()
graphID3 = graphviz.Source(dot_graphID3)

# Preprocessing of validation data, get predictions
predsID3 = modID3.predict(X_valid_full)

graphID3

### K-Means

In [None]:
modelKmeans.fit(X_train_full, y_train)

predsKmeans = ['Yes' if item==1 else 'No' for item in modelKmeans.predict(X_valid_full)]
predsKmeans

### Logistic Regression

In [None]:
modlogistic.fit(X_train_full, y_train)

predsLogistic = modlogistic.predict(X_valid_full)
predsLogistic

### Neural Network

In [None]:
modNeural.fit(X_train_full, y_train)
predsNeural = modNeural.predict(X_valid_full)
predsNeural

### SVM

In [None]:
modSVM.fit(X_train_full, y_train)
predsSVM = modSVM.predict(X_valid_full)
predsSVM

---
## Accuracy and F1 Score

In [None]:
from sklearn.metrics import accuracy_score, f1_score

accuracy = [accuracy_score(y_valid, predsDtl),accuracy_score(y_valid, predsID3),accuracy_score(y_valid, predsKmeans),accuracy_score(y_valid, predsLogistic),accuracy_score(y_valid, predsNeural),accuracy_score(y_valid, predsSVM)]
f1 = [f1_score(y_valid, predsDtl, pos_label='Yes'), f1_score(y_valid, predsID3, pos_label='Yes'), f1_score(y_valid, predsKmeans, pos_label='Yes'), f1_score(y_valid, predsLogistic, pos_label='Yes'), f1_score(y_valid, predsNeural, pos_label='Yes'), f1_score(y_valid, predsSVM, pos_label='Yes')]
score_data = {'accuracy': accuracy, 'f1': f1}
score = pd.DataFrame(data = score_data, index=['Decision Tree Learning', 'ID3 Estimator', 'K-Means', 'Logistic Regression', 'Neural Network', 'SVM'])
score