# Decision Tree Learning

### Members
- [Febriawan Ghally Ar Rahman (1359111)](https://github.com/ghallyy)
- [Mgs. Tabrani (13519122)](https://github.com/mgstabrani)

### Content
1. [DecisionTreeClassifier](http://scikit-learn.org/stable/modules/tree.html)
2. [Id3Estimator](https://github.com/svaante/decision-tree-id3)
3. [K Means](https://scikit-learn.org/0.19/modules/generated/sklearn.cluster.KMeans.html)
4. [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
5. [Neural_network](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)
6. [SVM](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)

## Load Datasets

In [674]:
import pandas as pd
from sklearn import datasets
import numpy as np

### Load breast cancer dataset

In [675]:
# Load the breast cancer dataset once and reuse the returned Bunch for both
# the feature matrix and the target vector (previously the loader was called twice).
breast_cancer = datasets.load_breast_cancer()
X_breast_cancer, y_breast_cancer = breast_cancer.data, breast_cancer.target

# Display breast cancer dataframe
# NOTE(review): 'diagosis' looks like a typo of 'diagnosis'; the label is kept
# unchanged here because the saved output (and possibly other cells) use it.
feature_names = [*breast_cancer.feature_names, 'diagosis']
# Append the target as the last column next to the feature columns.
df_breast_cancer = pd.DataFrame(
    data=np.column_stack((X_breast_cancer, y_breast_cancer)),
    columns=feature_names,
)
df_breast_cancer

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,diagosis
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0.0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0.0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0.0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0.0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0.0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0.0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0.0


### Load play tennis dataset

In [676]:
# Read the play tennis CSV and drop its 'day' column in one chained expression
df_play_tennis = pd.read_csv('data/play_tennis.csv').drop(columns=['day'])

# Show the resulting play tennis dataframe
df_play_tennis

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


## Encode Categorical Data

In [677]:
# Label-encode every column of the play tennis dataframe (one fit per column)
from sklearn import preprocessing
df_play_tennis = df_play_tennis.apply(
    lambda column: preprocessing.LabelEncoder().fit_transform(column)
)

# Separate each row into its features (all but the last value) and its target (last value)
dataset_play_tennis = df_play_tennis.to_numpy()
X_play_tennis = [row[:-1] for row in dataset_play_tennis]
y_play_tennis = [row[-1] for row in dataset_play_tennis]

# Show the encoded dataframe
df_play_tennis

Unnamed: 0,outlook,temp,humidity,wind,play
0,2,1,0,1,0
1,2,1,0,0,0
2,0,1,0,1,1
3,1,2,0,1,1
4,1,0,1,1,1
5,1,0,1,0,0
6,0,0,1,0,1
7,2,2,0,1,0
8,2,0,1,1,1
9,1,2,1,1,1


## Split Datasets

In [678]:
# Split each dataset into 80% training data and 20% testing data
from sklearn.model_selection import train_test_split

# Features and labels are passed to ONE train_test_split call per dataset, so a
# single shuffle is applied to both and every row keeps its own label. The
# previous version split X and y in separate calls and stayed aligned only
# because both calls happened to use the same random_state.

# Split breast cancer dataset
(
    X_training_breast_cancer,
    X_testing_breast_cancer,
    y_training_breast_cancer,
    y_testing_breast_cancer,
) = train_test_split(X_breast_cancer, y_breast_cancer, test_size=0.2, random_state=25)

# Split play tennis dataset
(
    X_training_play_tennis,
    X_testing_play_tennis,
    y_training_play_tennis,
    y_testing_play_tennis,
) = train_test_split(X_play_tennis, y_play_tennis, test_size=0.2, random_state=25)

## Learning with Logistic Regression Algorithm

In [679]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Breast cancer

In [680]:
# Build the logistic regression model, fit it on the breast cancer training
# split, then predict labels for the testing split.
clf_breast_cancer = LogisticRegression(random_state=0, max_iter=10000)
clf_breast_cancer.fit(X_training_breast_cancer, y_training_breast_cancer)
y_predict = clf_breast_cancer.predict(X_testing_breast_cancer)

#### Metrics Evaluation

In [681]:
# Report precision, recall and f1-score for the current y_predict against the
# breast cancer testing labels.
print(
    metrics.classification_report(
        y_true=y_testing_breast_cancer,
        y_pred=y_predict,
    )
)

              precision    recall  f1-score   support

           0       0.90      0.90      0.90        39
           1       0.95      0.95      0.95        75

    accuracy                           0.93       114
   macro avg       0.92      0.92      0.92       114
weighted avg       0.93      0.93      0.93       114



### Play tennis

In [682]:
# Build a default logistic regression model, fit it on the play tennis
# training split, then predict labels for the testing split.
clf_play_tennis = LogisticRegression()
clf_play_tennis.fit(X_training_play_tennis, y_training_play_tennis)
y_predict = clf_play_tennis.predict(X_testing_play_tennis)


#### Metrics Evaluation

In [683]:
# Report precision, recall and f1-score for the current y_predict against the
# play tennis testing labels.
print(
    metrics.classification_report(
        y_true=y_testing_play_tennis,
        y_pred=y_predict,
    )
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



## Learning with Neural Network Algorithm

In [684]:
from sklearn.neural_network import MLPClassifier

### Breast cancer

In [685]:
# Build the multi-layer perceptron classifier, fit it on the breast cancer
# training split, then predict labels for the testing split.
clf_breast_cancer = MLPClassifier(random_state=1, max_iter=300)
clf_breast_cancer.fit(X_training_breast_cancer, y_training_breast_cancer)
y_predict = clf_breast_cancer.predict(X_testing_breast_cancer)

#### Metrics Evaluation

In [686]:
# Report precision, recall and f1-score for the current y_predict against the
# breast cancer testing labels.
print(
    metrics.classification_report(
        y_true=y_testing_breast_cancer,
        y_pred=y_predict,
    )
)

              precision    recall  f1-score   support

           0       0.97      0.90      0.93        39
           1       0.95      0.99      0.97        75

    accuracy                           0.96       114
   macro avg       0.96      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114



### Play tennis

In [687]:
# Build the multi-layer perceptron classifier, fit it on the play tennis
# training split, then predict labels for the testing split.
clf_play_tennis = MLPClassifier(random_state=1, max_iter=1000)
clf_play_tennis.fit(X_training_play_tennis, y_training_play_tennis)
y_predict = clf_play_tennis.predict(X_testing_play_tennis)

#### Metrics Evaluation

In [688]:
# Report precision, recall and f1-score for the current y_predict against the
# play tennis testing labels.
print(
    metrics.classification_report(
        y_true=y_testing_play_tennis,
        y_pred=y_predict,
    )
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



## Learning with SVM Algorithm

In [689]:
from sklearn.svm import SVC

### Breast cancer

In [690]:
# Fit a default-parameter SVC on the breast cancer training split and predict
# labels for the testing split.
# The model is stored in clf_breast_cancer: it was previously assigned to
# clf_play_tennis, which mislabelled a breast cancer model and left
# clf_breast_cancer pointing at the classifier from an earlier section.
clf_breast_cancer = SVC().fit(X_training_breast_cancer, y_training_breast_cancer)
y_predict = clf_breast_cancer.predict(X_testing_breast_cancer)

#### Metrics Evaluation

In [691]:
# Report precision, recall and f1-score for the current y_predict against the
# breast cancer testing labels.
print(
    metrics.classification_report(
        y_true=y_testing_breast_cancer,
        y_pred=y_predict,
    )
)

              precision    recall  f1-score   support

           0       0.97      0.77      0.86        39
           1       0.89      0.99      0.94        75

    accuracy                           0.91       114
   macro avg       0.93      0.88      0.90       114
weighted avg       0.92      0.91      0.91       114



### Play tennis

In [692]:
# Build a default-parameter SVC, fit it on the play tennis training split,
# then predict labels for the testing split.
clf_play_tennis = SVC()
clf_play_tennis.fit(X_training_play_tennis, y_training_play_tennis)
y_predict = clf_play_tennis.predict(X_testing_play_tennis)

#### Metrics Evaluation

In [693]:
# Report precision, recall and f1-score for the current y_predict against the
# play tennis testing labels.
print(
    metrics.classification_report(
        y_true=y_testing_play_tennis,
        y_pred=y_predict,
    )
)

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

