# Decision Tree Demo

<hr style='border:1px solid black'>

## Import libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

<hr style='border:1px solid black'>

## Load data
Choose your data source by uncommenting one of the sections below.

### ...from local file

In [None]:
#data = pd.read_csv('data_phones.csv')

### ...from Azure
(to run this code, you must select the **Python 3.XX - SDK v2** kernel)

In [None]:
#from azure.ai.ml import MLClient
#from azure.identity import DefaultAzureCredential
#ml_client = MLClient.from_config(credential=DefaultAzureCredential())
#data_asset = ml_client.data.get('data_phones', version='1')
#data = pd.read_csv(data_asset.path)

### ...from Databricks FileStore

In [None]:
#data = pd.read_csv('/dbfs/FileStore/tables/data_phones.csv')

### ...from Databricks FileStore (option 2)

In [None]:
#data = spark.read.csv('/FileStore/tables/data_phones.csv', inferSchema=True, header=True, sep=',')
#data = data.toPandas()

### ...from Databricks FileStore (option 3)

In [None]:
#data = spark.read.format('csv') \
#  .option('inferSchema', 'false') \
#  .option('header', 'true') \
#  .option('sep', ',') \
#  .load('/FileStore/tables/data_phones.csv')
#data = data.toPandas()

### ...from Databricks Catalog

In [None]:
#%sql
#SELECT * FROM `catalog1`.`schema1`.`data_phones`;

In [None]:
#%python
#data = _sqldf.toPandas()

### ...from Databricks Catalog (option 2)

In [None]:
#data = spark.sql('SELECT * FROM catalog1.schema1.data_phones').toPandas()

### ...from Databricks Catalog (option 3)

In [None]:
#data = spark.table('catalog1.schema1.data_phones').toPandas()
#data = spark.read.table('catalog1.schema1.data_phones').toPandas()

### Check if dataset has been loaded

In [None]:
class StopExecution(Exception):
    """Raised to abort the current cell when no dataset is available."""

    def _render_traceback_(self):
        # An empty list of traceback lines leaves the front end with nothing
        # to show for this exception, so only the message printed below is
        # visible (hook name as used by IPython - confirm for other front ends).
        return []


if 'data' in locals():
    # The helper exception is only needed by this check, so drop it again.
    del StopExecution
    print('Dataset has been loaded successfully')
else:
    print('NO DATA LOADED - USE ONE OF THE SECTIONS ABOVE TO LOAD THE DATASET!')
    raise StopExecution()

<hr style='border:1px solid black'>

## Preview and prepare

### Preview the data

In [None]:
# Report the shape of the dataset before and after discarding incomplete rows.
print('Data dimensions:', data.shape)
data = data.dropna()  # removes every row that contains at least one missing value
print('Data dimensions:', data.shape)
# Last expression of the cell, so the notebook displays the first rows.
data.head()

### Split the dataset into training and testing sets

In [None]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for testing (fixed seed so the split is reproducible).
y = data['price_range']
X = data.drop(columns='price_range')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Release the names that the following cells no longer use.
del train_test_split, y, X, data

### Scale numeric values

In [None]:
# Fit the scaler on the training rows only, then apply the same transformation
# to the test rows, so the test set does not influence the scaling parameters.
scaler = StandardScaler()
# By default the scaler returns plain arrays. Rebuilding the frames with the
# original column names keeps the features identifiable downstream. A freshly
# built DataFrame already has a 0..n-1 index, so no reset_index is required.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_train.columns)
del pd, scaler, StandardScaler, X_test, X_train

<hr style='border:1px solid black'>

## Build Decision Tree

In [None]:
# Train a depth-limited decision tree on the scaled training data; the fixed
# seed keeps the fitted tree reproducible between runs.
model = DecisionTreeClassifier(random_state=42, max_depth=5)
model.fit(X_train_scaled, y_train)
# Last expression of the cell, so the notebook displays the fitted estimator.
model

<hr style='border:1px solid black'>

## Generate predictions

In [None]:
# Predict a price range for every row of the held-out test set.
y_pred = model.predict(X_test_scaled)
# None of the following cells use these names, so release them.
del X_train_scaled, X_test_scaled, y_train, DecisionTreeClassifier

<hr style='border:1px solid black'>

## Visualise the tree...

In [None]:
#
# Useful source
# https://mljar.com/blog/visualize-decision-tree/
#

### ...as a text

In [None]:
# Print the fitted tree's decision rules as indented plain text.
print(tree.export_text(model))

### ...as an image

In [None]:
# Draw the fitted tree on a large canvas so the node text stays readable.
# Nodes are labelled with the real class values. Feature names are used only
# when the model recorded them during fitting; otherwise plot_tree falls back
# to its generic feature labels.
feature_names = getattr(model, 'feature_names_in_', None)
plt.figure(figsize=(25, 20))
_ = tree.plot_tree(
    model,
    filled=True,
    feature_names=None if feature_names is None else list(feature_names),
    class_names=[str(label) for label in model.classes_],
)
# The tree module and the model are not used by any later cell.
del tree, model, feature_names

<hr style='border:1px solid black'>

## Evaluate the model

### Calculate accuracy

In [None]:
# accuracy_score expects (y_true, y_pred); this matches the argument order
# used for the confusion matrix and the classification report. The labels are
# compared by position, so the index of y_test does not need to be reset.
print('Accuracy of the model:', accuracy_score(y_test, y_pred))
del accuracy_score

### Build confusion matrix

In [None]:
# Rows of the matrix correspond to the actual classes, columns to the
# predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
del confusion_matrix
# Last expression of the cell, so the notebook displays the matrix.
cm

### Build confusion matrix heatmap

In [None]:
# Draw the confusion matrix as a heatmap with the count written in each cell.
# confusion_matrix(y_test, y_pred) puts the actual classes on the rows and the
# predicted classes on the columns. imshow draws rows along the y-axis and
# columns along the x-axis, so the y-axis is "actual" and the x-axis is
# "predicted".
n_rows, n_cols = cm.shape
plt.imshow(cm, cmap='YlGnBu')
# One tick per class, derived from the matrix size rather than hard-coded.
plt.xticks(list(range(n_cols)), fontsize=12)
plt.yticks(list(range(n_rows)), fontsize=12)
plt.xlabel('Predicted Class', fontsize=16)
plt.ylabel('Actual Class', fontsize=16)
plt.title('Confusion Matrix', fontsize=18)
for row in range(n_rows):
    for col in range(n_cols):
        # Diagonal cells hold the correctly classified counts and get their
        # own text colour.
        colour = 'white' if row == col else 'brown'
        # plt.text takes (x, y), i.e. (column, row) of the matrix.
        plt.text(col, row, cm[row, col],
                 ha='center', va='center',
                 fontsize=16, color=colour)
plt.colorbar()
plt.show()
del plt, cm, n_rows, n_cols, row, col, colour
#
# seaborn heatmap:
# sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu')

### Classification Report

In [None]:
# Summarise per-class precision, recall and F1 for the test-set predictions.
report = classification_report(y_test, y_pred)
print(report)
# Last evaluation step: release everything that is still held.
del report, classification_report, y_test, y_pred

<hr style='border:1px solid black'>

## Conclusion

In [None]:
#
# Not only did you execute this notebook in the cloud -
# you also loaded it (and its data) from the cloud
#

<hr style='border:1px solid black'>

## Advanced Task 1 (OPTIONAL)

In [None]:
#
# Find out if class 0 is the cheapest or the most
# expensive one (by using statistical measures)
#

## Advanced Task 2 (OPTIONAL)

In [None]:
#
# Downsample dataset to have exactly the same number
# of training data points in each price band, and see
# if that improves model accuracy
#