In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [21]:
# Load the wbc dataset and discard the 'Unnamed: 32' column.
wbc = pd.read_csv('Datasets/wbc.csv').drop(columns=['Unnamed: 32'])

# Feature matrix: only these two columns are used by the classifiers below.
X = wbc.loc[:, ['radius_mean', 'concave points_mean']].values

# Encode the label column (second column of the frame): 'M' -> 1, 'B' -> 0.
# A single explicit .map() is used instead of chaining
# .replace('M', 1).replace('B', 0): the chained form leaves an object-dtype
# Series and relies on pandas silently downcasting it to integers, which is
# deprecated since pandas 2.2.
labels = wbc.iloc[:, 1].map({'M': 1, 'B': 0})
if labels.isna().any():
    # .map() turns every unmapped value into NaN; fail loudly rather than
    # training on a partially encoded target.
    raise ValueError("Label column contains values other than 'M' and 'B'")
y = labels.astype(int).values


# Split into training and test set (80/20, stratified on the label so both
# splits keep the same class proportions; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

## 1) Classification Tree

In [22]:
# Build a classification tree capped at depth 6 and train it on the training
# split. fit() returns the estimator itself, so construction and fitting are
# chained into one statement.
dt = DecisionTreeClassifier(random_state=1, max_depth=6).fit(X_train, y_train)

# Label the held-out samples and show the first five predictions
y_pred = dt.predict(X_test)
print(y_pred[:5])

[0 0 0 1 0]


In [23]:
# accuracy_score: fraction of predictions that match the true labels
from sklearn.metrics import accuracy_score

# Re-predict on the held-out split with the fitted tree 'dt'
y_pred = dt.predict(X_test)

# Score the predictions against the true test labels and report to 2 decimals
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test set accuracy: {:.2f}".format(acc))

Test set accuracy: 0.89


## 2) Logistic regression vs classification tree

In [None]:
# Import LogisticRegression from sklearn.linear_model
from sklearn.linear_model import LogisticRegression

# Instantiate logreg
logreg = LogisticRegression(random_state=1)

# Fit logreg to the training set.
# It must NOT be fitted on X_test / y_test: those samples are used below to
# visualise and compare the classifiers, and 'dt' was trained on the training
# split, so fitting on the test split would leak the evaluation data and make
# the comparison unfair.
logreg.fit(X_train, y_train)

# Define a list called clfs containing the two classifiers logreg and dt
clfs = [logreg, dt]

# Review the decision regions of the two classifiers on the held-out split.
# NOTE(review): plot_labeled_decision_regions is neither defined nor imported
# in the visible part of this notebook -- presumably a course-provided helper.
# It has to be defined/imported in an earlier cell for this cell to run.
plot_labeled_decision_regions(X_test, y_test, clfs)

<img src="Datasets/images/logreg_vs_treeClassification.svg" alt="Decision regions of logistic regression vs. classification tree" width="500" height="600">

#### Notice how the decision boundary produced by logistic regression is linear while the boundaries produced by the classification tree divide the feature space into rectangular regions.

## 3) Entropy vs Gini index

In [29]:
# Test-set accuracy per split criterion, in the order of the list below:
# accuracy[0] -> entropy, accuracy[1] -> gini.
accuracy = []

# Train one depth-8 tree per criterion. The loop variable has to be passed to
# the estimator: with a hard-coded criterion='entropy' both iterations train
# the very same tree and the two accuracies are identical by construction.
# scikit-learn spells the Gini criterion 'gini' (not 'gini index').
for type_criterion in ['entropy', 'gini']:

    # Instantiate the tree with the current information criterion
    dt = DecisionTreeClassifier(max_depth=8, criterion=type_criterion, random_state=1)

    # Fit it to the training set
    dt.fit(X_train, y_train)

    # Predict test set labels
    y_pred = dt.predict(X_test)

    # Record the test accuracy (accuracy_score is imported in an earlier cell)
    accuracy.append(accuracy_score(y_test, y_pred))

# Print accuracy_entropy
print('Accuracy achieved by using entropy: ', accuracy[0])

# Print accuracy_gini
print('Accuracy achieved by using the gini index: ', accuracy[1])

Accuracy achieved by using entropy:  0.8859649122807017
Accuracy achieved by using the gini index:  0.8859649122807017


####  Notice how the two models achieve exactly the same accuracy. Most of the time, the gini index and entropy lead to the same results. The gini index is slightly faster to compute and is the default criterion used in the DecisionTreeClassifier model of scikit-learn.

## 4) DecisionTreeRegressor

In [2]:
# Read the auto dataset from the local Datasets folder
auto = pd.read_csv(filepath_or_buffer='Datasets/auto.csv')

In [3]:
# Preview the first five rows
auto.head(n=5)

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
1,9.0,304.0,193,4732,18.5,US,20.0
2,36.1,91.0,60,1800,16.4,Asia,10.0
3,18.5,250.0,98,3525,19.0,US,15.0
4,34.3,97.0,78,2188,15.8,Europe,10.0


In [7]:
# One-hot encode the non-numeric columns; drop_first=True omits the first
# level of each encoded column (see the origin_* columns in the output).
auto = pd.get_dummies(data=auto, drop_first=True)

# Show a single row to check the resulting columns
auto.head(n=1)

Unnamed: 0,mpg,displ,hp,weight,accel,size,origin_Europe,origin_US
0,18.0,250.0,88,3139,14.5,15.0,0,1


In [13]:
# Target is the 'mpg' column; every remaining column is used as a feature
y = auto["mpg"].values
X = auto.drop(columns=["mpg"]).values

# Hold out 30% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [14]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree 'dt': depth capped at 4, fixed seed.
# min_samples_leaf is passed as a float, which scikit-learn interprets as a
# fraction of the training samples that every leaf must contain.
dt = DecisionTreeRegressor(
    max_depth=4,
    min_samples_leaf=0.26,
    random_state=3,
)

# Train on the training split (last expression, so the fitted estimator is displayed)
dt.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.26, random_state=3)

In [15]:
# mean_squared_error, aliased to MSE
from sklearn.metrics import mean_squared_error as MSE

# Predict on the held-out split and score the predictions
y_pred = dt.predict(X_test)
mse_dt = MSE(y_test, y_pred)

# RMSE is the square root of the MSE
rmse_dt = mse_dt ** 0.5

# Report the test RMSE to 2 decimals
print("Test set RMSE of dt: {:.2f}".format(rmse_dt))

Test set RMSE of dt: 4.86


In [16]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of 'dt' on the training split.
# The 'neg_mean_squared_error' scorer returns negated MSEs, so the result is
# negated to obtain one positive MSE per fold. n_jobs=-1 uses all CPU cores.
MSE_CV_scores = -cross_val_score(
    estimator=dt,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

# Average the fold MSEs, then take the square root to get the CV RMSE
RMSE_CV = MSE_CV_scores.mean() ** 0.5

# Report the CV RMSE to 2 decimals
print('CV RMSE: {:.2f}'.format(RMSE_CV))

NameError: name 'cross_val_score' is not defined