In this exercise you will build a decision tree model for classification

In [1]:
import os
import tarfile
from six.moves import urllib
import numpy as np

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [2]:
fetch_housing_data()

In [3]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [4]:
housing = load_housing_data()

### Fix the categories in the categorical variable

In [5]:
d = {'<1H OCEAN':'LESS_1H_OCEAN', 'INLAND':'INLAND', 'ISLAND':'ISLAND', 'NEAR BAY':'NEAR_BAY', 'NEAR OCEAN':'NEAR_OCEAN'}
housing['ocean_proximity'] = housing['ocean_proximity'].map(lambda s: d[s])

### Add 2 more features

In [6]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["population_per_household"]=housing["population"]/housing["households"]

### Fix missing data

In [7]:
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median, inplace=True) 

### Create dummy variables based on the categorical variable

In [8]:
one_hot = pd.get_dummies(housing['ocean_proximity'])
housing = housing.drop('ocean_proximity', axis=1)
housing = housing.join(one_hot)

In [9]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 16 columns):
longitude                   20640 non-null float64
latitude                    20640 non-null float64
housing_median_age          20640 non-null float64
total_rooms                 20640 non-null float64
total_bedrooms              20640 non-null float64
population                  20640 non-null float64
households                  20640 non-null float64
median_income               20640 non-null float64
median_house_value          20640 non-null float64
rooms_per_household         20640 non-null float64
population_per_household    20640 non-null float64
INLAND                      20640 non-null uint8
ISLAND                      20640 non-null uint8
LESS_1H_OCEAN               20640 non-null uint8
NEAR_BAY                    20640 non-null uint8
NEAR_OCEAN                  20640 non-null uint8
dtypes: float64(11), uint8(5)
memory usage: 1.8 MB


In [10]:
housing.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,population_per_household,INLAND,ISLAND,LESS_1H_OCEAN,NEAR_BAY,NEAR_OCEAN
20410,-118.86,34.19,26.0,3135.0,480.0,1474.0,458.0,6.1949,243500.0,6.844978,3.218341,0,0,1,0,0
16490,-120.97,38.0,27.0,1683.0,288.0,873.0,258.0,4.7069,176900.0,6.523256,3.383721,1,0,0,0,0
14137,-117.05,32.74,34.0,2178.0,455.0,1193.0,446.0,3.1719,115300.0,4.883408,2.674888,0,0,0,0,1
4880,-118.25,34.03,52.0,1274.0,418.0,1655.0,368.0,2.1905,124000.0,3.461957,4.497283,0,0,1,0,0
20321,-119.16,34.23,26.0,5444.0,1293.0,3700.0,1158.0,2.7556,213200.0,4.701209,3.195164,0,0,0,0,1


### Partition into train and test

Use train_test_split from sklearn.model_selection to partition the dataset into 70% for training and 30% for testing.

You can use the 70% for training set as both training and validation by using cross-validation.

In [11]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.3, random_state=42)

### Features

In [12]:
target = 'median_house_value'
features = list(train_set.columns)
features = [f for f in features if f!=target]

In [13]:
X_tr = train_set[features]
y_tr = train_set[[target]]

X_te = test_set[features]
y_te = test_set[[target]]

In [14]:
y_tr_b = 1*np.ravel(y_tr>=179700.0)
y_te_b = 1*np.ravel(y_te>=179700.0)
print(y_tr_b)

[1 0 1 ... 1 1 1]


#### 1) Use grid search with cross-validation (with the help of the GridSearchCV class) to find good hyperparameter values (for parameters 'max_leaf_nodes', 'min_samples_split', 'max_depth') for a DecisionTreeClassifier. You should get around 87% average accuracy across the n-folds.

### Use CV=10 in GridSearchDV

In [15]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

dclass = DecisionTreeClassifier()


param_grid = {"max_depth": range(1,10), "min_samples_split": range(2, 10),
              "max_leaf_nodes": range(2,49)}

dclass_cv = GridSearchCV(dclass, param_grid, cv=10)
print(dclass)
dclass_cv.fit(X_tr, y_tr_b)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': range(1, 10),
                         'max_leaf_nodes': range(2, 49),
                         '

In [16]:
print("Best hyperparameter values: {}".format(dclass_cv.best_params_))
print("Best score is {}".format(dclass_cv.best_score_))

Best hyperparameter values: {'max_depth': 9, 'max_leaf_nodes': 48, 'min_samples_split': 2}
Best score is 0.8610880398671097


In [17]:
best_score = dclass_cv.best_estimator_
best_score

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
                       max_features=None, max_leaf_nodes=48,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [18]:
from sklearn.metrics import accuracy_score

y_pred = dclass_cv.predict(X_tr)
accuracy_score(y_tr_b, y_pred)

0.8761074197120708

#### 2) Measure the performance of your best model on the test set

In [19]:
dclass_test = dclass_cv.fit(X_te,y_te_b)
dclass_test

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': range(1, 10),
                         'max_leaf_nodes': range(2, 49),
                         '

In [20]:
print("Best hyperparameter values: {}".format(dclass_test.best_params_))
print("Best score : {}".format(dclass_test.best_score_))

Best hyperparameter values: {'max_depth': 8, 'max_leaf_nodes': 26, 'min_samples_split': 2}
Best score : 0.8488372093023255


In [21]:
from sklearn.metrics import accuracy_score

y_pred_te = dclass_test.predict(X_te)
accuracy_score(y_te_b, y_pred_te)

0.8670865633074936

In [22]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_te_b, y_pred_te))
print(classification_report(y_te_b, y_pred_te))

[[2684  384]
 [ 439 2685]]
              precision    recall  f1-score   support

           0       0.86      0.87      0.87      3068
           1       0.87      0.86      0.87      3124

    accuracy                           0.87      6192
   macro avg       0.87      0.87      0.87      6192
weighted avg       0.87      0.87      0.87      6192



Average Accuracy across n folds is 0.876. 
The best hyperparameter values are : 
    'max_depth': 9, 
    'max_leaf_nodes': 48,
    'min_samples_split': 2

 we have measured the performance of our best model using gridsearch on the test set and from the confusion matrix the accuracy is shown that it has 87%.

### Submit your notebook

Submit your solution on Canvas