In [51]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mutual_info_score, accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Data preprocessing

In [54]:
# Read the housing dataset from disk and preview its first two rows
housing_csv = "Data/housing.csv"
df = pd.read_csv(housing_csv)
df.head(n=2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY


In [53]:
# Preprocessing pipeline: filter rows, impute, log-transform the target,
# split into train/validation/test, and one-hot encode the features.
#
# NOTE: this cell overwrites `df` and log-transforms the target, so it is not
# idempotent. Re-run the loading cell before re-running this one; otherwise
# the log transform is applied a second time.
TARGET = 'median_house_value'

# Keep only '<1H OCEAN' and 'INLAND' from ocean_proximity.
# .copy() yields an independent frame, so the assignments below do not write
# into a slice of the loaded data (avoids SettingWithCopyWarning).
df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])].copy()

# Report the columns that actually contain missing values (selected by count,
# not by column position), then fill them with 0.
missing_counts = df.isnull().sum()
print('Missing values\n', missing_counts[missing_counts > 0])
df = df.fillna(0)

# Log transform to median_house_value
df[TARGET] = np.log1p(df[TARGET])

# Train/Validation/Test split: 60% / 20% / 20% of the filtered data
# (0.75 of the 80% "large" training set gives the 60% training set).
df_train_large, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_large, train_size=0.75, random_state=1)

# Target vectors (log-transformed house values) for each split.
y_train = df_train[TARGET].to_numpy()
y_train_large = df_train_large[TARGET].to_numpy()
y_val = df_val[TARGET].to_numpy()
y_test = df_test[TARGET].to_numpy()


def to_feature_records(frame):
    """Return `frame` as a list of row dicts with the target column removed.

    The target must not appear among the features; otherwise any model
    trained on the resulting matrix can read the answer directly.
    """
    return frame.drop(columns=[TARGET]).to_dict(orient='records')


# Convert DataFrames to dictionary records (features only; the df_* frames
# themselves still contain the target column).
train_dict = to_feature_records(df_train)
train_large_dict = to_feature_records(df_train_large)
val_dict = to_feature_records(df_val)
test_dict = to_feature_records(df_test)

dv = DictVectorizer(sparse=True)

# Fit on the training split only, then apply the same encoding to the others.
X_train = dv.fit_transform(train_dict)
X_train_large = dv.transform(train_large_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)


Missing values
 total_bedrooms    157
dtype: int64


# Decision Trees

Consider a hypothetical dataset with multiple features $\mathbf{X}_{1},\cdots, \mathbf{X}_{d}$ and a target variable $\mathbf{Y}$ as shown:

$$
\left( \begin{array}{c|ccc|c}
\text{Instance}    &\mathbf{X}_{1}&\cdots & \mathbf{X}_{d}  & \mathbf{Y}\\
\hline
\mathbf{x}_{1} & x_{11}& \cdots&x_{1d}&y_1 \\
\vdots&\vdots&\ddots&\vdots&\vdots\\
\mathbf{x}_{n}&x_{n1}&\cdots&x_{nd}&y_n
\end{array} \right).$$

Here, each row vector is an instance $ \mathbf{x}_i = ( x_{i1}, \ldots, x_{id}) $ of the dataset with $d$ values. The dataset is separated into a feature matrix $\mathbf{X}$ and a target vector $\mathbf{Y}$:

$$\mathbf{X}=
\left( \begin{array}{ccc}
x_{11}& \cdots&x_{1d} \\
\vdots&\ddots&\vdots\\
x_{n1}&\cdots&x_{nd}
\end{array} \right) ~~~ \text{and} ~~~
\mathbf{Y} = \left( \begin{array}{c}
y_1\\
\vdots\\
y_n
\end{array} \right)
$$

The target vector $\mathbf{Y}$ consists of non-binary (multi-class) values. Each element $ y_i $ in $\mathbf{Y}$ stands for one of the $k$ potential class labels, which are captured by the set $ \mathcal{Y} = \{c_1, c_2, \ldots, c_k\} $. These classes induce a partition of the dataset: the instances $\mathbf{x}_i$ are grouped according to their class label $c_j$.

A decision tree is a recursive model that employs partition-based methods to predict the class $\hat{y}_i$ for each instance $\mathbf{x}_i$. The process starts by splitting the dataset into two partitions. These partitions are further divided recursively, until reaching a state where the majority of instances $\mathbf{x}_i$ within a partition belong to the same class label $c_j$.

One important partition-based method employed in most decision tree models, like CART, is the axis-parallel hyperplane. This approach is commonly used in high-dimensional spaces represented by the dataset's features. The term "hyperplane" refers to the generalization of a geometric plane in a space with more than three dimensions.

The mathematical formulation for such a hyperplane is given by the condition:

$$h(\mathbf{x}) = \mathbf{x} \cdot \mathbf{w} + b\leq 0$$

Here, $\mathbf{x}$ can be any instance from the dataset. The weight vector $\mathbf{w}$ is restricted a priori to one of the standard basis vectors $\{\mathbf{e}_1,\cdots,\mathbf{e}_j,\cdots, \mathbf{e}_d\}$, where $\mathbf{e}_j$ has a value of 1 for the $j$-th dimension and 0 for all other dimensions. This implies that the weight vector makes the hyperplane perpendicular to one coordinate axis (and hence parallel to all the others), while the bias term translates it along that axis. The standard basis vectors of a $d$-dimensional space are given by:


$$ 
\mathbf{e}_1 = \left( \begin{array}{c}
1\\
\vdots\\
0\\
\vdots\\
0
\end{array} \right),~
\mathbf{e}_j = \left( \begin{array}{c}
0\\
\vdots\\
1\\
\vdots\\
0
\end{array} \right),~~
\mathbf{e}_d = \left( \begin{array}{c}
0\\
\vdots\\
0\\
\vdots\\
1
\end{array} \right)
$$


On the other hand, the inequality $h(\mathbf{x}) \leq 0$ serves a particular purpose: it defines a half-space. Any instance $\mathbf{x}$ for which $h(\mathbf{x}) \leq 0$ lies on one side of the hyperplane, and any instance for which $h(\mathbf{x}) > 0$ lies on the other side. In this way, the hyperplane acts as a decision boundary that partitions the dataset into two partitions based on the sign of $h(\mathbf{x})$.

For a given standard basis vector chosen as the weight $\mathbf{w} = \mathbf{e}_j$, the decision condition $h(\mathbf{x})$ for some instance $\mathbf{x}_i$ is represented by:

$$h(\mathbf{x}_i) = \mathbf{e}_j \cdot \mathbf{x}_i  + b \leq 0$$

which simplifies to:

$$x_{ij} \leq v$$

where $v = -b$ is a specific value within the domain of the feature $\mathbf{X}_j$. The split condition for the $j$-th feature therefore compares the $j$-th element $x_{ij}$ of the row vector $\mathbf{x}_i$ against the threshold $v$. The optimal offset $b$ is chosen to minimize a particular criterion, such as a loss function, for the partitioned datasets.

Upon applying this decision boundary, the dataset $\mathcal{D}$ is split into two distinct partitions: $\mathcal{D}_Y$ and $\mathcal{D}_N$. In this partitioning, $\mathcal{D}_Y$ consists of those instances that satisfy the decision boundary $x_{ij} \leq v$, while $\mathcal{D}_N$ comprises those that do not satisfy the decision boundary. Thus, for each instance $\mathbf{x}_i$:

If it meets the condition $x_{ij} \leq v$, it is placed in

$$\mathcal{D}_Y = \{\mathbf{x}_i| x_{ij} \leq  v\}$$

otherwise,

$$\mathcal{D}_N = \{\mathbf{x}_i| x_{ij} >  v\}$$

Here, $v$ is a chosen threshold that delineates the two partitions. All instances $\mathbf{x}_i $ for which the $j$-th feature $x_{ij} $ is less than or equal to $v $ will be allocated to the partition $\mathcal{D}_Y$. In contrast, those for which $x_{ij} > v $ will be grouped into $\mathcal{D}_N$.