# *Decision Tree Regressor* 

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

%matplotlib inline

In [4]:
from sklearn.datasets import fetch_california_housing

dataset = fetch_california_housing()

In [5]:
print(dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [6]:
dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [7]:
dataset.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [8]:
dataset.target_names

['MedHouseVal']

In [9]:
dataset.target

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [10]:
df= pd.DataFrame(data=dataset.data,columns=dataset.feature_names)

In [11]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [12]:
df.shape

(20640, 8)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


# *Seperating the independent and Dependent Variable*

In [14]:
X = df.iloc[:]
y = dataset.target

In [15]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [16]:
y[:]

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

# *Perfom Train Test Split*

In [18]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, test_size=0.33, random_state=24)

In [19]:
Xtrain.shape, Xtest.shape

((13828, 8), (6812, 8))

In [20]:
ytrain.shape , ytest.shape

((13828,), (6812,))

In [21]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()

In [22]:
model.fit(Xtrain,ytrain)

In [28]:
ypred = model.predict(Xtest)

In [29]:
ytest

array([2.358, 0.857, 2.74 , ..., 1.31 , 3.585, 2.168])

In [30]:
from sklearn.metrics import r2_score

print(f"R2 Score: {r2_score(ypred,ytest)}")

R2 Score: 0.5971091652739569


In [54]:
parameter = {
    'criterion': ["squared_error", "friedman_mse", "absolute_error","poisson"],
    'splitter': ["best", "random"],
    'max_depth': [1,2,3,4,5,6,7,8,9,10,11,12],
    'max_features': ['auto',"sqrt", "log2"]
}

In [55]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(model, param_grid=parameter, cv=5, scoring='neg_mean_squared_error')

In [56]:
clf.fit(Xtrain,ytrain)

In [61]:
clf.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 10,
 'max_features': 'log2',
 'splitter': 'best'}

In [62]:
y_pred = clf.predict(Xtest)

In [63]:
y_pred

array([3.64683396, 1.74007317, 3.25330769, ..., 2.07662076, 2.18322192,
       2.60123077])

In [64]:
print(f"R2 Score :{r2_score(y_pred,ytest)}")

R2 Score :0.5466225810719627
