# Regression with Decision trees

Decision trees are good for regression if there are no outliers!

In [30]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection  import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.metrics import mean_squared_error,r2_score

from math import sqrt


# Estimate car prices

## Import dataset

In [16]:
autos= pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",header=None)

autos.head()
# Column do not have names

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [17]:
autos.dtypes

0       int64
1      object
2      object
3      object
4      object
5      object
6      object
7      object
8      object
9     float64
10    float64
11    float64
12    float64
13      int64
14     object
15     object
16      int64
17     object
18     object
19     object
20    float64
21     object
22     object
23      int64
24      int64
25     object
dtype: object

## Clean up the dataset

We need to remove rows with missing values
We need to change the form of numerical columns (if they were imported as objects)
WE need to apply the label -encoder to do 1-hot encoding

In [18]:
# Give names to columns (bafter referring to the docs of the dataset)
autos.columns=["symboling",
'normalized-losses',
'make',
'fuel-type',
'aspiration',
'num-of-doors',
'body-style',
'drive-wheels',
'engine-location',
'wheel-base',
'length',
'width',
'height',
'curb-weight',
'engine-type',
'num-of-cylinders',
'engine-size',
'fuel-system',
'bore',
'stroke',
'compression-ratio',
'horsepower',
'peak-rpm',
'city-mpg',
'highway-mpg',
'price'
]
autos.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [19]:
# autos[['engine-type','num-of-cylinders','fuel-system','bore','stroke','horsepower']]

# Drop rows with missing values
autos.drop(autos[autos['normalized-losses']=="?"].index,inplace=True)
autos.drop(autos[autos['bore']=='?'].index,inplace=True)

In [20]:
# Change a NUMBER STRING type into INT or FLOAT ( '?' have been deleted so we can do this now!)
autos['normalized-losses']=autos['normalized-losses'].astype("int")
autos['horsepower']=autos['horsepower'].astype("int")
autos['peak-rpm']=autos['peak-rpm'].astype("int")
autos['price']=autos['price'].astype('int')

autos['bore']=autos['bore'].astype("float")
autos['stroke']=autos['stroke'].astype("float")

In [21]:
le = LabelEncoder()
for column in autos.select_dtypes(include='object').columns:
    autos[column]=le.fit_transform(autos[column])

## Train the model

In [24]:
X = autos.drop('price', axis=1)
Y = autos['price']

X_train,X_test,Y_train,Y_test =train_test_split(X,Y,test_size=0.3,random_state=0)


model =DecisionTreeRegressor()
model.fit(X_train,Y_train)

_model_features_names = model.feature_names_in_.tolist()
print(export_text(model,feature_names= _model_features_names))


|--- curb-weight <= 2701.00
|   |--- curb-weight <= 2291.50
|   |   |--- curb-weight <= 2006.00
|   |   |   |--- bore <= 3.05
|   |   |   |   |--- highway-mpg <= 32.50
|   |   |   |   |   |--- value: [5195.00]
|   |   |   |   |--- highway-mpg >  32.50
|   |   |   |   |   |--- highway-mpg <= 39.50
|   |   |   |   |   |   |--- symboling <= 1.50
|   |   |   |   |   |   |   |--- body-style <= 2.50
|   |   |   |   |   |   |   |   |--- num-of-doors <= 1.50
|   |   |   |   |   |   |   |   |   |--- value: [6229.00]
|   |   |   |   |   |   |   |   |--- num-of-doors >  1.50
|   |   |   |   |   |   |   |   |   |--- curb-weight <= 1902.50
|   |   |   |   |   |   |   |   |   |   |--- compression-ratio <= 9.20
|   |   |   |   |   |   |   |   |   |   |   |--- value: [6095.00]
|   |   |   |   |   |   |   |   |   |   |--- compression-ratio >  9.20
|   |   |   |   |   |   |   |   |   |   |   |--- value: [6377.00]
|   |   |   |   |   |   |   |   |   |--- curb-weight >  1902.50
|   |   |   |   |   |   |  

## Evaluate model

In [25]:
Y_pred = model.predict(X_test)
mean_squared_error(Y_test,Y_pred)


3563569.75

In [26]:
r2_score(Y_test,Y_pred)

0.8765452735976588

## Estimate a house price in Boston

## Import dataset

In [33]:
# from sklearn import datasets
# boston = datasets.load_boston()
# boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)

boston_df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv")
# boston_df['target'] = boston.target
boston_df.head()

# CRIM - per capita crime rate by town
# ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
# INDUS - proportion of non-retail business acres per town.
# CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
# NOX - nitric oxides concentration (parts per 10 million)
# RM - average number of rooms per dwelling
# AGE - proportion of owner-occupied units built prior to 1940
# DIS - weighted distances to five Boston employment centres
# RAD - index of accessibility to radial highways
# TAX - full-value property-tax rate per $10,000
# PTRATIO - pupil-teacher ratio by town
# B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# LSTAT - % lower status of the population
# MEDV - Median value of owner-occupied homes in $1000's
# Target = median value to predict based on other features!

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## Train the model

In [34]:
X = boston_df.drop('medv', axis=1)
Y = boston_df['medv']

X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.3,random_state=0)

model = DecisionTreeRegressor()
model.fit(X_train,Y_train)
Y_pred = model.predict(X_test)


## Model evaluation

In [38]:
sqrt(mean_squared_error(Y_test, Y_pred))

5.238270104167073

In [40]:
r2_score(Y_test, Y_pred)

0.6704580751667765