# SciKit Learn Decision Tree Regressor

# Imports

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import plot_tree, export_text
import matplotlib.pyplot as plt

# Read data

In [2]:
# Load both CSV parts (their first column becomes the row index) and stack them row-wise.
csv_paths = [
    '../dataset/smog_part1.csv',
    '../dataset/smog_part2_manual_fit.csv',
]
frames = [pd.read_csv(csv_path, index_col = 0) for csv_path in csv_paths]
dataset = pd.concat(frames)

# Keep only the rows whose humiditySht reading is at least 60.
humidity_mask = dataset['humiditySht'] >= 60
dataset = dataset[humidity_mask]

In [3]:
# Keep only the target (pm25_x) and the four columns used as model inputs.
# The humiditySht >= 60 row filter is already applied in the data-loading cell,
# so it is not repeated here.
dataset = dataset[['pm25_x', 'pm25_y', 'temperatureSht', 'humiditySht', 'pressure']]

dataset.head()

Unnamed: 0,pm25_x,pm25_y,temperatureSht,humiditySht,pressure
0,19.0,13.8,6.1,72,986
1,16.7,14.4,5.9,74,986
2,21.8,15.1,5.7,75,986
3,21.8,18.0,5.7,76,986
4,25.0,19.0,5.5,77,986


# Train and test split

Split the data into train and test sets and normalize it.

In [4]:
# Target: the pm25_x column as an (n_samples, 1) column vector.
Y = dataset[['pm25_x']].to_numpy()
# Features: every other column of the frame.
X = dataset.drop(columns = 'pm25_x').to_numpy()

In [5]:
# Hold out 20 % of the rows for testing; the fixed seed keeps the shuffled split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size = 0.2,
    shuffle = True,
    random_state = 0,
)

In [6]:
# NOTE: this cell overwrites x_train / x_test / y_train / y_test in place, so it is
# not idempotent. Running it a second time re-fits on already-scaled data and leaves
# y_scaler at 1; re-run the train/test split cell before re-running this one.

# Fit the feature scaler on the training split only, then apply it to both splits.
scaler = preprocessing.StandardScaler().fit(x_train)
# Scale the target by its largest training value. The vectorised column-wise max
# replaces the builtin max(), which walked the (n, 1) array row by row in Python.
# The result is still a length-1 array, so later `* y_scaler` expressions keep
# working unchanged.
y_scaler = y_train.max(axis = 0)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# The test target is divided by the *training* maximum, so both splits share one scale.
y_train = y_train / y_scaler
y_test = y_test / y_scaler

# SKLearn Decision Tree Regressor
Baseline fit using the default hyperparameter values.

In [7]:
# Baseline: a tree with every hyperparameter left at its scikit-learn default.
# Only the seed is pinned. The tree permutes the candidate features at each split,
# so without random_state the scores below can shift from one run to the next.
dtr = DecisionTreeRegressor(random_state = 0)

dtr.fit(x_train,y_train)

# score() returns the coefficient of determination (R^2) on the given split.
train_dataset_score = dtr.score(x_train,y_train)
test_dataset_score = dtr.score(x_test, y_test)

# Reported as percentages rounded to two decimals.
print('R2 on train dataset: ', round(train_dataset_score * 100, 2))
print('R2 on test dataset: ', round(test_dataset_score * 100, 2))

R2 on train dataset:  99.96
R2 on test dataset:  79.82


### Best found model

In [15]:
# Tuned model. NOTE(review): these hyperparameters presumably come from a grid
# search run in cells that are not part of this view — confirm their origin.
#
# - max_features = 4 equals the number of feature columns, so every split
#   considers all features.
# - min_samples_leaf = 8 already requires at least 16 samples in a node before it
#   can be split, so min_samples_split = 4 is never the binding constraint.
# - random_state pins the per-split feature permutation so the fitted tree (and
#   the metrics computed from it) are reproducible across runs.
dtr = DecisionTreeRegressor(criterion = 'absolute_error',
                            max_features = 4,
                            min_samples_leaf = 8,
                            min_samples_split = 4,
                            splitter = 'best',
                            random_state = 0)

dtr.fit(x_train,y_train)

DecisionTreeRegressor(criterion='absolute_error', max_features=4,
                      min_samples_leaf=8, min_samples_split=4)

In [16]:
# Undo the target scaling so the metrics are expressed on the original pm25_x scale.
y_true = y_test * y_scaler
y_pred = dtr.predict(x_test) * y_scaler

# R^2 is reported as a percentage; MSE and MAE stay on the de-scaled target.
r2 = r2_score(y_true, y_pred) * 100
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

for metric_label, metric_value in (('R2: ', r2), ('MSE: ', mse), ('MAE: ', mae)):
    print(metric_label, metric_value)

R2:  83.90808244411004
MSE:  9.50718634376051
MAE:  1.3234106962663976


# Step-by-step model

In [17]:
# Model with every structural hyperparameter spelled out explicitly.
#
# - max_depth = 8 allows at most 2**8 = 256 leaves, which is exactly the
#   max_leaf_nodes cap.
# - max_features = 4 equals the number of feature columns, so every split
#   considers all features.
# - min_weight_fraction_leaf, min_impurity_decrease and ccp_alpha are set to 0,
#   i.e. no weight-based leaf constraint, no impurity threshold and no
#   cost-complexity pruning.
# - random_state pins the per-split feature permutation so the metrics printed
#   below are reproducible across runs.
dtr = DecisionTreeRegressor(criterion='friedman_mse',
                            splitter='best',
                            max_depth=8,
                            min_samples_split=2,
                            min_samples_leaf=2,
                            min_weight_fraction_leaf=0.,
                            max_features=4,
                            max_leaf_nodes=256,
                            min_impurity_decrease=0.,
                            ccp_alpha=0.,
                            random_state=0)

dtr.fit(x_train,y_train)

# Undo the target scaling once, so all metrics are computed on the original
# pm25_x scale from the same ground-truth array.
y_true = y_test * y_scaler
y_pred = dtr.predict(x_test) * y_scaler

# R^2 is reported as a percentage; MSE and MAE stay on the de-scaled target.
r2 = r2_score(y_true, y_pred) * 100
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print('R2: ', r2)
print('MSE: ',mse)
print('MAE: ',mae)

R2:  83.34494739549804
MSE:  9.839889380876011
MAE:  1.571667066654974
