# Decision Trees


## Tasks

### Task 1

Import the data you need to solve the tasks in this lesson.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns = 500

In [2]:
# Load the car listings; the relative path means autos.csv must sit in the working directory.
data = pd.read_csv('autos.csv')

In [3]:
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
data.shape

(4340, 8)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [6]:
data['selling_price'].describe()

count    4.340000e+03
mean     5.041273e+05
std      5.785487e+05
min      2.000000e+04
25%      2.087498e+05
50%      3.500000e+05
75%      6.000000e+05
max      8.900000e+06
Name: selling_price, dtype: float64

### Task 2

Split the data into train and test samples.

In [7]:
# Target is the selling price; every other column is a feature.
y = data['selling_price']
X = data.drop(columns='selling_price')

In [8]:
# We will use MSLE: with a log1p-transformed target, the plain mean squared error
# of the model's predictions is the mean squared *log* error on the price scale.
# The transform reads from `data` rather than from `y` itself, so re-running this
# cell cannot apply the logarithm twice.
y = np.log1p(data['selling_price'])

In [9]:
y.describe()

count    4340.000000
mean       12.764188
std         0.839262
min         9.903538
25%        12.248842
50%        12.765691
75%        13.304687
max        16.001562
Name: selling_price, dtype: float64

In [10]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Task 3 

Create a Mean Target Encoder transformer. 

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin


class MeanTargetEncoderNoise(BaseEstimator, TransformerMixin):
    """Mean target encoder that adds Gaussian noise to the per-category means.

    For every column in ``categorical`` the encoder learns the mean of the
    target for each distinct value, perturbs each mean once with
    ``noise_scale * N(0, 1)`` noise drawn from NumPy's global random state
    (seed it with ``np.random.seed`` for repeatable results), and replaces the
    column's values with those noisy means in ``transform``.

    Parameters
    ----------
    categorical : list of str
        Columns to encode.
    numeric : list of str
        Stored for the caller's reference; these columns are passed through
        ``transform`` untouched.
    target : str, default='selling_price'
        Column to read when ``y`` is given as a DataFrame. A Series or array
        ``y`` is used as-is, whatever its name.
    noise_scale : float, default=0.006
        Standard deviation of the noise added to each category mean.
    fill_value : scalar, default=0
        Value written for categories that were not seen during ``fit``. Keep in
        mind that 0 can be far from every learned mean (for example with a
        log-scaled target), so something like the training target mean may be
        a better choice.

    Attributes
    ----------
    mte : dict of str -> pandas.Series
        Noisy per-category target means, keyed by column name.
    mte_cols : list of str
        Names of the encoded columns.
    """

    def __init__(self, categorical, numeric, target='selling_price',
                 noise_scale=0.006, fill_value=0):
        self.categorical = categorical
        self.numeric = numeric
        self.target = target
        self.noise_scale = noise_scale
        self.fill_value = fill_value

    def fit(self, X, y):
        """Learn the noisy per-category target means from ``X`` and ``y``.

        ``y`` may be a pandas Series (aligned to ``X`` by index), a DataFrame
        containing the ``target`` column, or any array-like with one value per
        row of ``X`` (aligned by position). Returns ``self``.
        """
        if isinstance(y, pd.DataFrame):
            target_values = y[self.target]
        elif isinstance(y, pd.Series):
            # Used directly, so the Series does not have to be named `target`.
            target_values = y
        else:
            target_values = pd.Series(np.asarray(y).ravel(), index=X.index)

        self.mte = {}
        for col in self.categorical:
            # One groupby per column; one noise draw per distinct category.
            category_means = target_values.groupby(X[col]).mean()
            noise = self.noise_scale * np.random.normal(0, 1, size=category_means.shape[0])
            self.mte[col] = category_means + noise

        # A plain list (not a dict_keys view) keeps the fitted encoder
        # picklable and deep-copyable.
        self.mte_cols = list(self.mte)

        return self

    def transform(self, df):
        """Return a copy of ``df`` with each categorical column mean-encoded.

        Values that were not seen during ``fit`` become ``fill_value``.
        """
        X_cp = df.copy()

        for col in self.categorical:
            X_cp[col] = X_cp[col].map(self.mte[col]).fillna(self.fill_value)

        return X_cp

### Task 4

Transform the data using your own custom Mean Target Encoder transformer.

In [12]:
# Columns to mean-target-encode. `year` is an int64 column, but listing it here
# means it is encoded per distinct year rather than used as a number.
object_cols = ['name', 'year', 'fuel', 'seller_type', 'transmission', 'owner']
# Passed to the encoder as `numeric`; transform leaves these columns unchanged.
num_cols = ['km_driven']

In [13]:
# Seed NumPy's global generator so the noise drawn inside fit() is repeatable.
np.random.seed(1)

# Learn the encoding on the training split only, then apply it to both splits.
transformer = MeanTargetEncoderNoise(categorical=object_cols, numeric=num_cols)
transformer = transformer.fit(X_train, y_train)

train, test = (transformer.transform(split) for split in (X_train, X_test))

train.head(10)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner
3294,13.483692,13.436559,50000,13.093756,12.615863,13.771135,12.97773
2290,12.117029,11.903115,70000,12.453832,12.615863,13.771135,12.97773
874,12.310456,13.328864,50000,12.453832,12.615863,12.639805,12.97773
1907,12.491443,13.042359,92198,12.453832,13.152824,12.639805,12.463313
3244,12.390906,12.870886,3240,12.453832,12.615863,12.639805,12.463313
1089,12.687432,13.436559,10000,12.453832,13.152824,12.639805,12.97773
3902,11.698702,11.503504,90000,12.453832,12.615863,12.639805,11.87839
2215,11.120678,11.503504,79000,12.453832,12.615863,12.639805,12.463313
3862,13.172879,13.328864,99700,13.093756,12.615863,12.639805,12.97773
705,13.004194,12.241213,124000,13.093756,12.615863,12.639805,12.463313


In [14]:
# Save the first ten encoded training rows, with the raw km_driven column placed
# first and the original index dropped, as a ';'-separated file.
(
    pd.concat(
        [X_train[['km_driven']], train.drop(columns=['km_driven'])],
        axis=1,
    )
    .reset_index(drop=True)
    .head(10)
    .to_csv('mte_results.csv', sep=';', index=False)
)

### Task 5

Find the best (lowest) MSLE over the `max_depth` values in `max_depth_list`.

In [15]:
# Candidate values for DecisionTreeRegressor's `max_depth`.
max_depth_list = [3, 5, 8, 12]

In [16]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline


# Seed once before the sweep so the encoder's noise sequence is repeatable.
np.random.seed(1)

score_list_depth = []
for depth in max_depth_list:
    # Fresh encoder + tree for every candidate; only max_depth varies.
    encoder = MeanTargetEncoderNoise(categorical=object_cols, numeric=num_cols)
    tree = DecisionTreeRegressor(max_depth=depth)
    pipe = Pipeline([('transformer', encoder), ('dt', tree)])
    pipe.fit(X_train, y_train)

    # y is log1p(price), so this mean squared error is the MSLE on the test split.
    test_residuals = pipe.predict(X_test) - y_test
    score_list_depth.append(np.mean(test_residuals ** 2))

In [17]:
# Lowest test-split MSLE among the max_depth candidates.
min(score_list_depth)

0.7978271003064052

### Task 6

Find the best (lowest) MSLE over the `min_samples_split` values in `min_samples_split_list`.

In [18]:
# Candidate values for DecisionTreeRegressor's `min_samples_split`.
min_samples_split_list = [10, 50, 100, 500]

In [19]:
# Seed once before the sweep so the encoder's noise sequence is repeatable.
np.random.seed(1)

score_list_samples = []
for samples in min_samples_split_list:
    # Fresh encoder + tree for every candidate; only min_samples_split varies.
    encoder = MeanTargetEncoderNoise(categorical=object_cols, numeric=num_cols)
    tree = DecisionTreeRegressor(min_samples_split=samples)
    pipe = Pipeline([('transformer', encoder), ('dt', tree)])
    pipe.fit(X_train, y_train)

    # y is log1p(price), so this mean squared error is the MSLE on the test split.
    test_residuals = pipe.predict(X_test) - y_test
    score_list_samples.append(np.mean(test_residuals ** 2))

In [20]:
# Lowest test-split MSLE among the min_samples_split candidates.
min(score_list_samples)

0.8077891374012252

### Task 7

Find the best (lowest) MSLE over the `min_impurity_decrease` values in `min_impurity_decrease_list`.

In [21]:
# Candidate values for DecisionTreeRegressor's `min_impurity_decrease`.
min_impurity_decrease_list = [0, 0.1, 0.15, 0.2]

In [22]:
# Seed once before the sweep so the encoder's noise sequence is repeatable.
np.random.seed(1)

score_list_impurity = []
for impurity in min_impurity_decrease_list:
    # Fresh encoder + tree for every candidate; only min_impurity_decrease varies.
    encoder = MeanTargetEncoderNoise(categorical=object_cols, numeric=num_cols)
    tree = DecisionTreeRegressor(min_impurity_decrease=impurity)
    pipe = Pipeline([('transformer', encoder), ('dt', tree)])
    pipe.fit(X_train, y_train)

    # y is log1p(price), so this mean squared error is the MSLE on the test split.
    test_residuals = pipe.predict(X_test) - y_test
    score_list_impurity.append(np.mean(test_residuals ** 2))

In [23]:
# Lowest test-split MSLE among the min_impurity_decrease candidates.
min(score_list_impurity)

0.5189515041093508

### Task 8

Find the best (lowest) MSLE over the `max_leaf_nodes` values in `max_leaf_nodes_list`.

In [24]:
# Candidate values for DecisionTreeRegressor's `max_leaf_nodes`.
max_leaf_nodes_list = [100, 200, 500]

In [25]:
# Seed once before the sweep so the encoder's noise sequence is repeatable.
np.random.seed(1)

score_list_leaf = []
for leaf in max_leaf_nodes_list:
    # Fresh encoder + tree for every candidate; only max_leaf_nodes varies.
    encoder = MeanTargetEncoderNoise(categorical=object_cols, numeric=num_cols)
    tree = DecisionTreeRegressor(max_leaf_nodes=leaf)
    pipe = Pipeline([('transformer', encoder), ('dt', tree)])
    pipe.fit(X_train, y_train)

    # y is log1p(price), so this mean squared error is the MSLE on the test split.
    test_residuals = pipe.predict(X_test) - y_test
    score_list_leaf.append(np.mean(test_residuals ** 2))

In [26]:
# Lowest test-split MSLE among the max_leaf_nodes candidates.
min(score_list_leaf)

1.9813178457505742

### Task 9

Find the best combination of hyperparameters using `GridSearchCV`.

In [27]:
# List the pipeline's parameter names (the `dt__*` keys are what a parameter grid
# must use). `pipe` here is the last pipeline left over from the Task 8 loop.
pipe.get_params()

{'memory': None,
 'steps': [('transformer',
   MeanTargetEncoderNoise(categorical=['name', 'year', 'fuel', 'seller_type',
                                       'transmission', 'owner'],
                          numeric=['km_driven'])),
  ('dt', DecisionTreeRegressor(max_leaf_nodes=500))],
 'verbose': False,
 'transformer': MeanTargetEncoderNoise(categorical=['name', 'year', 'fuel', 'seller_type',
                                     'transmission', 'owner'],
                        numeric=['km_driven']),
 'dt': DecisionTreeRegressor(max_leaf_nodes=500),
 'transformer__categorical': ['name',
  'year',
  'fuel',
  'seller_type',
  'transmission',
  'owner'],
 'transformer__numeric': ['km_driven'],
 'transformer__target': 'selling_price',
 'dt__ccp_alpha': 0.0,
 'dt__criterion': 'squared_error',
 'dt__max_depth': None,
 'dt__max_features': None,
 'dt__max_leaf_nodes': 500,
 'dt__min_impurity_decrease': 0.0,
 'dt__min_samples_leaf': 1,
 'dt__min_samples_split': 2,
 'dt__min_weight_fract

In [28]:
from sklearn.model_selection import GridSearchCV


# Search space: every combination of the four tree hyperparameters, addressed
# through the pipeline's 'dt' step.
param_grid = {
    'dt__max_depth': [3, 5, 8, 12],
    'dt__min_samples_split': [10, 50, 100, 500],
    'dt__min_impurity_decrease': [0, 0.1, 0.15, 0.2],
    'dt__max_leaf_nodes': [100, 200, 500],
}

np.random.seed(1)
pipe = Pipeline(steps=[
    ('transformer', MeanTargetEncoderNoise(categorical=object_cols, numeric=num_cols)),
    ('dt', DecisionTreeRegressor()),
])

# The target is log1p(price), so negated MSE ranks candidates by (negated) MSLE.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
)

search.fit(X_train, y_train)

In [29]:
search.best_params_

{'dt__max_depth': 12,
 'dt__max_leaf_nodes': 200,
 'dt__min_impurity_decrease': 0.1,
 'dt__min_samples_split': 100}

In [30]:
# The search was built with scoring='neg_mean_squared_error', so this is the
# *negated* test-split MSE of the best pipeline; flip the sign to read it as MSLE.
search.score(X_test, y_test)

-0.5200098587235225