In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *

np.random.seed(42)

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
data = pd.read_csv(url, delim_whitespace=True, header=None,
                 names=["mpg", "cylinders", "displacement", "horsepower", "weight",
                        "acceleration", "model year", "origin", "car name"])

data

  data = pd.read_csv(url, delim_whitespace=True, header=None,


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger


In [3]:
print(data.dtypes)

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object


In [4]:
data['horsepower'].isnull().sum()

0

In [5]:
data['horsepower'].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [6]:
data=data[data['horsepower'] != '?']
data['horsepower']=data['horsepower'].astype('float')
print(data.shape)
print(data.dtypes)

(392, 9)
mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight          float64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['horsepower']=data['horsepower'].astype('float')


In [7]:
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [15]:
X=data.iloc[:,1:8].reset_index(drop=True)
y=data['mpg'].reset_index(drop=True)


In [16]:
for criteria in ["information_gain", "gini_index"]:
    tree = DecisionTree(criterion=criteria)
    tree.fit(X, y)
    y_hat = tree.predict(X)
    y_hat = pd.Series(y_hat)
    tree.plot()
    # print(y_hat,"\n",y)
    print("Criteria :", criteria)
    print("RMSE: ", rmse(y_hat, y))
    print("MAE: ", mae(y_hat, y))
    print("Accuracy: ", accuracy(y_hat, y))



<<<<<<<<<<<<<<<- START OF TREE ->>>>>>>>>>>>>>

root -> ?(displacement > 190.50)
├────Yes -> ?(horsepower > 125.00)
│    ├────Yes -> ?(model year > 76.00)
│    │    ├────Yes -> ?(weight > 3947.50)
│    │    │    ├────Yes -> ?(model year > 77.00)
│    │    │    │    ├────Yes -> ?(displacement > 350.50)
│    │    │    │    │    ├─── Yes: 16.00
│    │    │    │    │    └─── No: 17.20
│    │    │    │    └────No -> ?(displacement > 310.00)
│    │    │    │    │    ├─── Yes: 15.70
│    │    │    │    │    └─── No: 15.00
│    │    │    └────No -> ?(acceleration > 13.20)
│    │    │    │    ├────Yes -> ?(weight > 3835.00)
│    │    │    │    │    ├─── Yes: 17.00
│    │    │    │    │    └─── No: 17.83
│    │    │    │    └────No -> ?(acceleration > 12.65)
│    │    │    │    │    ├─── Yes: 19.32
│    │    │    │    │    └─── No: 17.80
│    │    └────No -> ?(weight > 4358.50)
│    │    │    ├────Yes -> ?(model year > 74.00)
│    │    │    │    ├────Yes -> ?(horsepower > 149.00)
│    │    │   

In [17]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [18]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=42)
model=tree.DecisionTreeRegressor(max_depth=5)
model.fit(X_train,y_train)
print("Training accuracy =", model.score(X_train,y_train))

Training accuracy = 0.9379113806788827


In [19]:
y_hat=model.predict(X_test)
print("Testing accuracy =", model.score(X_test,y_test))
print("RMSE =", np.sqrt(mean_squared_error(y_hat,y_test)))
print("MAE =", mean_absolute_error(y_hat,y_test))

Testing accuracy = 0.8094429117177016
RMSE = 3.1751205206902453
MAE = 2.2961593623269154


In [20]:
print(tree.export_text(model, feature_names=X.columns))

|--- displacement <= 198.50
|   |--- horsepower <= 84.50
|   |   |--- model year <= 76.50
|   |   |   |--- weight <= 2099.50
|   |   |   |   |--- origin <= 2.50
|   |   |   |   |   |--- value: [29.42]
|   |   |   |   |--- origin >  2.50
|   |   |   |   |   |--- value: [32.22]
|   |   |   |--- weight >  2099.50
|   |   |   |   |--- acceleration <= 16.75
|   |   |   |   |   |--- value: [26.44]
|   |   |   |   |--- acceleration >  16.75
|   |   |   |   |   |--- value: [24.08]
|   |   |--- model year >  76.50
|   |   |   |--- displacement <= 94.00
|   |   |   |   |--- weight <= 2077.50
|   |   |   |   |   |--- value: [36.54]
|   |   |   |   |--- weight >  2077.50
|   |   |   |   |   |--- value: [43.78]
|   |   |   |--- displacement >  94.00
|   |   |   |   |--- weight <= 2375.00
|   |   |   |   |   |--- value: [33.35]
|   |   |   |   |--- weight >  2375.00
|   |   |   |   |   |--- value: [29.24]
|   |--- horsepower >  84.50
|   |   |--- model year <= 78.50
|   |   |   |--- weight <= 2781.0