In [25]:
import numpy as np
import pandas as pd

import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [18]:
df = pd.read_csv('auto-mpg.csv')

In [20]:
#change measures to liters/100km

df['liters_per_100km'] = (235.215 / df['mpg']).round(2)
df.drop('mpg', axis = 1, inplace = True)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cylinders         398 non-null    int64  
 1   displacement      398 non-null    float64
 2   horsepower        398 non-null    object 
 3   weight            398 non-null    int64  
 4   acceleration      398 non-null    float64
 5   model year        398 non-null    int64  
 6   origin            398 non-null    int64  
 7   car name          398 non-null    object 
 8   liters_per_100km  398 non-null    float64
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [26]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cylinders,398.0,5.454774,1.701004,3.0,4.0,4.0,8.0,8.0
displacement,398.0,193.425879,104.269838,68.0,104.25,148.5,262.0,455.0
weight,398.0,2970.424623,846.841774,1613.0,2223.75,2803.5,3608.0,5140.0
acceleration,398.0,15.56809,2.757689,8.0,13.825,15.5,17.175,24.8
model year,398.0,76.01005,3.697627,70.0,73.0,76.0,79.0,82.0
origin,398.0,1.572864,0.802055,1.0,1.0,1.0,2.0,3.0
liters_per_100km,398.0,11.212965,3.901596,5.05,8.11,10.23,13.44,26.14


In [27]:
df

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,liters_per_100km
0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,13.07
1,8,350.0,165,3693,11.5,70,1,buick skylark 320,15.68
2,8,318.0,150,3436,11.0,70,1,plymouth satellite,13.07
3,8,304.0,150,3433,12.0,70,1,amc rebel sst,14.70
4,8,302.0,140,3449,10.5,70,1,ford torino,13.84
...,...,...,...,...,...,...,...,...,...
393,4,140.0,86,2790,15.6,82,1,ford mustang gl,8.71
394,4,97.0,52,2130,24.6,82,2,vw pickup,5.35
395,4,135.0,84,2295,11.6,82,1,dodge rampage,7.35
396,4,120.0,79,2625,18.6,82,1,ford ranger,8.40


# Preprocessing

In [40]:
def onehot_encode(df, column_dict):
    df = df.copy()
    for column, prefix in column_dict.items():
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [44]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Fill in missing horsepower values with the column mean
    df['horsepower'] = df['horsepower'].replace('?', np.NaN).astype(np.float)
    df['horsepower'] = df['horsepower'].fillna(df['horsepower'].mean())
    
     # Create make feature - take the first word before space from the car column
    df['make'] = df['car name'].apply(lambda x: re.search(r'^\w+', x).group(0))
    df = df.drop('car name', axis=1)
    
      # Fix typos in make names
    make_typo_correction = {
        'vw': 'volkswagen',
        'chevy': 'chevrolet',
        'maxda': 'mazda',
        'vokswagen': 'volkswagen',
        'toyouta': 'toyota',
        'chevroelt': 'chevrolet'
    }
    df['make'] = df['make'].replace(make_typo_correction)
    
    # One-hot encode nominal features
    nominal_feature_dict = {
        'origin': 'orig',
        'make': 'mk'
    }
    df = onehot_encode(df, nominal_feature_dict)
    
    # Split df into X and y
    y = df['liters_per_100km'].copy()
    X = df.drop('liters_per_100km', axis=1).copy()
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
    
    # Scale X_train and X_test with a standard scaler fit only on X_train
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [45]:
X_train, X_test, y_train, y_test = preprocess_inputs(df)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df['horsepower'] = df['horsepower'].replace('?', np.NaN).astype(np.float)


In [46]:
X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,orig_1,orig_2,orig_3,mk_amc,...,mk_peugeot,mk_plymouth,mk_pontiac,mk_renault,mk_saab,mk_subaru,mk_toyota,mk_triumph,mk_volkswagen,mk_volvo
0,1.497785,1.620399,1.711111,1.968967,-0.958513,-0.827971,0.761279,-0.427760,-0.519044,-0.255214,...,-0.120824,3.591657,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
1,1.497785,2.288862,2.446571,1.601619,-2.072349,-1.640357,0.761279,-0.427760,-0.519044,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
2,-0.844742,-0.714375,-0.337673,-0.636509,-0.215956,1.067595,-1.313579,-0.427760,1.926620,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
3,0.326521,0.312538,0.135123,0.755426,1.157775,0.526005,0.761279,-0.427760,-0.519044,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
4,-0.844742,-0.927508,-1.545930,-1.339518,1.826077,-1.640357,-1.313579,2.337759,-0.519044,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,4.342481,-0.135333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,1.497785,1.523521,1.711111,1.395058,-1.552559,0.255210,0.761279,-0.427760,-0.519044,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
274,0.326521,0.554735,-0.127541,0.354042,0.897880,-0.827971,0.761279,-0.427760,-0.519044,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
275,-0.844742,-1.034075,-1.046867,-1.016767,0.860752,1.067595,-1.313579,-0.427760,1.926620,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
276,-0.844742,-0.820942,-0.915535,-0.858326,0.489473,1.609186,-1.313579,-0.427760,1.926620,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,3.591657,0.0,-0.230283,-0.135333


## Training

In [47]:
# Using a simple linear model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

linear_r2 = linear_model.score(X_test, y_test)
print("Linear Regression R^2: {:.1f}".format(linear_r2))

Linear Regression R^2: -46168307322443243520.0


In [48]:
# Using a decision tree model
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)

tree_r2 = tree_model.score(X_test, y_test)
print("Decision Tree R^2: {:.5f}".format(tree_r2))

Decision Tree R^2: 0.77495


In [49]:
# Using random forest regression model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

rf_r2 = rf_model.score(X_test, y_test)
print("Random Forest R^2: {:.5f}".format(rf_r2))

Random Forest R^2: 0.88111
