In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the pre-processed automobile dataset and preview its first rows.
csv_path = "automobile_pre_processados.csv"
automobile = pd.read_csv(csv_path)
automobile.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,wheel-base,length,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,others,gas,std,two,convertible,rwd,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495
1,3,,others,gas,std,two,convertible,rwd,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500
2,1,,others,gas,std,two,hatchback,rwd,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450


In [3]:
# Cast every nominal column to pandas' `category` dtype, one column at a time
# and in the same order as before.
category_columns = [
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-type",
    "num-of-cylinders",
    "fuel-system",
]
for column in category_columns:
    automobile[column] = automobile[column].astype('category')

In [4]:
# Cast the columns below to float64. "price" used to be cast twice in this
# cell; the redundant second cast is removed, the resulting dtypes are the same.
float_columns = ["symboling", "price", "bore", "stroke", "horsepower", "peak-rpm"]
for column in float_columns:
    automobile[column] = automobile[column].astype('float64')

In [5]:
# Summarise the frame (column dtypes and non-null counts) after the dtype casts.
automobile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   symboling          201 non-null    float64 
 1   normalized-losses  164 non-null    float64 
 2   make               201 non-null    category
 3   fuel-type          201 non-null    category
 4   aspiration         201 non-null    category
 5   num-of-doors       201 non-null    category
 6   body-style         201 non-null    category
 7   drive-wheels       201 non-null    category
 8   wheel-base         201 non-null    float64 
 9   length             201 non-null    float64 
 10  width              201 non-null    float64 
 11  height             201 non-null    float64 
 12  curb-weight        201 non-null    int64   
 13  engine-type        201 non-null    category
 14  num-of-cylinders   201 non-null    category
 15  engine-size        201 non-null    int64   
 16  fuel-sys

In [13]:
# Names of the numeric predictor columns, in the order they are listed here.
numerical_features = [
    "symboling",
    "normalized-losses",
    "wheel-base",
    "length",
    "width",
    "height",
    "curb-weight",
    "engine-size",
    "bore",
    "stroke",
    "compression-ratio",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
]

# Names of the categorical predictor columns.
categorical_features = [
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-type",
    "num-of-cylinders",
    "fuel-system",
]

In [15]:
class GetDummies(BaseEstimator, TransformerMixin):
    """One-hot encode the automobile categorical columns with ``pd.get_dummies``.

    The nine categorical columns listed in ``_COLUMN_PREFIXES`` are replaced by
    their dummy columns; any other column of the input is passed through
    unchanged and placed before the dummies.

    The dummy-column layout is learned in ``fit`` and re-applied in
    ``transform``, so the output always has the columns seen during fitting
    (a level absent from the data being transformed becomes an all-zero
    column, an unseen level is dropped).

    Attributes
    ----------
    feature_names_ : pandas.Index
        Output column names, set by ``fit`` (or by the first ``transform``
        when ``fit`` was never called).
    """

    # (source column, get_dummies prefix) pairs. The trailing underscore of each
    # prefix is kept on purpose: get_dummies adds its own "_" separator, giving
    # names such as "make__subaru" that downstream cells already display.
    _COLUMN_PREFIXES = (
        ("make", "make_"),
        ("fuel-type", "fuel_type_"),
        ("aspiration", "aspiration_"),
        ("num-of-doors", "num_of_doors_"),
        ("body-style", "body_style_"),
        ("drive-wheels", "drive_wheels_"),
        ("engine-type", "engine_type_"),
        ("num-of-cylinders", "num_of_cylinders_"),
        ("fuel-system", "fuel_system_"),
    )

    def __init__(self):
        # No hyper-parameters. The former debug print was removed so that
        # constructing or cloning the estimator has no side effects.
        pass

    def _encode(self, X):
        """Return ``X`` with the categorical columns replaced by dummy columns."""
        dummies = [
            pd.get_dummies(X[column], prefix=prefix)
            for column, prefix in self._COLUMN_PREFIXES
        ]
        passthrough = X.drop([column for column, _ in self._COLUMN_PREFIXES], axis=1)
        return pd.concat([passthrough] + dummies, axis=1)

    def fit(self, X, y=None):
        """Learn the output column layout from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in ``_COLUMN_PREFIXES``.
        y : ignored

        Returns
        -------
        self
        """
        self.feature_names_ = self._encode(X).columns
        return self

    def transform(self, X, y=None):
        """One-hot encode ``X`` and align it to the fitted column layout.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in ``_COLUMN_PREFIXES``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Pass-through columns followed by the dummy columns.
        """
        encoded = self._encode(X)
        fitted_columns = getattr(self, "feature_names_", None)
        if fitted_columns is None:
            # transform() called without fit(): keep the old permissive
            # behaviour and remember this layout for get_feature_names().
            self.feature_names_ = encoded.columns
            return encoded
        # Guarantee the same columns, in the same order, as at fit time.
        return encoded.reindex(columns=fitted_columns, fill_value=0)

    def get_feature_names(self):
        """Return the output column names as a ``pandas.Index``."""
        return self.feature_names_

In [16]:
class TypeSelector(BaseEstimator, TransformerMixin):
    """Keep only the DataFrame columns whose dtype matches ``dtype``.

    Parameters
    ----------
    dtype : object
        Anything accepted by ``DataFrame.select_dtypes(include=[...])``,
        e.g. ``np.number`` or ``'category'``.

    Attributes
    ----------
    feature_names_ : list of str
        Names of the selected columns, recorded by ``fit``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def _select(self, X):
        """Validate ``X`` and return its columns matching ``self.dtype``."""
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "TypeSelector expects a pandas DataFrame, got %s" % type(X).__name__
            )
        return X.select_dtypes(include=[self.dtype])

    def fit(self, X, y=None):
        """Record the names of the columns that will be selected; return self."""
        self.feature_names_ = self._select(X).columns.tolist()
        return self

    def transform(self, X):
        """Return the sub-frame of ``X`` holding only the matching columns.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        return self._select(X)

    def get_feature_names(self):
        """Return the selected column names recorded by ``fit``.

        The previous implementation read a free variable ``X`` rather than
        instance state, so it either raised ``NameError`` or reported the
        columns of whatever global ``X`` happened to exist.
        """
        if not hasattr(self, "feature_names_"):
            raise AttributeError(
                "TypeSelector has no feature names yet; call fit() first."
            )
        return self.feature_names_
    
class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints one column of the data.

    Parameters
    ----------
    dtype : object
        Stored on the instance but not used by any method of this class.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """No-op fit; returns self."""
        return self

    def transform(self, X):
        """Print column index 10 of ``X`` and return ``X`` unchanged.

        ``X`` must support 2-D positional indexing (``X[:, 10]``), e.g. a
        NumPy array with at least 11 columns.
        """
        print(X[:,10])
        # Return the data: the original returned None, which would hand
        # `None` to whatever step follows this one in a pipeline.
        return X

# Numeric branch: keep numeric columns, fill missing values with the column
# mean, then rescale with MinMaxScaler.
numeric_branch = Pipeline(steps=[
    ('selector', TypeSelector(np.number)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: keep category-dtype columns and one-hot encode them.
categorical_branch = Pipeline(steps=[
    ('selector', TypeSelector('category')),
    ('get_dummies', GetDummies()),
])

# Concatenate both branches side by side (numericals first) and feed a Lasso.
feature_union = FeatureUnion(transformer_list=[
    ('numericals', numeric_branch),
    ('categoricals', categorical_branch),
])

modelo = Pipeline(steps=[
    ('features', feature_union),
    ('model', Lasso()),
])

init


In [17]:
# Separate the predictors from the target column ("price").
Y = automobile["price"]
X = automobile.drop(columns=["price"])

# Hold out 30% of the rows for testing (fixed seed for reproducibility),
# then fit the full pipeline on the training split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=3,
)
modelo.fit(x_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('numericals',
                                                 Pipeline(steps=[('selector',
                                                                  TypeSelector(dtype=<class 'numpy.number'>)),
                                                                 ('imputer',
                                                                  SimpleImputer()),
                                                                 ('scaler',
                                                                  MinMaxScaler())])),
                                                ('categoricals',
                                                 Pipeline(steps=[('selector',
                                                                  TypeSelector(dtype='category')),
                                                                 ('get_dummies',
                                                                  GetDumm

In [18]:
# Predictions of the fitted pipeline on the held-out test split ...
y_pred = modelo.predict(x_test)
# ... and on the training split.
y_pred_train = modelo.predict(x_train)

In [19]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_test, y_pred)
# Alternative: score the training split instead (swap with the line above).
#r2_score(y_train, y_pred_train)

0.8432048972662233

In [20]:
# Column names produced by the one-hot step of the fitted pipeline.
categorical_features_names = modelo['features'].transformer_list[1][1]['get_dummies'].get_feature_names()

# Build a NEW list: numeric names first, then the dummy-column names.
# The original bound `features_names` to `numerical_features` itself, so the
# appends mutated `numerical_features` and every re-run of this cell appended
# the dummy names again.
features_names = list(numerical_features) + list(categorical_features_names)

In [21]:
# Number of feature names; it must equal the number of Lasso coefficients,
# because the names are used as the index of the coefficient Series below.
len(features_names)

60

In [22]:
# Pull the fitted Lasso out of the pipeline and label each coefficient with
# its feature name, ordered from most negative to most positive.
lasso = modelo.named_steps['model']
coef = pd.Series(data=lasso.coef_, index=features_names).sort_values()

In [23]:
# Display the Lasso coefficients, sorted in ascending order.
coef

bore                               -1.247275e+04
make__subaru                       -1.223036e+04
length                             -9.051215e+03
stroke                             -8.498634e+03
engine_type__ohcv                  -5.665722e+03
engine_type__l                     -3.039555e+03
city-mpg                           -3.011030e+03
engine_type__dohc                  -2.249103e+03
height                             -1.946189e+03
make__mitsubishi                   -1.784091e+03
make__dodge                        -1.744035e+03
aspiration__std                    -1.400616e+03
make__plymouth                     -1.352268e+03
make__peugot                       -1.345171e+03
fuel_system__others                -1.265676e+03
fuel_system__spdi                  -8.833243e+02
symboling                          -8.025856e+02
drive_wheels__fwd                  -7.466790e+02
body_style__hatchback              -6.508291e+02
make__toyota                       -6.031663e+02
make__volvo         

In [27]:
# Features whose Lasso coefficient is exactly zero.
mask_zero = coef.to_numpy() == 0
zero = coef.loc[mask_zero]
zero

drive_wheels__4wd         0.0
body_style__sedan         0.0
normalized-losses         0.0
num_of_cylinders__five    0.0
compression-ratio         0.0
dtype: float64