In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the pre-processed automobile table and preview its first five rows.
automobile = pd.read_csv(filepath_or_buffer="automobile_pre_processados.csv")
automobile.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
1,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
2,1,158,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
3,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
4,2,192,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,16430


In [3]:
# Store the text-valued columns with the pandas 'category' dtype.
# "normalized-losses" is not converted here and keeps the dtype it was loaded with.
for column in (
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "engine-type",
    "num-of-cylinders",
    "fuel-system",
):
    automobile[column] = automobile[column].astype('category')

In [4]:
# Cast the listed numeric columns to float64 in one pass.
# "normalized-losses" is not converted here and keeps the dtype it was loaded with.
automobile = automobile.astype({
    "symboling": 'float64',
    "price": 'float64',
    "bore": 'float64',
    "stroke": 'float64',
    "horsepower": 'float64',
    "peak-rpm": 'float64',
})

In [5]:
# Print each column's non-null count and dtype to check the casts in the cells above.
automobile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   symboling          159 non-null    float64 
 1   normalized-losses  159 non-null    int64   
 2   make               159 non-null    category
 3   fuel-type          159 non-null    category
 4   aspiration         159 non-null    category
 5   num-of-doors       159 non-null    category
 6   body-style         159 non-null    category
 7   drive-wheels       159 non-null    category
 8   engine-location    159 non-null    category
 9   wheel-base         159 non-null    float64 
 10  length             159 non-null    float64 
 11  width              159 non-null    float64 
 12  height             159 non-null    float64 
 13  curb-weight        159 non-null    int64   
 14  engine-type        159 non-null    category
 15  num-of-cylinders   159 non-null    category
 16  engine-s

In [6]:
class GetDummies(BaseEstimator, TransformerMixin):
    """One-hot encode the notebook's categorical columns with ``pd.get_dummies``.

    ``fit`` records the levels of every column in ``CATEGORICAL_COLUMNS``;
    ``transform`` then emits one indicator column per recorded level, so the
    output has the same columns for every frame that is transformed (for
    example the train and the test split). A value that was not seen during
    ``fit`` gets an all-zero row in that column's indicator group.

    Indicator columns keep the bare level as their label, so two source
    columns that share a level (e.g. "four") produce identically named
    output columns.
    """

    # Columns that are replaced by indicator columns, in output order.
    CATEGORICAL_COLUMNS = (
        "make",
        "fuel-type",
        "aspiration",
        "num-of-doors",
        "body-style",
        "drive-wheels",
        "engine-location",
        "engine-type",
        "num-of-cylinders",
        "fuel-system",
    )

    def __init__(self):
        # No hyper-parameters. The constructor must stay free of side effects
        # (it used to print "init"), since estimators may be constructed many times.
        pass

    def fit(self, X, y = None):
        """Record the levels of each categorical column of ``X``.

        Columns that already have a categorical dtype keep their declared
        levels (including unused ones); other columns use their observed values.

        Raises:
            KeyError: if ``X`` lacks one of ``CATEGORICAL_COLUMNS``.
        """
        self.categories_ = {
            column: X[column].astype("category").cat.categories
            for column in self.CATEGORICAL_COLUMNS
        }
        return self

    def transform(self, X, y = None):
        """Return ``X`` with each categorical column replaced by its indicators.

        Columns of ``X`` that are not in ``CATEGORICAL_COLUMNS`` are kept as-is
        and come first in the result. If ``fit`` has not been called, the
        indicators are derived from ``X`` alone (the previous behaviour).

        Raises:
            KeyError: if ``X`` lacks one of ``CATEGORICAL_COLUMNS``.
        """
        columns = list(self.CATEGORICAL_COLUMNS)
        learned = getattr(self, "categories_", None)

        indicator_frames = []
        for column in columns:
            values = X[column]
            if learned is not None:
                # Re-code against the fitted levels so the output columns do
                # not depend on which values happen to be present in X.
                values = values.astype(pd.CategoricalDtype(learned[column]))
            indicator_frames.append(pd.get_dummies(values))

        remainder = X.drop(columns, axis=1)
        return pd.concat([remainder, *indicator_frames], axis=1)

In [8]:
class TypeSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the DataFrame columns of one dtype."""

    def __init__(self, dtype):
        # Handed to DataFrame.select_dtypes(include=[...]) in transform.
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by ``self.dtype``.

        ``X`` must be a pandas DataFrame (checked with an assertion).
        """
        assert isinstance(X, pd.DataFrame)
        wanted = [self.dtype]
        return X.select_dtypes(include=wanted)
    
class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints one column of the data it receives.

    Intended to be dropped between two pipeline steps while debugging: it
    prints ``X[:, 10]`` and hands ``X`` on unchanged.
    """

    def __init__(self, dtype):
        # Not read by this step; kept so existing Debug(<dtype>) calls still work.
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Print column 10 of ``X`` and return ``X`` unchanged.

        ``X`` must support ``X[:, 10]`` indexing (e.g. a 2-D NumPy array with
        at least 11 columns).
        """
        print(X[:,10])
        # A transformer has to return the data; without this the next
        # pipeline step would receive None.
        return X

# Numeric branch: pick the numeric columns and rescale them with MinMaxScaler.
# (An imputer and a Debug step were tried here and are currently disabled.)
numeric_branch = Pipeline(steps=[
    ('selector', TypeSelector(np.number)),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: pick the 'category' columns and one-hot encode them.
categorical_branch = Pipeline(steps=[
    ('selector', TypeSelector('category')),
    ('get_dummies', GetDummies()),
])

# Both branches are stacked side by side and fed to a Ridge regressor.
modelo = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('numericals', numeric_branch),
        ('categoricals', categorical_branch),
    ])),
    ('model', Ridge()),
])

init


In [9]:
# Target is the price column; every other column is a feature.
Y = automobile["price"]
X = automobile.drop(columns="price")

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=3,
)

# Fit the whole pipeline on the training split (the fitted pipeline is displayed).
modelo.fit(x_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('numericals',
                                                 Pipeline(steps=[('selector',
                                                                  TypeSelector(dtype=<class 'numpy.number'>)),
                                                                 ('scaler',
                                                                  MinMaxScaler())])),
                                                ('categoricals',
                                                 Pipeline(steps=[('selector',
                                                                  TypeSelector(dtype='category')),
                                                                 ('get_dummies',
                                                                  GetDummies())]))])),
                ('model', Ridge())])

In [10]:
# Predictions on the rows the model was fitted on ...
y_pred_train = modelo.predict(x_train)
# ... and on the held-out rows.
y_pred = modelo.predict(x_test)

In [12]:
# R^2 on the held-out split; pass (y_train, y_pred_train) instead to inspect the training fit.
r2_score(y_true=y_test, y_pred=y_pred)

0.9301879939710271