In [227]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin

In [237]:
automobile = pd.read_csv("automobile_pre_processados.csv")
automobile.head()

Unnamed: 0,symboling,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [262]:
automobile["num-of-cylinders"].value_counts()

four      153
six        24
five       10
eight       4
twelve      1
three       1
Name: num-of-cylinders, dtype: int64

In [238]:
automobile["make"] = automobile["make"].astype('category')
#automobile["normalized-losses"] = automobile["normalized-losses"].astype('category')
automobile["fuel-type"] = automobile["fuel-type"].astype('category')
automobile["aspiration"] = automobile["aspiration"].astype('category')
automobile["num-of-doors"] = automobile["num-of-doors"].astype('category')
automobile["body-style"] = automobile["body-style"].astype('category')
automobile["drive-wheels"] = automobile["drive-wheels"].astype('category')
automobile["engine-location"] = automobile["engine-location"].astype('category')
automobile["engine-type"] = automobile["engine-type"].astype('category')
automobile["num-of-cylinders"] = automobile["num-of-cylinders"].astype('category')
automobile["fuel-system"] = automobile["fuel-system"].astype('category')

In [239]:
automobile["symboling"] = automobile["symboling"].astype('float64')
#automobile["normalized-losses"] = automobile["normalized-losses"].astype('float64')
automobile["price"] = automobile["price"].astype('float64')
automobile["bore"] = automobile["bore"].astype('float64')
automobile["stroke"] = automobile["stroke"].astype('float64')
automobile["horsepower"] = automobile["horsepower"].astype('float64')
automobile["peak-rpm"] = automobile["peak-rpm"].astype('float64')
automobile["price"] = automobile["price"].astype('float64')

In [240]:
automobile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   symboling          193 non-null    float64 
 1   make               193 non-null    category
 2   fuel-type          193 non-null    category
 3   aspiration         193 non-null    category
 4   num-of-doors       193 non-null    category
 5   body-style         193 non-null    category
 6   drive-wheels       193 non-null    category
 7   engine-location    193 non-null    category
 8   wheel-base         193 non-null    float64 
 9   length             193 non-null    float64 
 10  width              193 non-null    float64 
 11  height             193 non-null    float64 
 12  curb-weight        193 non-null    int64   
 13  engine-type        193 non-null    category
 14  num-of-cylinders   193 non-null    category
 15  engine-size        193 non-null    int64   
 16  fuel-sys

In [241]:
class GetDummies(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("init")
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        make = pd.get_dummies(X["make"])
        fuel_type = pd.get_dummies(X["fuel-type"])
        aspiration = pd.get_dummies(X["aspiration"])
        num_of_doors = pd.get_dummies(X["num-of-doors"])
        body_style = pd.get_dummies(X["body-style"])
        drive_wheels = pd.get_dummies(X["drive-wheels"])
        engine_location = pd.get_dummies(X["engine-location"])
        engine_type = pd.get_dummies(X["engine-type"])
        num_of_cylinders = pd.get_dummies(X["num-of-cylinders"])
        fuel_system = pd.get_dummies(X["fuel-system"])
        
        X = X.drop(["make", "fuel-type", "aspiration", "num-of-doors", "body-style", "drive-wheels", "engine-location", "engine-type", "num-of-cylinders", "fuel-system"], axis=1)
        X = pd.concat([X, make, fuel_type, aspiration, num_of_doors, body_style, drive_wheels, engine_location, engine_type, num_of_cylinders, fuel_system], axis=1)
        return X

In [242]:
class TypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):        
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])
    
class Debug(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):        
        print(X[:,10])

modelo = Pipeline([
  ('features', FeatureUnion(
      transformer_list=[
          ('numericals', Pipeline([
              ('selector', TypeSelector(np.number)),
              #('imputer', SimpleImputer(strategy='mean')),
              #('debug', Debug(np.number)),
              ('scaler', MinMaxScaler())
          ])),
          ('categoricals', Pipeline([
              ('selector', TypeSelector('category')),
              ('get_dummies', GetDummies())              
          ]))
      ]
  )),
  ('model', LinearRegression())
])

init


In [252]:
X = automobile.drop("price", axis=1)
Y = automobile["price"]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state = 123)
modelo.fit(x_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('numericals',
                                                 Pipeline(steps=[('selector',
                                                                  TypeSelector(dtype=<class 'numpy.number'>)),
                                                                 ('scaler',
                                                                  MinMaxScaler())])),
                                                ('categoricals',
                                                 Pipeline(steps=[('selector',
                                                                  TypeSelector(dtype='category')),
                                                                 ('get_dummies',
                                                                  GetDummies())]))])),
                ('model', LinearRegression())])

In [253]:
y_pred = modelo.predict(x_test)
y_pred_train = modelo.predict(x_train)

In [254]:
y_pred

array([6.72000000e+03, 1.80320000e+04, 1.06720000e+04, 9.15200000e+03,
       5.39200000e+03, 9.88800000e+03, 1.00160000e+04, 1.64640000e+04,
       1.62400000e+04, 9.08800000e+03, 9.06992336e+15, 7.66400000e+03,
       5.52000000e+03, 3.87520000e+04, 1.49120000e+04, 1.15680000e+04,
       6.86400000e+03, 1.70240000e+04, 8.56000000e+03, 5.72800000e+03,
       9.08800000e+03, 4.36518832e+16, 9.34400000e+03, 8.92800000e+03,
       1.15360000e+04, 1.53120000e+04, 8.43200000e+03, 1.64000000e+04,
       7.48800000e+03, 6.56000000e+03, 8.97600000e+03, 5.40800000e+03,
       9.07200000e+03, 1.31360000e+04, 8.54400000e+03, 6.46400000e+03,
       7.40800000e+03, 6.75200000e+03, 6.40000000e+03, 4.91200000e+03,
       1.08800000e+04, 1.07680000e+04, 5.20000000e+03, 1.06720000e+04,
       5.98400000e+03, 2.10720000e+04, 7.20000000e+03, 6.64000000e+03,
       9.16800000e+03, 6.17600000e+03, 1.59520000e+04, 1.35840000e+04,
       2.07680000e+04, 9.50400000e+03, 3.74400000e+03, 1.15680000e+04,
      

In [226]:
r2_score(y_test, y_pred)
#r2_score(y_train, y_pred_train)

0.839640487138459

In [259]:
pd.set_option('display.max_columns', None)

x_test.loc[10, :]

symboling                0
make                   bmw
fuel-type              gas
aspiration             std
num-of-doors          four
body-style           sedan
drive-wheels           rwd
engine-location      front
wheel-base           101.2
length               176.8
width                 64.8
height                54.3
curb-weight           2395
engine-type            ohc
num-of-cylinders      four
engine-size            108
fuel-system           mpfi
bore                   3.5
stroke                 2.8
compression-ratio      8.8
horsepower             101
peak-rpm              5800
city-mpg                23
highway-mpg             29
Name: 10, dtype: object

In [248]:
y_pred[-5:-2]

array([ 5.94885773e+13,  7.67348689e+15, -4.05312781e+14])

In [249]:
y_test[-5:-2]

17     5151.0
42    11048.0
41     6785.0
Name: price, dtype: float64