In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

from feature_engine.selection import (
    DropConstantFeatures, 
    DropCorrelatedFeatures,
    SmartCorrelatedSelection
)


In [2]:
# Load the car price dataset from a CSV file (path is relative to the working
# directory) and print a summary of its columns, non-null counts and dtypes.
car_df = pd.read_csv('CarPrice_Assignment.csv')
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [3]:
# Count the distinct values in each object-dtype column to see how many
# categories each one holds.
car_df.select_dtypes(include='object').nunique()

CarName           147
fueltype            2
aspiration          2
doornumber          2
carbody             5
drivewheel          3
enginelocation      2
enginetype          7
cylindernumber      7
fuelsystem          8
dtype: int64

In [4]:
# Column the model is trained to predict.
target = 'price'

# Feature names grouped by storage dtype, in the DataFrame's own column order.
# The target column is excluded from both lists.
numerical_columns = [
    col
    for col in car_df.select_dtypes(include=['int64', 'float64']).columns
    if col != target
]
categorical_columns = [
    col
    for col in car_df.select_dtypes(include=['object']).columns
    if col != target
]

In [5]:
# Assemble the feature matrix (numerical columns first, then categorical ones)
# and the target vector.
X = car_df[numerical_columns + categorical_columns]
y = car_df[target]

# Hold out a test split. random_state is fixed so that the split, and every
# metric computed from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
# Standalone use of the constant-feature selector: it is fitted on the
# training split only, and the fitted selector is then applied to both splits.
dcf = DropConstantFeatures()
dcf.fit(X_train)

X_train_feature_transform = dcf.transform(X_train)
X_test_feature_transform = dcf.transform(X_test)

# Here we build a new, transformed dataset containing only the selected
# features. We will rarely use a selector on its own like this, though:
# all of these steps go inside a pipeline instead.

In [7]:
# Numerical branch: fill missing values with the column mean.
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising an error.
categorical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ]
)

In [8]:
# Route each group of columns through its own preprocessing branch; the
# categorical block is listed first, followed by the numerical block.
# The step names are kept exactly as written (including the spelling
# 'categorial_transform') because they are the keys used to look up the
# fitted transformers and their parameters.
column_transform = ColumnTransformer(
    transformers=[
        ('categorial_transform', categorical_pipeline, categorical_columns),
        ('numerical_transform', numerical_pipeline, numerical_columns),
    ]
)

In [15]:
# End-to-end model: preprocessing, three feature-selection steps, then an
# ordinary least-squares regressor. All selectors use their default settings.
# NOTE(review): both correlation-based selectors run with defaults, so the
# second one may have nothing left to remove after the first -- confirm.
pipe = Pipeline(
    steps=[
        ('preprocessor', column_transform),
        # drop constant features
        ('drop_constant_features', DropConstantFeatures()),
        # drop correlated features
        ('drop_correlated_features', DropCorrelatedFeatures()),
        # drop correlated features (smart selection within correlated groups)
        ('drop_smart_correlated_features', SmartCorrelatedSelection()),
        ('model', LinearRegression()),
    ]
)

In [16]:
# Fit the whole pipeline (preprocessing, feature selection and regression) on
# the training split.
pipe.fit(X_train, y_train)

In [11]:
# Predict prices for the held-out split with the fitted pipeline.
y_pred = pipe.predict(X_test)

# Compute and report the test-set mean squared error. The cell output shows
# this metric, so the computation is made explicit here instead of relying on
# leftover kernel state.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Square Error: {mse}')

Mean Square Error: 350615667.674325
