# Modelagem

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# bibliotecas para modelagem de dados

from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor 

import xgboost as xgb

# graficos de residuos

#from yellowbrick.regressor import ResidualsPlot
#from yellowbrick.regressor import PredictionError

####  Importação dos Dados

In [2]:
dados = pd.read_csv("C:\\Users\\Caio\\Desktop\\Linear Regression - Project\\Dados\\Car details v3.csv")

In [3]:
dados.head(3)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0


####  Tem algumas variáveis numéricas (mileage, engine, max_power) que estão mescladas com letras. Vamos retirar essas letras das colunas numéricas

In [3]:
# criando uma coluna com a idade dos automóveis
dados['age'] = 2021-dados['year'] 


# retirando todas as letras da coluna mileage, max_power eengine


dados['engine'] = dados['engine'].str.replace('\D', '')

dados['max_power'] = dados['max_power'].str.replace('\D', '')

dados['mileage'] = dados['mileage'].str.replace('\D', '')


# convertendo para numérico


dados['mileage'] = pd.to_numeric(dados['mileage'])

dados['max_power'] = pd.to_numeric(dados['max_power'])

dados['engine'] = pd.to_numeric(dados['engine'])


# Eliminando colunas que não iremos utilizar

dados.drop(['name', 'torque', 'year'], axis = 1, inplace=True)


  dados['engine'] = dados['engine'].str.replace('\D', '')
  dados['max_power'] = dados['max_power'].str.replace('\D', '')
  dados['mileage'] = dados['mileage'].str.replace('\D', '')


## Modelagem dos Dados

In [4]:
# LOG TRANSFORMATION

dados['selling_price'] = np.log(dados['selling_price'])
dados['km_driven'] = np.log(dados['km_driven'])
dados['engine'] = np.log(dados['engine'])


In [5]:
features = dados.drop('selling_price', axis = 1)
resposta = dados['selling_price']

#### Train test split

In [6]:
x_train, x_test, y_train, y_test = train_test_split(features, resposta, test_size = 0.2, random_state= 2)

In [7]:
print(x_train['engine'].median())
print(x_train['max_power'].median())
print(x_train['seats'].median())
print(x_train['mileage'].median())

7.129297548929373
739.0
5.0
240.0


### Pipelines dos seguintes modelos: Linear Regression, Random Forest e XGBoost

In [8]:
# Separando variáveis categóricas e numéricas

col_categorico = [coluna for coluna in x_train.columns if x_train[coluna].dtype.name == 'object']
col_numerico = [coluna for coluna in x_train.columns if x_train[coluna].dtype.name != 'object']

In [9]:
# Pipeline Linear Regression

numeric_transformer = Pipeline(
    steps = [('scaler', StandardScaler()),
             ('imputer', SimpleImputer(strategy='median'))]
)

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, col_numerico),
        ("cat", categorical_transformer, col_categorico),
    ]
)

pipe = Pipeline(
    steps=[("preprocessor", preprocessor),("lin_reg", LinearRegression())]
)


In [10]:
# Pipeline Random Forest Regression

num_trans_rf = Pipeline(
    steps = [('missing', SimpleImputer(strategy='median'))]
)

preprocessor_rf = ColumnTransformer(
    transformers=[
        ('numerico', num_trans_rf, col_numerico),
        ('categorico', categorical_transformer, col_categorico)
    ]
)

pipe_rf = Pipeline(
    steps=[('preprocessor', preprocessor_rf), ('randomForestRegressor', RandomForestRegressor())]
)


In [11]:
# Pipeline XGBoost Regression



numeric_transformer = Pipeline(
    steps = [('scaler', StandardScaler()),
             ('imputer', SimpleImputer(strategy='median'))]
)

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, col_numerico),
        ("cat", categorical_transformer, col_categorico),
    ]
)


xgboost = xgb.XGBRegressor

pipe_xgboost = Pipeline(
    steps=[("preprocessor", preprocessor),("xgboostR", xgboost())]
)


### Tuning Hyperparameters (Linear Regression não possui hiperparametros para tunar)

In [12]:
# Tuning Hyperparameters Random Forest Regressor


params = {'randomForestRegressor__n_estimators': [10,20,30,40],
          'randomForestRegressor__max_features': ['auto', 'sqrt'],
          'randomForestRegressor__max_depth':[10,20,30,40],
          'randomForestRegressor__min_samples_split':[4,6,10,13],
          'randomForestRegressor__min_samples_leaf':[4,7,10,15]}


pipe_rf_grid = GridSearchCV(estimator=pipe_rf,
                            param_grid=params,
                            scoring='r2',
                            cv = 3,
                            n_jobs=-1)

In [13]:
# Tuning Hyperparameters XGBOOST

params_xgboots  = {
    "xgboostR__learning_rate":[0.05, 0.10, 0.15],
    "xgboostR__max_depth": [3,5,6,8],
    "xgboostR__min_child_weight":[1,3,5,7],
    "xgboostR__gamma": [0.0,0.1,0.2]


}

pipe_xgboost = GridSearchCV(estimator=pipe_xgboost,
                            param_grid= params_xgboots,
                            scoring='r2',
                            cv = 3,
                            n_jobs=-1
)



#### Agora vamos treinar os 3 modelos

In [14]:
modelos = [pipe, pipe_rf_grid, pipe_xgboost]

In [15]:
for i in modelos:
    i.fit(x_train, y_train)

  warn(


### Cross Validation with 5 cv Folds

In [16]:
# Cross Validation Linear Regression

scores_LR = cross_validate(pipe, x_train, y_train, scoring = ['r2','neg_root_mean_squared_error'], cv = 5)

In [17]:
# Cross Validation Random Forest


scores_RF = cross_validate(pipe_rf_grid, x_train, y_train, scoring = ['r2','neg_root_mean_squared_error'], cv = 5)

  warn(
  warn(
  warn(
  warn(
  warn(


In [18]:
# XGBOOST

scores_XGBoost = cross_validate(pipe_xgboost, x_train, y_train, scoring = ['r2','neg_root_mean_squared_error'], cv = 5)

#### Como foi o desempenho dos modelos no Cross Validation

In [19]:
# Scores Cross Validation Linear Regression
print('METRICS CROSS VALIDATION LINEAR REGRESSION')
print('-----------------------------------------')


print('R2 Score')
contador = 0
contador2 = 0
for i in scores_LR['test_r2']:
    contador = contador + 1
    print("Fold {} - R2:    {:.4f}" .format(contador, i))


print('-----------------------------------------')

print('RMSE Score')

for j in scores_LR['test_neg_root_mean_squared_error']:
    contador2 = contador2 + 1
    print("Fold {} - RMSE: {}" .format(contador2, -j))

print('------------------------------------------')
print('Média do R2 e RMSE do Cross Validation - 5 Fold')
    
    
print("Média R2:    {}".format(scores_LR['test_r2'].mean()))
print("Média RMSE: {}".format(-scores_LR['test_neg_root_mean_squared_error'].mean()))

METRICS CROSS VALIDATION LINEAR REGRESSION
-----------------------------------------
R2 Score
Fold 1 - R2:    0.8154
Fold 2 - R2:    0.8075
Fold 3 - R2:    0.8347
Fold 4 - R2:    0.8247
Fold 5 - R2:    0.8179
-----------------------------------------
RMSE Score
Fold 1 - RMSE: 0.3568028987114164
Fold 2 - RMSE: 0.3732916364951511
Fold 3 - RMSE: 0.3395061881085028
Fold 4 - RMSE: 0.36072335850286164
Fold 5 - RMSE: 0.36061427091658865
------------------------------------------
Média do R2 e RMSE do Cross Validation - 5 Fold
Média R2:    0.8200401668162549
Média RMSE: 0.35818767054690415


In [20]:
# Scores Cross Validation RANDOM FOREST
print('METRICS CROSS VALIDATION RANDOM FOREST')
print('-----------------------------------------')


print('R2 Score')
contador = 0
contador2 = 0
for i in scores_RF['test_r2']:
    contador = contador + 1
    print("Fold {} - R2:    {:.4f}" .format(contador, i))


print('-----------------------------------------')

print('RMSE Score')

for j in scores_RF['test_neg_root_mean_squared_error']:
    contador2 = contador2 + 1
    print("Fold {} - RMSE: {}" .format(contador2, -j))

print('------------------------------------------')
print('Média do R2 e RMSE do Cross Validation - 5 Fold')
    
    
print("Média R2:    {}".format(scores_RF['test_r2'].mean()))
print("Média RMSE: {}".format(-scores_RF['test_neg_root_mean_squared_error'].mean()))

METRICS CROSS VALIDATION RANDOM FOREST
-----------------------------------------
R2 Score
Fold 1 - R2:    0.9150
Fold 2 - R2:    0.9220
Fold 3 - R2:    0.9319
Fold 4 - R2:    0.9213
Fold 5 - R2:    0.9141
-----------------------------------------
RMSE Score
Fold 1 - RMSE: 0.2421737120546579
Fold 2 - RMSE: 0.23761818308618518
Fold 3 - RMSE: 0.2178741668380389
Fold 4 - RMSE: 0.24169867493966374
Fold 5 - RMSE: 0.24770861841895161
------------------------------------------
Média do R2 e RMSE do Cross Validation - 5 Fold
Média R2:    0.9208517238267477
Média RMSE: 0.23741467106749944


In [21]:
# Scores Cross Validation XGBOOST
print('METRICS CROSS VALIDATION RANDOM FOREST')
print('-----------------------------------------')


print('R2 Score')
contador = 0
contador2 = 0
for i in scores_XGBoost['test_r2']:
    contador = contador + 1
    print("Fold {} - R2:    {:.4f}" .format(contador, i))


print('-----------------------------------------')

print('RMSE Score')

for j in scores_XGBoost['test_neg_root_mean_squared_error']:
    contador2 = contador2 + 1
    print("Fold {} - RMSE: {}" .format(contador2, -j))

print('------------------------------------------')
print('Média do R2 e RMSE do Cross Validation - 5 Fold')
    
    
print("Média R2:    {}".format(scores_XGBoost['test_r2'].mean()))
print("Média RMSE: {}".format(-scores_XGBoost['test_neg_root_mean_squared_error'].mean()))

METRICS CROSS VALIDATION RANDOM FOREST
-----------------------------------------
R2 Score
Fold 1 - R2:    0.9278
Fold 2 - R2:    0.9296
Fold 3 - R2:    0.9406
Fold 4 - R2:    0.9347
Fold 5 - R2:    0.9299
-----------------------------------------
RMSE Score
Fold 1 - RMSE: 0.2231662689230814
Fold 2 - RMSE: 0.22578517313213062
Fold 3 - RMSE: 0.20344632222948253
Fold 4 - RMSE: 0.22018463053460627
Fold 5 - RMSE: 0.22376590877717475
------------------------------------------
Média do R2 e RMSE do Cross Validation - 5 Fold
Média R2:    0.9325144898661126
Média RMSE: 0.21926966071929513


In [22]:
xgboost_media_r2 =scores_XGBoost['test_r2'].mean()
xgboost_media_rmse = scores_XGBoost['test_neg_root_mean_squared_error'].mean()

lista = ['Linear_Regression', 'RF', 'XGboost']
metricas = ['R2', 'RMSE']


#### Vamos comparar as métricas dos modelos

In [23]:
print('Comparativo entre as métricas Cross Validation dos Modelos')
print('-----------------------------------------------------------')

print("Linear Regression:")
print("R2 mean: {}".format(scores_LR['test_r2'].mean()))
print("RMSE mean: {}".format(-scores_LR['test_neg_root_mean_squared_error'].mean()))
print('-----------------------------------------------------------')

print("Random Forest Regression:")
print("R2 mean: {}".format(scores_RF['test_r2'].mean()))
print("RMSE mean: {}".format(-scores_RF['test_neg_root_mean_squared_error'].mean()))

print('-----------------------------------------------------------')

print("XGBOOST Regression:")
print("R2 mean: {}".format(scores_XGBoost['test_r2'].mean()))
print("RMSE mean: {}".format(-scores_XGBoost['test_neg_root_mean_squared_error'].mean()))



Comparativo entre as métricas Cross Validation dos Modelos
-----------------------------------------------------------
Linear Regression:
R2 mean: 0.8200401668162549
RMSE mean: 0.35818767054690415
-----------------------------------------------------------
Random Forest Regression:
R2 mean: 0.9208517238267477
RMSE mean: 0.23741467106749944
-----------------------------------------------------------
XGBOOST Regression:
R2 mean: 0.9325144898661126
RMSE mean: 0.21926966071929513


#### Iremos selecionar o modelo do XGBoost


#### Vamos a base de teste

In [24]:
final_predictions = pipe_xgboost.predict(x_test)

In [25]:
mse_test = mean_squared_error(y_test, final_predictions)
r_quadrado_test = r2_score(y_test, final_predictions)

In [26]:
print("Mean Squared Error Test: {} ".format(np.sqrt(mse_test)))
print("R2 Test: {}".format(r_quadrado_test))

Mean Squared Error Test: 0.2136232460729033 
R2 Test: 0.9314417674366186
