In [1]:
%pylab inline

import pandas as pd
import plotly.express as px

from sklearn.model_selection import train_test_split

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Mount Google Drive at /content/drive (Colab-only API).
# NOTE(review): the CSV path used later starts with 'drive/MyDrive', so it presumably
# relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Fix the random state for reproducibility.
# NOTE(review): there is no `import random` in this notebook; `random` is presumably
# numpy.random injected by `%pylab inline` — confirm. `np.random.seed(38)` would be more explicit.
random.seed(38)

**Objetivo**
- Predizer o valor da tarifa do seguro de vida de um morador dos EUA baseado em suas informações pessoais.

# Leitura dos Dados

In [4]:
# Load the insurance dataset from the mounted Drive folder and preview the first 10 rows.
csv_path = 'drive/MyDrive/ASA Arcelor Mittal 2023/data/dataset seguro - regressão.csv'
df = pd.read_csv(csv_path)
df.head(10)

Unnamed: 0,idade,sexo,imc,filhos,fumante,região,tarifa
0,19,feminino,27.9,0.0,sim,sudoeste,16884.924
1,18,masculino,33.77,1.0,nao,sudeste,1725.5523
2,28,masculino,33.0,3.0,nao,sudeste,4449.462
3,33,masculino,22.705,0.0,nao,noroeste,21984.47061
4,32,masculino,28.88,0.0,nao,noroeste,3866.8552
5,31,feminino,25.74,0.0,nao,sudeste,3756.6216
6,46,feminino,33.44,1.0,nao,sudeste,8240.5896
7,37,feminino,27.74,3.0,nao,noroeste,7281.5056
8,37,masculino,29.83,2.0,nao,nordeste,6406.4107
9,60,feminino,25.84,0.0,nao,noroeste,28923.13692


Note bem a ordem de grandeza dos valores de tarifa. Isso será importante mais pra frente.

In [5]:
# Distribution of the target column `tarifa`.
px.histogram(df['tarifa'])

In [6]:
# Column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   idade    1338 non-null   int64  
 1   sexo     1335 non-null   object 
 2   imc      1337 non-null   float64
 3   filhos   1336 non-null   float64
 4   fumante  1336 non-null   object 
 5   região   1337 non-null   object 
 6   tarifa   1338 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 73.3+ KB


In [7]:
# Number of distinct values per column, in ascending order.
df.nunique().sort_values()

sexo          2
fumante       2
região        4
filhos        6
idade        47
imc         548
tarifa     1337
dtype: int64

Já sabemos que sexo, fumante e região são categóricas pelo seu tipo (texto). Mas e a coluna filhos? Ela tem poucos valores distintos...

**Isso demonstra a importância de conhecer a variável (seu tipo e comportamento).**

# Tratamento dos dados
Dependendo do algoritmo utilizado, algumas etapas de tratamento dos dados podem ser necessárias:
+ Tratamento de nulos
+ *Encoding*
+ Normalização

Como vamos usar regressão linear e esse algoritmo não possui metodologia própria para lidar com essas situações, todas as etapas citadas são necessárias.

## Tratamento de Nulos
Para realizar o tratamento de nulos é, mais uma vez, importante conhecer seu dado.

Ele é quantitativo discreto? Quantitativo contínuo? Categórico? Cada um tem uma maneira de lidar.



In [8]:
# Count of missing values per column.
df.isna().sum(axis=0)

idade      0
sexo       3
imc        1
filhos     2
fumante    2
região     1
tarifa     0
dtype: int64

In [9]:
# Count of missing values per row.
df.isna().sum(axis=1)

0       0
1       0
2       0
3       0
4       0
       ..
1333    0
1334    0
1335    0
1336    0
1337    0
Length: 1338, dtype: int64

In [10]:
# Rows that contain at least one missing value.
df[df.isna().any(axis=1)]

Unnamed: 0,idade,sexo,imc,filhos,fumante,região,tarifa
40,24,feminino,26.6,,nao,nordeste,3046.062
176,38,masculino,27.835,2.0,,noroeste,6455.86265
1158,20,,,,,,2459.7201
1278,39,,29.925,1.0,sim,nordeste,22462.04375
1302,25,,20.8,1.0,nao,sudoeste,3208.787


In [11]:
# Drop rows that are missing more than one field instead of dropping a hard-coded
# row label: the rule keeps working if the CSV is re-exported or reordered.
# In the data shown above this removes only row 1158 (five nulls); the rows with a
# single null are kept so they can be imputed afterwards.
MAX_MISSING_PER_ROW = 1
# .copy() makes df_clean an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning or touch `df`.
df_clean = df.loc[df.isna().sum(axis=1) <= MAX_MISSING_PER_ROW].copy()
df_clean['sexo'].value_counts()

masculino    675
feminino     660
Name: sexo, dtype: int64

In [12]:
# Bar chart of the value counts of `sexo`.
px.bar(df_clean['sexo'].value_counts())

In [13]:
# Bar chart of the value counts of `filhos`.
px.bar(df_clean['filhos'].value_counts())

In [14]:
from sklearn.impute import SimpleImputer

# Columns whose missing values are filled with the most frequent value of the column.
mode_cols = ['sexo', 'filhos', 'fumante']

cat_imp = SimpleImputer(strategy='most_frequent')
cat_imp.fit(df_clean[mode_cols])  # learns the most frequent value of each column

In [15]:
cat_imp.transform(df_clean[['sexo', 'filhos', 'fumante']]) # fills the gaps and returns a numpy array (only displayed here, not stored)

array([['feminino', 0.0, 'sim'],
       ['masculino', 1.0, 'nao'],
       ['masculino', 3.0, 'nao'],
       ...,
       ['feminino', 0.0, 'nao'],
       ['feminino', 0.0, 'nao'],
       ['feminino', 0.0, 'sim']], dtype=object)

In [16]:
# Write the imputed values back into the DataFrame columns.
imputed_cols = ['sexo', 'filhos', 'fumante']
df_clean[imputed_cols] = cat_imp.transform(df_clean[imputed_cols])

# Check whether any row still has missing values (an empty frame is expected).
df_clean[df_clean.isna().any(axis=1)]

Unnamed: 0,idade,sexo,imc,filhos,fumante,região,tarifa


In [17]:
# Re-check non-null counts and dtypes after imputation.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   idade    1337 non-null   int64  
 1   sexo     1337 non-null   object 
 2   imc      1337 non-null   float64
 3   filhos   1337 non-null   object 
 4   fumante  1337 non-null   object 
 5   região   1337 non-null   object 
 6   tarifa   1337 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 83.6+ KB


## *Encoding*
Para fazer o encoding das variáveis de texto, precisamos analisar quantas categorias são e qual tipo de codificação vamos usar.

No nosso exemplo, fumante e sexo possuem apenas 2 valores distintos, enquanto região possui 4.

In [18]:
from sklearn.preprocessing import OrdinalEncoder

# Two-valued text columns are mapped to integer codes.
binary_cols = ['sexo', 'fumante']
ord_enc = OrdinalEncoder(dtype=int).fit(df_clean[binary_cols])

# Work on a copy so df_clean keeps the original text values; the transformed
# array is assigned straight back into the DataFrame columns.
df_enc = df_clean.copy()
df_enc[binary_cols] = ord_enc.transform(df_clean[binary_cols])
df_enc

Unnamed: 0,idade,sexo,imc,filhos,fumante,região,tarifa
0,19,0,27.900,0.0,1,sudoeste,16884.92400
1,18,1,33.770,1.0,0,sudeste,1725.55230
2,28,1,33.000,3.0,0,sudeste,4449.46200
3,33,1,22.705,0.0,0,noroeste,21984.47061
4,32,1,28.880,0.0,0,noroeste,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3.0,0,noroeste,10600.54830
1334,18,0,31.920,0.0,0,nordeste,2205.98080
1335,18,0,36.850,0.0,0,sudeste,1629.83350
1336,21,0,25.800,0.0,0,sudoeste,2007.94500


In [19]:
# One-hot encode `região` into one indicator column per region.
# NOTE(review): `drop_first` is not passed, so all four region columns are kept;
# the model cell further down is fitted with fit_intercept=False.
df_enc = pd.get_dummies(df_enc, columns=['região'])
df_enc

Unnamed: 0,idade,sexo,imc,filhos,fumante,tarifa,região_nordeste,região_noroeste,região_sudeste,região_sudoeste
0,19,0,27.900,0.0,1,16884.92400,0,0,0,1
1,18,1,33.770,1.0,0,1725.55230,0,0,1,0
2,28,1,33.000,3.0,0,4449.46200,0,0,1,0
3,33,1,22.705,0.0,0,21984.47061,0,1,0,0
4,32,1,28.880,0.0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3.0,0,10600.54830,0,1,0,0
1334,18,0,31.920,0.0,0,2205.98080,1,0,0,0
1335,18,0,36.850,0.0,0,1629.83350,0,0,1,0
1336,21,0,25.800,0.0,0,2007.94500,0,0,0,1


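**Em uma regressão linear com intercepto, é importante usar o argumento `drop_first=True` no `pd.get_dummies`, porque ele remove uma das colunas do One Hot Encoding. Isso evita a colinearidade perfeita entre as colunas *dummy* e o intercepto, que causa problemas na resolução do sistema linear. Note que a célula acima não usa `drop_first=True`: as quatro colunas de região são mantidas e, mais adiante, o modelo é treinado com `fit_intercept=False`, de modo que essas colunas fazem o papel do intercepto.**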

In [20]:
# Inspect column dtypes after encoding.
df_enc.dtypes

idade                int64
sexo                 int64
imc                float64
filhos              object
fumante              int64
tarifa             float64
região_nordeste      uint8
região_noroeste      uint8
região_sudeste       uint8
região_sudoeste      uint8
dtype: object

In [21]:
# `filhos` is still object-typed at this point; cast it to integer.
df_enc = df_enc.astype({'filhos': int})
df_enc

Unnamed: 0,idade,sexo,imc,filhos,fumante,tarifa,região_nordeste,região_noroeste,região_sudeste,região_sudoeste
0,19,0,27.900,0,1,16884.92400,0,0,0,1
1,18,1,33.770,1,0,1725.55230,0,0,1,0
2,28,1,33.000,3,0,4449.46200,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,0,1,0,0
1334,18,0,31.920,0,0,2205.98080,1,0,0,0
1335,18,0,36.850,0,0,1629.83350,0,0,1,0
1336,21,0,25.800,0,0,2007.94500,0,0,0,1


## Criação de Features
Existem algumas técnicas de criação de features que podem nos ajudar na explicação do problema estudado.

Para variáveis numéricas:
+ Elevar ao quadrado, raiz quadrada, entre outros;
+ Multiplicar (dividir) uma variável por outra;

In [22]:
# Derived features from IMC: square, square root, and product / ratio with age.
imc_col, idade_col = df_enc['imc'], df_enc['idade']
df_enc = df_enc.assign(**{
    'imc**2': imc_col ** 2,
    'imc_raiz_2': imc_col ** 0.5,
    'imc*idade': imc_col * idade_col,
    'imc/idade': imc_col / idade_col,
})

df_enc

Unnamed: 0,idade,sexo,imc,filhos,fumante,tarifa,região_nordeste,região_noroeste,região_sudeste,região_sudoeste,imc**2,imc_raiz_2,imc*idade,imc/idade
0,19,0,27.900,0,1,16884.92400,0,0,0,1,778.410000,5.282045,530.100,1.468421
1,18,1,33.770,1,0,1725.55230,0,0,1,0,1140.412900,5.811196,607.860,1.876111
2,28,1,33.000,3,0,4449.46200,0,0,1,0,1089.000000,5.744563,924.000,1.178571
3,33,1,22.705,0,0,21984.47061,0,1,0,0,515.517025,4.764976,749.265,0.688030
4,32,1,28.880,0,0,3866.85520,0,1,0,0,834.054400,5.374012,924.160,0.902500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,0,1,0,0,959.140900,5.565070,1548.500,0.619400
1334,18,0,31.920,0,0,2205.98080,1,0,0,0,1018.886400,5.649779,574.560,1.773333
1335,18,0,36.850,0,0,1629.83350,0,0,1,0,1357.922500,6.070420,663.300,2.047222
1336,21,0,25.800,0,0,2007.94500,0,0,0,1,665.640000,5.079370,541.800,1.228571


In [23]:
# Pairwise correlation between the columns of df_enc (DataFrame.corr with default settings).
df_corr = df_enc.corr()
df_corr

Unnamed: 0,idade,sexo,imc,filhos,fumante,tarifa,região_nordeste,região_noroeste,região_sudeste,região_sudoeste,imc**2,imc_raiz_2,imc*idade,imc/idade
idade,1.0,-0.023429,0.109336,0.041582,-0.025558,0.298392,0.004292,-0.000988,-0.012278,0.009445,0.100678,0.113045,0.879715,-0.820145
sexo,-0.023429,1.0,0.043961,0.016369,0.075079,0.055412,-0.001934,-0.012438,0.015749,-0.001975,0.044026,0.043769,0.003435,0.035112
imc,0.109336,0.043961,1.0,0.012755,0.003746,0.198392,-0.138302,-0.136017,0.270057,-0.006211,0.991403,0.997771,0.539892,0.368355
filhos,0.041582,0.016369,0.012755,1.0,0.007331,0.067432,-0.021637,0.024432,-0.023492,0.021538,0.012401,0.012942,0.042434,-0.118412
fumante,-0.025558,0.075079,0.003746,0.007331,1.0,0.787223,0.003487,-0.037168,0.068282,-0.037168,0.007007,0.002069,-0.023395,0.024875
tarifa,0.298392,0.055412,0.198392,0.067432,0.787223,1.0,0.007541,-0.0403,0.073605,-0.043607,0.192967,0.199465,0.334252,-0.158892
região_nordeste,0.004292,-0.001934,-0.138302,-0.021637,0.003487,0.007541,1.0,-0.319841,-0.345205,-0.319841,-0.132926,-0.140473,-0.055662,-0.066545
região_noroeste,-0.000988,-0.012438,-0.136017,0.024432,-0.037168,-0.0403,-0.319841,1.0,-0.346614,-0.321146,-0.14392,-0.130996,-0.065186,-0.060797
região_sudeste,-0.012278,0.015749,0.270057,-0.023492,0.068282,0.073605,-0.345205,-0.346614,1.0,-0.346614,0.279168,0.264008,0.103801,0.147649
região_sudoeste,0.009445,-0.001975,-0.006211,0.021538,-0.037168,-0.043607,-0.319841,-0.321146,-0.346614,1.0,-0.013128,-0.002789,0.013017,-0.026014


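Não devemos incluir features fortemente correlacionadas entre si (quase linearmente dependentes): `imc**2` e `imc_raiz_2` têm correlação acima de 0,99 com `imc`, por isso serão removidas.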

In [24]:
# Remove the squared and square-root IMC features created above.
df_enc = df_enc.drop(columns=['imc**2', 'imc_raiz_2'])

## Separação treino-teste

In [25]:
# Randomly hold out 30% of the rows as the test set; the remaining 70% are the training set.
# No `stratify` argument is passed, so the split is purely random (the target is a
# continuous value, there are no classes to balance). random_state fixes the split.
data_train, data_test = train_test_split(df_enc, test_size=0.3, random_state=38)

print(data_train.shape, data_test.shape, sep='\n')

(935, 12)
(402, 12)


In [26]:
target_col = 'tarifa'

# Features: every column except the target.
X_train = data_train.drop(columns=target_col)
X_test = data_test.drop(columns=target_col)

# Target: kept as a one-column DataFrame named 'y'.
y_train = data_train[[target_col]].rename(columns={target_col: 'y'})
y_test = data_test[[target_col]].rename(columns={target_col: 'y'})

In [27]:
# First two training targets (original row labels, original units).
y_train.head(2)

Unnamed: 0,y
912,14382.70905
931,6238.298


In [28]:
# First training rows before rescaling.
X_train.head()

Unnamed: 0,idade,sexo,imc,filhos,fumante,região_nordeste,região_noroeste,região_sudeste,região_sudoeste,imc*idade,imc/idade
912,59,0,26.695,3,0,0,1,0,0,1575.005,0.452458
931,39,0,32.5,1,0,0,0,0,1,1267.5,0.833333
662,32,0,31.54,1,0,1,0,0,0,1009.28,0.985625
859,57,1,28.1,0,0,0,0,0,1,1601.7,0.492982
935,59,0,27.5,0,0,0,0,0,1,1622.5,0.466102


## Rescaling dos dados

In [29]:
from sklearn.preprocessing import MinMaxScaler

# NOTE: rescaling is not needed for every algorithm. Tree-based algorithms are
# indifferent to it, whereas for linear and logistic regression, neural networks,
# SVM, K-means, etc. it can have a significant impact on model performance.


def _scaled_frame(scaler, frame):
    """Return `frame` transformed by `scaler` as a new DataFrame with the same column names."""
    return pd.DataFrame(scaler.transform(frame), columns=frame.columns)


# To avoid leaking information from the test data into training, each scaler is
# fitted on the training data only and then applied to both train and test.
feature_scaler = MinMaxScaler().fit(X_train)

# The target is normalized as well, with its own scaler.
target_scaler = MinMaxScaler().fit(y_train)

# The rebuilt frames get a fresh 0..n-1 index (the original row labels are not carried over).
X_train = _scaled_frame(feature_scaler, X_train)
y_train = _scaled_frame(target_scaler, y_train)

X_test = _scaled_frame(feature_scaler, X_test)
y_test = _scaled_frame(target_scaler, y_test)

# TODO: try without normalization

In [30]:
# Training features after rescaling.
X_train.head()

Unnamed: 0,idade,sexo,imc,filhos,fumante,região_nordeste,região_noroeste,região_sudeste,região_sudoeste,imc*idade,imc/idade
0,0.891304,0.0,0.272064,0.6,0.0,0.0,1.0,0.0,0.0,0.494511,0.055506
1,0.456522,0.0,0.431915,0.2,0.0,0.0,0.0,0.0,1.0,0.372163,0.199446
2,0.304348,0.0,0.40548,0.2,0.0,1.0,0.0,0.0,0.0,0.269424,0.256999
3,0.847826,1.0,0.310753,0.0,0.0,0.0,0.0,0.0,1.0,0.505133,0.070821
4,0.891304,0.0,0.294231,0.0,0.0,0.0,0.0,0.0,1.0,0.513408,0.060663


In [31]:
# Training target after rescaling.
y_train.head()

Unnamed: 0,y
0,0.21167
1,0.081669
2,0.064274
3,0.157124
4,0.17737


# Treinamento do modelo

Vale se certificar de que as colunas dos dados de treino e de teste são iguais e estão na mesma ordem.

In [32]:
# Reorder the test columns to follow the training columns, then verify they match.
X_test = X_test.loc[:, X_train.columns].copy()
assert list(X_train.columns) == list(X_test.columns)

In [33]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares without a separate intercept term (fit_intercept=False).
model = LinearRegression(fit_intercept=False).fit(X_train, y_train)

# Keep actual ('y') and predicted ('y_hat') targets side by side for each split.
y_train_complete = y_train.assign(y_hat=model.predict(X_train))
y_test_complete = y_test.assign(y_hat=model.predict(X_test))

In [34]:
# Actual vs. predicted target on the training set (still in the scaled units).
y_train_complete.head()

Unnamed: 0,y,y_hat
0,0.21167,0.194109
1,0.081669,0.117592
2,0.064274,0.10179
3,0.157124,0.156823
4,0.17737,0.165239


In [35]:
# Actual vs. predicted target on the test set (still in the scaled units).
y_test_complete.head()

Unnamed: 0,y,y_hat
0,0.373733,0.529448
1,0.017387,0.074211
2,0.014339,0.056739
3,0.189302,0.238709
4,0.730542,0.611175


In [36]:
# Regression coefficients, one column per feature.
# model.coef_ is passed as the table data, labelled with the training column names.
df_coef = pd.DataFrame(
    data=model.coef_,
    columns=X_train.columns
)
df_coef

Unnamed: 0,idade,sexo,imc,filhos,fumante,região_nordeste,região_noroeste,região_sudeste,região_sudoeste,imc*idade,imc/idade
0,0.171331,-0.003082,0.114167,0.052007,0.392099,-0.053814,-0.068291,-0.073125,-0.07058,0.085548,0.092286


## Coleta de métricas

In [37]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Actual / predicted pairs for each split (values are in the scaled target units).
train_true, train_pred = y_train_complete['y'], y_train_complete['y_hat']
test_true, test_pred = y_test_complete['y'], y_test_complete['y_hat']

# One-row table with R2, MAE and MSE for train and test.
df_metrics = pd.DataFrame([{
    'R2 Treino': r2_score(train_true, train_pred),
    'R2 Teste': r2_score(test_true, test_pred),
    'MAE Treino': mean_absolute_error(train_true, train_pred),
    'MAE Teste': mean_absolute_error(test_true, test_pred),
    'MSE Treino': mean_squared_error(train_true, train_pred),
    'MSE Teste': mean_squared_error(test_true, test_pred),
}])

df_metrics

Unnamed: 0,R2 Treino,R2 Teste,MAE Treino,MAE Teste,MSE Treino,MSE Teste
0,0.768341,0.700869,0.064736,0.070266,0.009056,0.009896


Esses valores estão certos? A escala do Erro Médio Absoluto (MAE) está correta?

In [38]:
# Bring actual and predicted targets back to the original units with the target scaler,
# working on copies so the scaled frames stay untouched.
y_train_rescaled = y_train_complete.copy()
y_test_rescaled = y_test_complete.copy()

for frame in (y_train_rescaled, y_test_rescaled):
    for col in ('y', 'y_hat'):
        frame[col] = target_scaler.inverse_transform(frame[col].to_frame())

train_true, train_pred = y_train_rescaled['y'], y_train_rescaled['y_hat']
test_true, test_pred = y_test_rescaled['y'], y_test_rescaled['y_hat']

# Same one-row metrics table as before, now computed in the original units.
df_metrics = pd.DataFrame([{
    'R2 Treino': r2_score(train_true, train_pred),
    'R2 Teste': r2_score(test_true, test_pred),
    'MAE Treino': mean_absolute_error(train_true, train_pred),
    'MAE Teste': mean_absolute_error(test_true, test_pred),
    'MSE Treino': mean_squared_error(train_true, train_pred),
    'MSE Teste': mean_squared_error(test_true, test_pred),
}])

df_metrics

Unnamed: 0,R2 Treino,R2 Teste,MAE Treino,MAE Teste,MSE Treino,MSE Teste
0,0.768341,0.700869,4055.603719,4402.059893,35541580.0,38839250.0


In [39]:
# Mean absolute error: train vs. test, as grouped bars with a centered title.
mae_cols = ['MAE Treino', 'MAE Teste']
fig = px.bar(df_metrics[mae_cols], barmode='group')
fig.update_layout(
    title=dict(text='Comparação do Erro Médio Absoluto de Treino e Teste', x=0.5)
)
fig.show()

In [40]:
# R2: train vs. test, as grouped bars with a centered title.
r2_cols = ['R2 Treino', 'R2 Teste']
fig = px.bar(df_metrics[r2_cols], barmode='group')
fig.update_layout(
    title=dict(text='Comparação entre R2 de Treino e R2 de Teste', x=0.5)
)
fig.show()

## Análise do erro

In [41]:
# Show the test results point by point (one line per column of y_test_rescaled).
px.line(y_test_rescaled, markers=True)

In [42]:
# Histogram of the error.
# The error is signed: prediction minus actual, so positive values are overestimates.
y_test_rescaled['erro'] = y_test_rescaled['y_hat'] - y_test_rescaled['y']
px.histogram(y_test_rescaled['erro'], nbins=50)

In [43]:
# Scale X back to the original units with the feature scaler,
# keeping the column names and the index of X_test.
X_test_rescaled = pd.DataFrame(
    data=feature_scaler.inverse_transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)
X_test_rescaled

Unnamed: 0,idade,sexo,imc,filhos,fumante,região_nordeste,região_noroeste,região_sudeste,região_sudoeste,imc*idade,imc/idade
0,47.0,0.0,27.645,2.0,1.0,0.0,1.0,0.0,0.0,1299.315,0.588191
1,18.0,0.0,35.625,0.0,0.0,1.0,0.0,0.0,0.0,641.250,1.979167
2,21.0,0.0,34.600,0.0,0.0,0.0,0.0,0.0,1.0,726.600,1.647619
3,62.0,1.0,38.830,0.0,0.0,0.0,0.0,1.0,0.0,2407.460,0.626290
4,64.0,1.0,33.880,0.0,1.0,0.0,0.0,1.0,0.0,2168.320,0.529375
...,...,...,...,...,...,...,...,...,...,...,...
397,18.0,1.0,41.140,0.0,0.0,0.0,0.0,1.0,0.0,740.520,2.285556
398,36.0,0.0,26.885,0.0,0.0,0.0,1.0,0.0,0.0,967.860,0.746806
399,18.0,1.0,33.330,0.0,0.0,0.0,0.0,1.0,0.0,599.940,1.851667
400,52.0,1.0,34.100,0.0,0.0,0.0,0.0,1.0,0.0,1773.200,0.655769


In [44]:
# Join y (actual, predicted, error) and X again, side by side, aligned on the index.
df_error = pd.concat([y_test_rescaled, X_test_rescaled], axis=1)
df_error

Unnamed: 0,y,y_hat,erro,idade,sexo,imc,filhos,fumante,região_nordeste,região_noroeste,região_sudeste,região_sudoeste,imc*idade,imc/idade
0,24535.69855,34291.015247,9755.316697,47.0,0.0,27.645,2.0,1.0,0.0,1.0,0.0,0.0,1299.315,0.588191
1,2211.13075,5771.095412,3559.964662,18.0,0.0,35.625,0.0,0.0,1.0,0.0,0.0,0.0,641.250,1.979167
2,2020.17700,4676.476969,2656.299969,21.0,0.0,34.600,0.0,0.0,0.0,0.0,0.0,1.0,726.600,1.647619
3,12981.34570,16076.655843,3095.310143,62.0,1.0,38.830,0.0,0.0,0.0,0.0,1.0,0.0,2407.460,0.626290
4,46889.26120,39411.134063,-7478.127137,64.0,1.0,33.880,0.0,1.0,0.0,0.0,1.0,0.0,2168.320,0.529375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,1146.79660,6335.516047,5188.719447,18.0,1.0,41.140,0.0,0.0,0.0,0.0,1.0,0.0,740.520,2.285556
398,5267.81815,5346.672288,78.854138,36.0,0.0,26.885,0.0,0.0,0.0,1.0,0.0,0.0,967.860,0.746806
399,1135.94070,3549.507245,2413.566545,18.0,1.0,33.330,0.0,0.0,0.0,0.0,1.0,0.0,599.940,1.851667
400,9140.95100,11523.587190,2382.636190,52.0,1.0,34.100,0.0,0.0,0.0,0.0,1.0,0.0,1773.200,0.655769


In [45]:
# Sort by signed error, descending: largest overestimates first, largest underestimates last.
df_error.sort_values(by='erro', ascending=False)

Unnamed: 0,y,y_hat,erro,idade,sexo,imc,filhos,fumante,região_nordeste,região_noroeste,região_sudeste,região_sudoeste,imc*idade,imc/idade
314,14455.64405,26582.202659,12126.558609,26.0,0.0,17.195,2.0,1.0,1.0,0.0,0.0,0.0,447.070,0.661346
157,12829.45510,24905.155479,12075.700379,18.0,1.0,17.290,2.0,1.0,1.0,0.0,0.0,0.0,311.220,0.960556
396,15006.57945,27053.634290,12047.054840,27.0,0.0,17.955,2.0,1.0,1.0,0.0,0.0,0.0,484.785,0.665000
223,19023.26000,30886.625810,11863.365810,39.0,0.0,18.300,5.0,1.0,0.0,0.0,0.0,1.0,713.700,0.469231
321,15820.69900,27525.422071,11704.723071,38.0,1.0,19.300,0.0,1.0,0.0,0.0,0.0,1.0,733.400,0.507895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,22493.65964,3109.372380,-19384.287260,19.0,1.0,27.265,2.0,0.0,0.0,1.0,0.0,0.0,518.035,1.435000
162,36580.28216,16586.948517,-19993.333643,61.0,0.0,33.330,4.0,0.0,0.0,0.0,1.0,0.0,2033.130,0.546393
345,21595.38229,-488.798260,-22084.180550,23.0,1.0,18.715,0.0,0.0,0.0,1.0,0.0,0.0,430.445,0.813696
360,25081.76784,1125.231151,-23956.536689,24.0,0.0,23.210,0.0,0.0,0.0,0.0,1.0,0.0,557.040,0.967083


**O que fazer para melhorar minha performance?**
+ Será que consigo criar alguma nova variável que ajude nesses piores casos?
+ Será que consigo coletar mais dados para descrever melhor esses casos?
+ Será que estou usando um modelo que explica bem os meus dados?