In [5]:
import pickle
import pathlib

import numpy as np
import pandas as pd

### Preparando os dados

In [6]:
DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

clean_data_path = DATA_DIR / 'processed' / 'ames_clean.pkl'

with open(clean_data_path, 'rb') as file:
    data = pickle.load(file)

/Users/marcelomarchetto/Desktop/ml/Ames-MM/data


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2877 entries, 0 to 2929
Data columns (total 70 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   MS.SubClass      2877 non-null   category
 1   MS.Zoning        2877 non-null   category
 2   Lot.Frontage     2877 non-null   float64 
 3   Lot.Area         2877 non-null   float64 
 4   Lot.Shape        2877 non-null   category
 5   Land.Contour     2877 non-null   category
 6   Lot.Config       2877 non-null   category
 7   Land.Slope       2877 non-null   category
 8   Neighborhood     2877 non-null   category
 9   Bldg.Type        2877 non-null   category
 10  House.Style      2877 non-null   category
 11  Overall.Qual     2877 non-null   category
 12  Overall.Cond     2877 non-null   category
 13  Roof.Style       2877 non-null   category
 14  Mas.Vnr.Type     2877 non-null   category
 15  Mas.Vnr.Area     2877 non-null   float64 
 16  Exter.Qual       2877 non-null   category


In [8]:
model_data = data.copy()

In [9]:
categorical_columns = []
ordinal_columns = []
for col in model_data.select_dtypes('category').columns:
    if model_data[col].cat.ordered:
        ordinal_columns.append(col)
    else:
        categorical_columns.append(col)

In [10]:
ordinal_columns

['Lot.Shape',
 'Land.Slope',
 'Overall.Qual',
 'Overall.Cond',
 'Exter.Qual',
 'Exter.Cond',
 'Heating.QC',
 'Electrical',
 'Kitchen.Qual',
 'Functional',
 'Paved.Drive',
 'Fence']

In [11]:
categorical_columns

['MS.SubClass',
 'MS.Zoning',
 'Land.Contour',
 'Lot.Config',
 'Neighborhood',
 'Bldg.Type',
 'House.Style',
 'Roof.Style',
 'Mas.Vnr.Type',
 'Foundation',
 'Bsmt.Qual',
 'Bsmt.Cond',
 'Bsmt.Exposure',
 'BsmtFin.Type.1',
 'BsmtFin.Type.2',
 'Central.Air',
 'Garage.Type',
 'Garage.Finish',
 'Sale.Type',
 'Sale.Condition',
 'Condition',
 'Exterior']

In [12]:
for col in ordinal_columns:
    codes, _ = pd.factorize(data[col], sort=True)
    model_data[col] = codes

In [13]:
model_data[ordinal_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2877 entries, 0 to 2929
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Lot.Shape     2877 non-null   int64
 1   Land.Slope    2877 non-null   int64
 2   Overall.Qual  2877 non-null   int64
 3   Overall.Cond  2877 non-null   int64
 4   Exter.Qual    2877 non-null   int64
 5   Exter.Cond    2877 non-null   int64
 6   Heating.QC    2877 non-null   int64
 7   Electrical    2877 non-null   int64
 8   Kitchen.Qual  2877 non-null   int64
 9   Functional    2877 non-null   int64
 10  Paved.Drive   2877 non-null   int64
 11  Fence         2877 non-null   int64
dtypes: int64(12)
memory usage: 292.2 KB


In [14]:
data['Lot.Shape'].value_counts()

Reg    1825
IR1     960
IR2      76
IR3      16
Name: Lot.Shape, dtype: int64

In [15]:
model_data['Lot.Shape'].value_counts()

0    1825
1     960
2      76
3      16
Name: Lot.Shape, dtype: int64

In [16]:
model_data['Exterior'].value_counts()

VinylSd    1024
HdBoard     439
MetalSd     432
Wd Sdng     401
Plywood     218
CemntBd     126
BrkFace      86
WdShing      55
Stucco       42
AsbShng      41
Other        13
Name: Exterior, dtype: int64

In [17]:
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data)

aux_dataframe = encoded_data
aux_dataframe['Exterior'] = original_data.copy()

aux_dataframe.head().transpose()

Unnamed: 0,0,1,2,3,4
AsbShng,0,0,0,0,0
BrkFace,1,0,0,1,0
CemntBd,0,0,0,0,0
HdBoard,0,0,0,0,0
MetalSd,0,0,0,0,0
Plywood,0,0,0,0,0
Stucco,0,0,0,0,0
VinylSd,0,1,0,0,1
Wd Sdng,0,0,1,0,0
WdShing,0,0,0,0,0


In [18]:
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data, drop_first=True)

aux_dataframe = encoded_data
aux_dataframe['Exterior'] = original_data.copy()

aux_dataframe.head().transpose()

Unnamed: 0,0,1,2,3,4
BrkFace,1,0,0,1,0
CemntBd,0,0,0,0,0
HdBoard,0,0,0,0,0
MetalSd,0,0,0,0,0
Plywood,0,0,0,0,0
Stucco,0,0,0,0,0
VinylSd,0,1,0,0,1
Wd Sdng,0,0,1,0,0
WdShing,0,0,0,0,0
Other,0,0,0,0,0


In [19]:
model_test = model_data.copy()
model_data = pd.get_dummies(model_data, drop_first=True)

In [20]:
for cat in categorical_columns:
    dummies = []
    for col in model_data.columns:
        if col.startswith(cat + "_"):
            dummies.append(f'"{col}"')
    dummies_str = ', '.join(dummies)
    print(f'From column "{cat}" we made {dummies_str}\n')

From column "MS.SubClass" we made "MS.SubClass_30", "MS.SubClass_50", "MS.SubClass_60", "MS.SubClass_70", "MS.SubClass_80", "MS.SubClass_85", "MS.SubClass_90", "MS.SubClass_120", "MS.SubClass_160", "MS.SubClass_190", "MS.SubClass_Other"

From column "MS.Zoning" we made "MS.Zoning_RH", "MS.Zoning_RL", "MS.Zoning_RM"

From column "Land.Contour" we made "Land.Contour_HLS", "Land.Contour_Low", "Land.Contour_Lvl"

From column "Lot.Config" we made "Lot.Config_CulDSac", "Lot.Config_FR2", "Lot.Config_FR3", "Lot.Config_Inside"

From column "Neighborhood" we made "Neighborhood_BrDale", "Neighborhood_BrkSide", "Neighborhood_ClearCr", "Neighborhood_CollgCr", "Neighborhood_Crawfor", "Neighborhood_Edwards", "Neighborhood_Gilbert", "Neighborhood_IDOTRR", "Neighborhood_MeadowV", "Neighborhood_Mitchel", "Neighborhood_NAmes", "Neighborhood_NPkVill", "Neighborhood_NWAmes", "Neighborhood_NoRidge", "Neighborhood_NridgHt", "Neighborhood_OldTown", "Neighborhood_SWISU", "Neighborhood_Sawyer", "Neighborhood_Sa

# Removendo a feature target

In [21]:
X = model_data.drop(columns=['SalePrice']).copy()
y = model_data['SalePrice'].copy()

## Separando os dados em treino e teste

In [22]:
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_SEED,
)


Iniciamos a nossa análise testando o modelo de regressão linear para podermos comparar com os outros posteriormente.

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()

model.fit(Xtrain, ytrain)

ypred = model.predict(Xtest)

RMSE = np.sqrt(mean_squared_error(ytest, ypred))

error_percent = 100 * (10**RMSE - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 15.11%


# Usando GridSearchCV para encontrar os melhores parâmetros para 3 modelos diferentes: Ridge, Lasso e ElasticNet.

In [26]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
warnings.filterwarnings('ignore')

RANDOM_SEED = 42  # Definindo uma semente aleatória

def grid_search_regression(model, param_grid, X, y):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    best_score = np.sqrt(-grid_search.best_score_)
    return grid_search.best_estimator_, best_params, best_score


# Alphas para serem testados
alphas = np.logspace(-6, 6, 13)

### Usando Grid Search para encontrar os melhores parâmetros para o modelo de `Ridge`.

In [27]:
# Primeira busca em grade com uma faixa mais ampla de alphas

print(f'alphas: {alphas}')
param_grid_ridge = {'alpha': alphas}
model_ridge, best_ridge_params, best_ridge_score = grid_search_regression(Ridge(random_state=RANDOM_SEED), param_grid_ridge, Xtrain, ytrain)

# Refinamento da busca em grade com uma faixa mais estreita de alphas
alphas_fine = np.linspace(best_ridge_params['alpha']*0.5, best_ridge_params['alpha']*1.5, 20)
param_grid_ridge_fine = {'alpha': alphas_fine}
model_ridge, best_ridge_params_fine, best_ridge_score_fine = grid_search_regression(Ridge(random_state=RANDOM_SEED), param_grid_ridge_fine, Xtrain, ytrain)

print(f"Best parameters for Ridge: {best_ridge_params_fine}, Best RMSE score: {best_ridge_score_fine}")

alphas: [1.e-06 1.e-05 1.e-04 1.e-03 1.e-02 1.e-01 1.e+00 1.e+01 1.e+02 1.e+03
 1.e+04 1.e+05 1.e+06]
Best parameters for Ridge: {'alpha': 5.526315789473684}, Best RMSE score: 0.05756291176968964


### Usando Grid Search para encontrar os melhores parâmetros para o modelo de `Lasso`.

In [29]:
param_grid_lasso = {'alpha': alphas}
model_lasso, best_lasso_params, best_lasso_score = grid_search_regression(Lasso(random_state=RANDOM_SEED), param_grid_lasso, Xtrain, ytrain)

alphas_fine = np.linspace(best_lasso_params['alpha']*0.5, best_lasso_params['alpha']*1.5, 20)
param_grid_lasso_fine = {'alpha': alphas_fine}
model_lasso, best_lasso_params_fine, best_lasso_score_fine = grid_search_regression(Lasso(random_state=RANDOM_SEED), param_grid_lasso_fine, Xtrain, ytrain)

print(f"Best parameters for Lasso: {best_lasso_params_fine}, Best RMSE score: {best_lasso_score_fine}")

Best parameters for Lasso: {'alpha': 7.105263157894737e-05}, Best RMSE score: 0.057576420884465324


### Usando Grid Search para encontrar os melhores parâmetros para o modelo de `ElasticNet`.

In [30]:
param_grid_elastic_net = {'alpha': alphas, 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}
model_en, best_elastic_net_params, best_elastic_net_score = grid_search_regression(ElasticNet(random_state=RANDOM_SEED), param_grid_elastic_net, Xtrain, ytrain)

alphas_fine = np.linspace(best_elastic_net_params['alpha']*0.5, best_elastic_net_params['alpha']*1.5, 20)
param_grid_elastic_net_fine = {'alpha': alphas_fine, 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}
model_en, best_elastic_net_params_fine, best_elastic_net_score_fine = grid_search_regression(ElasticNet(random_state=RANDOM_SEED), param_grid_elastic_net_fine, Xtrain, ytrain)


print(f"Best parameters for Elastic Net: {best_elastic_net_params_fine}, Best RMSE score: {best_elastic_net_score_fine}")

Best parameters for Elastic Net: {'alpha': 0.00013947368421052633, 'l1_ratio': 0.5}, Best RMSE score: 0.05757196728640977


In [38]:
error_percent = 100 * (10**best_ridge_score - 1)
print(f'Average error (Ridge) is {error_percent:.2f}%')
error_percent = 100 * (10**best_ridge_score_fine - 1)
print(f'Average error (Ridge with fine tuning) is {error_percent:.2f}%')
print("---------------------------------------------------------------")

error_percent = 100 * (10**best_lasso_score - 1)
print(f'Average error (Lasso) is {error_percent:.2f}%')
error_percent = 100 * (10**best_lasso_score_fine - 1)
print(f'Average error (Lasso with fine tuning) is {error_percent:.2f}%')
print("---------------------------------------------------------------")

error_percent = 100 * (10**best_elastic_net_score - 1)
print(f'Average error (Elastic Net) is {error_percent:.2f}%')
error_percent = 100 * (10**best_elastic_net_score_fine - 1)
print(f'Average error (Elastic Net with fine tuning) is {error_percent:.2f}%')

Average error (Ridge) is 14.19%
Average error (Ridge with fine tuning) is 14.17%
---------------------------------------------------------------
Average error (Lasso) is 14.19%
Average error (Lasso with fine tuning) is 14.18%
---------------------------------------------------------------
Average error (Elastic Net) is 14.18%
Average error (Elastic Net with fine tuning) is 14.18%


### Testando os modelos com os parâmetros encontrados pelo GridSearchCV no conjunto de teste.

In [39]:
ypred_ridge = model_ridge.predict(Xtest)
ypred_lasso = model_lasso.predict(Xtest)
ypred_en = model_en.predict(Xtest)

RMSE_ridge = np.sqrt(mean_squared_error(ytest, ypred_ridge))
RMSE_lasso = np.sqrt(mean_squared_error(ytest, ypred_lasso))    
RMSE_en = np.sqrt(mean_squared_error(ytest, ypred_en))

error_percent_ridge = 100 * (10**RMSE_ridge - 1)
error_percent_lasso = 100 * (10**RMSE_lasso - 1)
error_percent_en = 100 * (10**RMSE_en - 1)

print(f'Average error (Ridge) is {error_percent_ridge:.2f}%')
print(f'Average error (Lasso) is {error_percent_lasso:.2f}%')
print(f'Average error (Elastic Net) is {error_percent_en:.2f}%')

Average error (Ridge) is 15.21%
Average error (Lasso) is 15.31%
Average error (Elastic Net) is 15.31%


# Comparando os resultados de cada modelo

In [44]:
# make a ttest to see if the difference between the two models is significant
from scipy.stats import ttest_rel

ypred_ridge = model_ridge.predict(Xtest)
ypred_lasso = model_lasso.predict(Xtest)

pvalue = ttest_rel(ypred_ridge, ypred_lasso).pvalue

if pvalue < 0.05:
    print(f'A diferença entre os modelos Ridge e Lasso é significativa, pvalue: {pvalue*100:.2f}%')
else:
    print(f'A diferença entre os modelos Ridge e Lasso não é significativa, pvalue: {pvalue*100:.2f}%')

A diferença entre os modelos Ridge e Lasso não é significativa, pvalue: 43.89%


In [45]:

# make a ttest to see if the difference between the two models is significant
from scipy.stats import ttest_rel

ypred_ridge = model_ridge.predict(Xtest)
ypred_lasso = model_en.predict(Xtest)

pvalue = ttest_rel(ypred_ridge, ypred_lasso).pvalue

if pvalue < 0.05:
    print(f'A diferença entre os modelos Ridge e Elastic net é significativa, pvalue: {pvalue*100:.2f}%')
else:
    print(f'A diferença entre os modelos Ridge e Elastic net não é significativa, pvalue: {pvalue*100:.2f}%')

A diferença entre os modelos Ridge e Elastic net não é significativa, pvalue: 41.98%


In [46]:

# make a ttest to see if the difference between the two models is significant
from scipy.stats import ttest_rel

ypred_ridge = model_lasso.predict(Xtest)
ypred_lasso = model_en.predict(Xtest)

pvalue = ttest_rel(ypred_ridge, ypred_lasso).pvalue

if pvalue < 0.05:
    print(f'A diferença entre os modelos Lasso e Elastic net é significativa, pvalue: {pvalue*100:.2f}%')
else:
    print(f'A diferença entre os modelos Lasso e Elastic net não é significativa, pvalue: {pvalue*100:.2f}%')

A diferença entre os modelos Lasso e Elastic net não é significativa, pvalue: 45.93%
