In [1]:
import pickle
import pathlib

import numpy as np
import pandas as pd

In [2]:
# The data folder is the `data` directory inside the parent of the current
# working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
print(DATA_DIR)

clean_data_path = DATA_DIR.joinpath('processed', 'ames_clean.pkl')

# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# this must only ever be pointed at a trusted artifact.
with clean_data_path.open('rb') as file:
    data = pickle.load(file)

c:\Users\brizz\OneDrive\Documentos\GitHub\Ames-MM\data


In [3]:
# Summarise the loaded frame: index, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2854 entries, 0 to 2929
Data columns (total 70 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   MS.SubClass      2854 non-null   category
 1   MS.Zoning        2854 non-null   category
 2   Lot.Frontage     2854 non-null   float64 
 3   Lot.Area         2854 non-null   float64 
 4   Lot.Shape        2854 non-null   category
 5   Land.Contour     2854 non-null   category
 6   Lot.Config       2854 non-null   category
 7   Land.Slope       2854 non-null   category
 8   Neighborhood     2854 non-null   category
 9   Bldg.Type        2854 non-null   category
 10  House.Style      2854 non-null   category
 11  Overall.Qual     2854 non-null   category
 12  Overall.Cond     2854 non-null   category
 13  Roof.Style       2854 non-null   category
 14  Mas.Vnr.Type     2854 non-null   category
 15  Mas.Vnr.Area     2854 non-null   float64 
 16  Exter.Qual       2854 non-null   category


In [4]:
# Work on a copy; `data` itself is left exactly as loaded.
model_data = data.copy()

In [5]:
# Partition the `category`-dtype columns by whether their levels are ordered.
# Both lists keep the column order of the frame.
category_cols = model_data.select_dtypes('category').columns
ordinal_columns = [
    name for name in category_cols if model_data[name].cat.ordered
]
categorical_columns = [
    name for name in category_cols if not model_data[name].cat.ordered
]

In [6]:
# Columns whose categorical dtype is flagged as ordered.
ordinal_columns

['Lot.Shape',
 'Land.Slope',
 'Overall.Qual',
 'Overall.Cond',
 'Exter.Qual',
 'Exter.Cond',
 'Heating.QC',
 'Electrical',
 'Kitchen.Qual',
 'Functional',
 'Paved.Drive',
 'Fence']

In [7]:
# Columns whose categorical dtype is not ordered (nominal features).
categorical_columns

['MS.SubClass',
 'MS.Zoning',
 'Land.Contour',
 'Lot.Config',
 'Neighborhood',
 'Bldg.Type',
 'House.Style',
 'Roof.Style',
 'Mas.Vnr.Type',
 'Foundation',
 'Bsmt.Qual',
 'Bsmt.Cond',
 'Bsmt.Exposure',
 'BsmtFin.Type.1',
 'BsmtFin.Type.2',
 'Central.Air',
 'Garage.Type',
 'Garage.Finish',
 'Sale.Type',
 'Sale.Condition',
 'Condition',
 'Exterior']

In [8]:
# Replace every ordered categorical in `model_data` with integer codes.
# `sort=True` numbers the levels in their sorted order instead of the order in
# which they first appear. The source values are read from `data`, which this
# loop never modifies, so the cell gives the same result when re-run.
for col in ordinal_columns:
    # Index the result rather than unpacking into `_`: in IPython `_` holds the
    # last cell output, and assigning to it stops the shell from updating it.
    codes = pd.factorize(data[col], sort=True)[0]
    model_data[col] = codes

In [9]:
# Check the dtypes and non-null counts of the ordinal columns in `model_data`.
model_data[ordinal_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2854 entries, 0 to 2929
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Lot.Shape     2854 non-null   int64
 1   Land.Slope    2854 non-null   int64
 2   Overall.Qual  2854 non-null   int64
 3   Overall.Cond  2854 non-null   int64
 4   Exter.Qual    2854 non-null   int64
 5   Exter.Cond    2854 non-null   int64
 6   Heating.QC    2854 non-null   int64
 7   Electrical    2854 non-null   int64
 8   Kitchen.Qual  2854 non-null   int64
 9   Functional    2854 non-null   int64
 10  Paved.Drive   2854 non-null   int64
 11  Fence         2854 non-null   int64
dtypes: int64(12)
memory usage: 289.9 KB


In [10]:
# Level frequencies of 'Lot.Shape' in the frame as loaded.
data['Lot.Shape'].value_counts()

Reg    1813
IR1     950
IR2      75
IR3      16
Name: Lot.Shape, dtype: int64

In [11]:
# Frequencies of the same column in `model_data`, for comparison with `data`.
model_data['Lot.Shape'].value_counts()

0    1813
1     950
2      75
3      16
Name: Lot.Shape, dtype: int64

In [12]:
# Level frequencies of the nominal column 'Exterior'.
model_data['Exterior'].value_counts()

VinylSd    1006
HdBoard     439
MetalSd     432
Wd Sdng     400
Plywood     218
CemntBd     123
BrkFace      86
WdShing      54
Stucco       42
AsbShng      41
Other        13
Name: Exterior, dtype: int64

In [13]:
# Demonstrate plain one-hot encoding on a single nominal column.
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data)

# Build the side-by-side preview on a copy. Binding `aux_dataframe` directly to
# `encoded_data` would make both names refer to the same object, so adding the
# label column would silently add it to `encoded_data` as well.
aux_dataframe = encoded_data.copy()
aux_dataframe['Exterior'] = original_data

aux_dataframe.head().transpose()

Unnamed: 0,0,1,2,3,4
AsbShng,0,0,0,0,0
BrkFace,1,0,0,1,0
CemntBd,0,0,0,0,0
HdBoard,0,0,0,0,0
MetalSd,0,0,0,0,0
Plywood,0,0,0,0,0
Stucco,0,0,0,0,0
VinylSd,0,1,0,0,1
Wd Sdng,0,0,1,0,0
WdShing,0,0,0,0,0


In [14]:
# Same demonstration with `drop_first=True`, which omits the indicator column
# of the first level.
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data, drop_first=True)

# Build the side-by-side preview on a copy. Binding `aux_dataframe` directly to
# `encoded_data` would make both names refer to the same object, so adding the
# label column would silently add it to `encoded_data` as well.
aux_dataframe = encoded_data.copy()
aux_dataframe['Exterior'] = original_data

aux_dataframe.head().transpose()

Unnamed: 0,0,1,2,3,4
BrkFace,1,0,0,1,0
CemntBd,0,0,0,0,0
HdBoard,0,0,0,0,0
MetalSd,0,0,0,0,0
Plywood,0,0,0,0,0
Stucco,0,0,0,0,0
VinylSd,0,1,0,0,1
Wd Sdng,0,0,1,0,0
WdShing,0,0,0,0,0
Other,0,0,0,0,0


In [15]:
# Keep a snapshot of the frame as it was before one-hot encoding.
model_test = model_data.copy()
# One-hot encode the frame, dropping the first level of each encoded column.
model_data = pd.get_dummies(model_data, drop_first=True)

In [16]:
# For every nominal feature, list the indicator columns derived from it.
# Derived columns are recognised by the "<feature>_" name prefix.
for cat in categorical_columns:
    prefix = cat + "_"
    dummies = [
        f'"{name}"' for name in model_data.columns if name.startswith(prefix)
    ]
    dummies_str = ', '.join(dummies)
    print(f'From column "{cat}" we made {dummies_str}\n')

From column "MS.SubClass" we made "MS.SubClass_30", "MS.SubClass_50", "MS.SubClass_60", "MS.SubClass_70", "MS.SubClass_80", "MS.SubClass_85", "MS.SubClass_90", "MS.SubClass_120", "MS.SubClass_160", "MS.SubClass_190", "MS.SubClass_Other"

From column "MS.Zoning" we made "MS.Zoning_RH", "MS.Zoning_RL", "MS.Zoning_RM"

From column "Land.Contour" we made "Land.Contour_HLS", "Land.Contour_Low", "Land.Contour_Lvl"

From column "Lot.Config" we made "Lot.Config_CulDSac", "Lot.Config_FR2", "Lot.Config_FR3", "Lot.Config_Inside"

From column "Neighborhood" we made "Neighborhood_BrDale", "Neighborhood_BrkSide", "Neighborhood_ClearCr", "Neighborhood_CollgCr", "Neighborhood_Crawfor", "Neighborhood_Edwards", "Neighborhood_Gilbert", "Neighborhood_IDOTRR", "Neighborhood_MeadowV", "Neighborhood_Mitchel", "Neighborhood_NAmes", "Neighborhood_NPkVill", "Neighborhood_NWAmes", "Neighborhood_NoRidge", "Neighborhood_NridgHt", "Neighborhood_OldTown", "Neighborhood_SWISU", "Neighborhood_Sawyer", "Neighborhood_Sa

In [17]:
# Separate the target column from the feature matrix.
target_column = 'SalePrice'
y = model_data[target_column].copy()
X = model_data.drop(columns=[target_column]).copy()

In [18]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split is repeatable from run to run.
RANDOM_SEED = 42

# Hold out a quarter of the rows for the final evaluation.
split = train_test_split(X, y, test_size=0.25, random_state=RANDOM_SEED)
Xtrain, Xtest, ytrain, ytest = split


In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares fitted on the training rows.
model = LinearRegression().fit(Xtrain, ytrain)
ypred = model.predict(Xtest)

# Root-mean-squared error on the held-out rows.
test_mse = mean_squared_error(ytest, ypred)
RMSE = np.sqrt(test_mse)

# NOTE(review): 10**RMSE - 1 treats the RMSE as a multiplicative error, which
# presumes SalePrice is stored as log10(price) — confirm against the cleaning step.
error_percent = 100 * (10**RMSE - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 18.53%


In [20]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any warnings the estimators fitted later in this cell may raise.
warnings.filterwarnings('ignore')

RANDOM_SEED = 42  # Set a fixed random seed

def grid_search_regression(model, param_grid, X, y, cv=5):
    """Run a cross-validated grid search and report the winner as an RMSE.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn regressor to tune.
    param_grid : dict
        Mapping of parameter name to the candidate values to try.
    X, y : array-like
        Training features and target.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_estimator, best_params, best_rmse)``, where ``best_rmse`` is the
        square root of the best mean cross-validated MSE.
    """
    # Train-set scores are never read by the callers, so they are not requested
    # (``return_train_score`` stays at its default). This only skips extra
    # scoring work; the selected candidate and its score are unchanged.
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
    )
    grid_search.fit(X, y)
    # The scorer is the negated MSE; flip the sign before taking the root.
    best_rmse = np.sqrt(-grid_search.best_score_)
    return grid_search.best_estimator_, grid_search.best_params_, best_rmse

def _refined_alphas(best_alpha, num=20):
    """Return a linear grid over [0.5, 1.5] * best_alpha that contains best_alpha.

    ``np.linspace`` with an even number of points skips the centre of the
    interval, so a refined search built from it alone never re-evaluates the
    coarse optimum and can end up with a worse score than the coarse search.
    Adding the centre guarantees the refined grid covers the coarse winner.
    """
    window = np.linspace(best_alpha * 0.5, best_alpha * 1.5, num)
    return np.union1d(window, [best_alpha])


# l1_ratio candidates shared by both Elastic Net searches.
l1_ratios = [0.1, 0.3, 0.5, 0.7, 0.9]

# First grid search over a wide range of alphas (one per decade).
alphas = np.logspace(-6, 6, 13)
print(f'alphas: {alphas}')

# --- Ridge: coarse search, then refinement around the coarse optimum ---
param_grid_ridge = {'alpha': alphas}
model_ridge, best_ridge_params, best_ridge_score = grid_search_regression(
    Ridge(random_state=RANDOM_SEED), param_grid_ridge, Xtrain, ytrain
)

alphas_fine = _refined_alphas(best_ridge_params['alpha'])
param_grid_ridge_fine = {'alpha': alphas_fine}
model_ridge, best_ridge_params_fine, best_ridge_score_fine = grid_search_regression(
    Ridge(random_state=RANDOM_SEED), param_grid_ridge_fine, Xtrain, ytrain
)

# --- Lasso ---
param_grid_lasso = {'alpha': alphas}
model_lasso, best_lasso_params, best_lasso_score = grid_search_regression(
    Lasso(random_state=RANDOM_SEED), param_grid_lasso, Xtrain, ytrain
)

alphas_fine = _refined_alphas(best_lasso_params['alpha'])
param_grid_lasso_fine = {'alpha': alphas_fine}
model_lasso, best_lasso_params_fine, best_lasso_score_fine = grid_search_regression(
    Lasso(random_state=RANDOM_SEED), param_grid_lasso_fine, Xtrain, ytrain
)

# --- Elastic Net (alpha and l1_ratio searched jointly in both passes) ---
param_grid_elastic_net = {'alpha': alphas, 'l1_ratio': l1_ratios}
model_en, best_elastic_net_params, best_elastic_net_score = grid_search_regression(
    ElasticNet(random_state=RANDOM_SEED), param_grid_elastic_net, Xtrain, ytrain
)

alphas_fine = _refined_alphas(best_elastic_net_params['alpha'])
param_grid_elastic_net_fine = {'alpha': alphas_fine, 'l1_ratio': l1_ratios}
model_en, best_elastic_net_params_fine, best_elastic_net_score_fine = grid_search_regression(
    ElasticNet(random_state=RANDOM_SEED), param_grid_elastic_net_fine, Xtrain, ytrain
)

print(f"Best parameters for Ridge: {best_ridge_params_fine}, Best RMSE score: {best_ridge_score_fine}")
print(f"Best parameters for Lasso: {best_lasso_params_fine}, Best RMSE score: {best_lasso_score_fine}")
print(f"Best parameters for Elastic Net: {best_elastic_net_params_fine}, Best RMSE score: {best_elastic_net_score_fine}")


alphas: [1.e-06 1.e-05 1.e-04 1.e-03 1.e-02 1.e-01 1.e+00 1.e+01 1.e+02 1.e+03
 1.e+04 1.e+05 1.e+06]
Best parameters for Ridge: {'alpha': 9.210526315789473}, Best RMSE score: 0.0474038894847182
Best parameters for Lasso: {'alpha': 9.210526315789474e-05}, Best RMSE score: 0.04713527977453754
Best parameters for Elastic Net: {'alpha': 0.00012894736842105264, 'l1_ratio': 0.7}, Best RMSE score: 0.047135437573023575


In [21]:
# Cross-validated RMSE of each search, reported with the same percentage
# conversion used for the test-set error.
cv_results = [
    ('Ridge', best_ridge_score),
    ('Ridge with fine tuning', best_ridge_score_fine),
    ('Lasso', best_lasso_score),
    ('Lasso with fine tuning', best_lasso_score_fine),
    ('Elastic Net', best_elastic_net_score),
    ('Elastic Net with fine tuning', best_elastic_net_score_fine),
]
for label, cv_rmse in cv_results:
    error_percent = 100 * (10**cv_rmse - 1)
    print(f'Average error ({label}) is {error_percent:.2f}%')

Average error (Ridge) is 11.53%
Average error (Ridge with fine tuning) is 11.53%
Average error (Lasso) is 11.47%
Average error (Lasso with fine tuning) is 11.46%
Average error (Elastic Net) is 11.46%
Average error (Elastic Net with fine tuning) is 11.46%


In [22]:
# Score the Lasso estimator kept from the refined search on the held-out rows.
model = model_lasso
ypred = model.predict(Xtest)

test_mse = mean_squared_error(ytest, ypred)
RMSE = np.sqrt(test_mse)

# Same percentage conversion as used for the baseline model.
error_percent = 100 * (10**RMSE - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 18.76%
