<a href="https://colab.research.google.com/github/luanapetrolli/luanapetrolli/blob/main/HousePricesLuanaPetrolli.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**House Prices - Advanced Regression Techniques**
From Kaggle (https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques)

##Competition Description
Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

##Evaluation
###Goal

It is your job to predict the sales price for each house. For each Id in the test set, you must predict the value of the SalePrice variable.

Metric
Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

In [14]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'house-prices-advanced-regression-techniques:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F5407%2F868283%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240409%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240409T141045Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dac6c51d3a499175f845acb8700ad2645cd6f2e6951ee68be53e95c3f24dec04fcf3cafbf1b6e1e9eb9018fa29dcdd8017adbaf3daf85ebb4435032c5e5fad4fea62c8e0ec103dbc7a6d5983eb57ca42c011a68e77640c0d9ba7e96285ae75734c5e5921be17e023ad2064f94c60d12144a3a21e3dd2b801cb123d0ba48ef97a608dee3f9a05d678f47ffa08e7b9c19261de1b4d64dcd9a3b150554cf5ec624f684cda9d997b70f26ba31f12bcd73c531fef97d1faab821bd005661c5fb77f2a01c50627b2c7d05fa8f56b365436bf97d3f9d8541e56822d4ebbc4111fe149ede1df00f425a3c342de15a72e252eecfba02b43e3167f909dbf4585cc23ff48c28'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading house-prices-advanced-regression-techniques, 203809 bytes compressed
Downloaded and uncompressed: house-prices-advanced-regression-techniques
Data source import complete.


In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Load the dataset

In [13]:
df_train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
df_train.shape

(1460, 81)

In [5]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [15]:
df_train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


#Variable mapping

In [22]:
df_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [16]:
cat_vars = df_train.select_dtypes(include = "object").columns.to_list()
target = "SalePrice"
num_vars = [col for col in df_train.columns if col not in cat_vars + ["Id",target]]
features = cat_vars + num_vars

In [17]:
X = df_train.filter(features)
y = df_train[target]

In [36]:
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

#Prepare the dataset

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=42)

#Data pipeline


In [20]:
!pip install feature-engine



In [25]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from sklearn.preprocessing import StandardScaler
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.encoding import OneHotEncoder

In [26]:
steps_modelos_lineares = [
    ('numeric_imputer', MeanMedianImputer(variables=num_vars, imputation_method='median')),
    ('numeric_scaler', SklearnTransformerWrapper(variables=num_vars, transformer=StandardScaler())),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, fill_value='NotAv')),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
]

steps_modelos_nao_lineares = [
    ('numeric_imputer', MeanMedianImputer(variables=num_vars, imputation_method='median')),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, fill_value='NotAv')),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),

]


#Training multiple models

In [27]:
!pip install lightgbm xgboost catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [29]:
random_state = 42

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

modelos_lineares = [
    ('linear_regression', LinearRegression()),
    ('sgdr', SGDRegressor(random_state=random_state)),
]

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

modelos_nao_lineares = [
    ('decision_tree', DecisionTreeRegressor(random_state=random_state)),
    ('random_forest', RandomForestRegressor(random_state=random_state)),
    ('xgb', XGBRegressor(random_state=random_state)),
    ('lgbm', LGBMRegressor(random_state=random_state)),
    ('catboost', CatBoostRegressor(random_state=random_state, verbose=0))
]


In [30]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

df_resultados = pd.DataFrame(columns=['algoritmo', 'base', 'rmse', 'mae', 'mape'])
df_resultados

Unnamed: 0,algoritmo,base,rmse,mae,mape


In [31]:
def treinar_modelo(model_name, model, steps, X_train, y_train, X_test, y_test, random_state):
    pipeline = Pipeline(steps=steps + [(model_name, model)])
    pipeline.fit(X_train, y_train)

    res_treino = [
        mean_squared_error(y_train, pipeline.predict(X_train), squared=False),
        mean_absolute_error(y_train, pipeline.predict(X_train)),
        mean_absolute_percentage_error(y_train, pipeline.predict(X_train)),
    ]
    res_teste = [
        mean_squared_error(y_test, pipeline.predict(X_test), squared=False),
        mean_absolute_error(y_test, pipeline.predict(X_test)),
        mean_absolute_percentage_error(y_test, pipeline.predict(X_test)),
    ]

    return res_treino, res_teste

In [32]:
pd.options.display.float_format = '{:.4f}'.format
import warnings
warnings.filterwarnings('ignore')

In [33]:
for model_name, model in modelos_lineares:
    print(f'Treinando {model_name} ...', end=' ')
    res_treino, res_teste = treinar_modelo(model_name, model, steps_modelos_lineares, X_train, y_train, X_test, y_test, random_state)
    df_resultados.loc[len(df_resultados)] = [model_name, 'treino'] + res_treino
    df_resultados.loc[len(df_resultados)] = [model_name, 'teste'] + res_teste
    print('OK')

for model_name, model in modelos_nao_lineares:
    print(f'Treinando {model_name} ...', end=' ')
    res_treino, res_teste = treinar_modelo(model_name, model, steps_modelos_nao_lineares, X_train, y_train, X_test, y_test, random_state)
    df_resultados.loc[len(df_resultados)] = [model_name, 'treino'] + res_treino
    df_resultados.loc[len(df_resultados)] = [model_name, 'teste'] + res_teste
    print('OK')

Treinando linear_regression ... OK
Treinando sgdr ... OK
Treinando decision_tree ... OK
Treinando random_forest ... OK
Treinando xgb ... OK
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3218
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 193
[LightGBM] [Info] Start training from score 181441.541952
OK
Treinando catboost ... OK


In [34]:
df_resultados[df_resultados.base == 'treino'].sort_values(['mae'])

Unnamed: 0,algoritmo,base,rmse,mae,mape
4,decision_tree,treino,0.0,0.0,0.0
8,xgb,treino,1448.2704,1017.1195,0.0066
12,catboost,treino,5666.8276,4346.8895,0.0276
10,lgbm,treino,11344.9437,5290.7161,0.03
6,random_forest,treino,11110.5273,6550.1167,0.0384
0,linear_regression,treino,18904.1402,12112.6875,0.0714
2,sgdr,treino,27677.8578,17555.5765,0.1034


In [37]:
df_resultados[df_resultados.base == 'teste'].sort_values(['mae'])

Unnamed: 0,algoritmo,base,rmse,mae,mape
13,catboost,teste,25987.0843,15453.8447,0.0909
9,xgb,teste,25591.5383,16250.0321,0.0995
11,lgbm,teste,28800.9957,16963.7766,0.1012
7,random_forest,teste,29041.4466,17636.6025,0.1076
3,sgdr,teste,33877.9753,20660.0472,0.1251
5,decision_tree,teste,42991.4199,27211.0308,0.1587
1,linear_regression,teste,137153491959935.7,13531424135763.688,76023244.4893


#Cross Validation

In [38]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


kf = KFold(n_splits=5, shuffle=True, random_state=42)

xgb_pipeline = Pipeline(steps=steps_modelos_nao_lineares + [('xgb', XGBRegressor(random_state=42))])
cv_results_rf = cross_val_score(estimator=xgb_pipeline, scoring='neg_mean_absolute_error', X=X_train, y=y_train, cv=kf, n_jobs=-1)
print('Mean CV XGB', cv_results_rf.mean())
print('Std CV XGB', cv_results_rf.std())

print()

catboost_pipeline = Pipeline(steps=steps_modelos_nao_lineares + [('catboost', CatBoostRegressor(random_state=42))])
cv_results_catboost = cross_val_score(estimator=catboost_pipeline, scoring='neg_mean_absolute_error', X=X_train, y=y_train, cv=kf, n_jobs=-1)
print('Mean CV Catboost', cv_results_catboost.mean())
print('Std CV Catboost', cv_results_catboost.std())

Mean CV XGB -17971.95122452003
Std CV XGB 1014.6580906623914

Mean CV Catboost -14795.242965365349
Std CV Catboost 990.1829439925004
