# Exercício Final

This exercise will test your ability to read a data file and understand statistics about the data.


## Passo 1: Carregando os dados e as bibliotecas


In [1]:
# Importando bibliotecas que serão utilizadas
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Relative path to the training CSV of the Kaggle housing-prices competition
dataset = '../HousingPricesCompetitionForKaggleLearnUsers/Data/train.csv'
# Read the CSV into the DataFrame that the following cells explore and model
home_data = pd.read_csv(filepath_or_buffer=dataset)

## Passo 2: Visualizando e explorando os dados

In [3]:
# Preview the first five rows of the dataset
home_data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column
home_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
# Reference year used to turn a construction year into an age. Kept as a single
# named constant so both age calculations below always use the same year.
REFERENCE_YEAR = 2022

# Average lot size, rounded to the nearest integer
avg_lot_size = round(home_data['LotArea'].mean())
print('O tamanho médio do lote é: ', avg_lot_size)

# How old is the most recently built home?
newest_home_age = REFERENCE_YEAR - home_data['YearBuilt'].max()
print('A contrução mais recente tem {} anos '.format(newest_home_age))

# How old is the oldest home?
older_home_age = REFERENCE_YEAR - home_data['YearBuilt'].min()
print('A contrução mais antiga tem {} anos '.format(older_home_age))


O tamanho médio do lote é:  10517
A contrução mais recente tem 12 anos 
A contrução mais antiga tem 150 anos 


## Passo 3: Especificando Prediction Target

In [6]:
# Prediction target: the SalePrice column, which holds each home's sale price
y = home_data['SalePrice']

# Show a random sample of five target values
y.sample(n=5)

1371    165500
455     175500
545     229000
1185    104900
782     187100
Name: SalePrice, dtype: int64

## Passo 4: Features: Criando recursos preditivos

In [7]:
# List the names of all columns in the dataset
home_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
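       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')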

In [8]:
# Names of the columns used as predictive features, one per line
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

In [9]:
# Feature matrix: only the columns named in `features`
X = home_data[features]

# Show a random sample of five feature rows
X.sample(n=5)

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
158,12552,2004,991,956,2,3,8
1343,7558,1928,1172,741,1,3,9
1444,8500,2004,1422,0,2,3,7
1237,12393,2004,847,1101,2,4,8
1451,9262,2008,1578,0,2,3,7


## Passo 5: Especificando e Treinando o Modelo

In [10]:
# Split features and target into training and validation parts.
# random_state=1 fixes the shuffle seed.
split_parts = train_test_split(X, y, random_state=1)
train_X, val_X, train_y, val_y = split_parts

In [11]:
# Define the model: a Random Forest regressor with a fixed seed (random_state=1)
rf_model = RandomForestRegressor(random_state=1)

In [12]:
# Fit the model on the training part of the split
rf_model.fit(X=train_X, y=train_y)

RandomForestRegressor(random_state=1)

## Passo 6: Validando o Modelo

In [13]:
# Predict sale prices for the validation features
rf_val_predictions = rf_model.predict(X=val_X)

# Show the first five predictions
rf_val_predictions[:5]

array([187439.65, 149083.25, 129767.58,  85257.  , 149803.09])

In [14]:
# Mean absolute error (MAE) of the validation predictions.
# scikit-learn documents the signature as mean_absolute_error(y_true, y_pred),
# so the observed prices go first and the model's predictions second. MAE is
# symmetric, so the numeric result is the same either way.
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)

print("Validação do MAE para Random Forest Model: {:,.0f}".format(rf_val_mae))


Validação do MAE para Random Forest Model: 21,857


## Passo 7: Melhorando o modelo

In [15]:
# Define a second Random Forest regressor (same seed), to be fitted on all the training data

rf_model_on_full_data = RandomForestRegressor(random_state=1)

In [16]:
# Fit the second model on the complete feature matrix and target (no hold-out)
rf_model_on_full_data.fit(X=X, y=y)

RandomForestRegressor(random_state=1)

## Passo 8: Fazendo previsões

In [17]:
# Relative path to the CSV file that will be used for predictions
test_data_path = '../HousingPricesCompetitionForKaggleLearnUsers/Data/test.csv'

In [18]:
# Read the test dataset from the path defined above
test_data = pd.read_csv(filepath_or_buffer=test_data_path)

In [19]:
# Build test_X from test_data, keeping only the columns used for prediction.
# The column names come from the `features` list.
test_X = test_data[features]

In [20]:
# Predict sale prices for the test features with the model fitted on all training data
test_preds = rf_model_on_full_data.predict(X=test_X)

In [21]:
# Show the first five test predictions
test_preds[:5]

array([122656.58, 156789.  , 182959.  , 178102.  , 189049.48])

In [22]:
# Add a new column to the test dataset holding the predicted sale prices.
# Re-running this cell simply overwrites the same column with test_preds.
test_data['SalesPriceForecast'] = test_preds

In [23]:
# Show a random sample of five rows: identifier, two features and the forecast
preview_columns = ['Id', 'LotArea', 'YearBuilt', 'SalesPriceForecast']
test_data[preview_columns].sample(n=5)

Unnamed: 0,Id,LotArea,YearBuilt,SalesPriceForecast
1204,2665,7740,2006,283155.03
989,2450,8398,1910,128459.04
1037,2498,5400,1958,115322.32
230,1691,8174,2003,175869.12
121,1582,7480,1972,121651.0
