In [28]:
#Todos os imports necessários
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy import stats
from sklearn.ensemble import RandomForestRegressor


In [29]:
# Importando o dataset de treino
base = pd.read_csv('train.csv')

In [30]:
# Eliminando colunas com mais de 10% de valores vazios
eliminar = base.columns[(base.isnull().sum() / base.shape[0]) > 0.1]
base = base.drop(eliminar, axis=1)

# Selecionando apenas as colunas numéricas
colunas = base.columns[base.dtypes != 'object']
base2 = base.loc[:, colunas]
base2

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,9550,7,5,1915,1970,0.0,216,0,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,14260,8,5,2000,2000,350.0,655,0,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,7917,6,5,1999,2000,0.0,0,0,...,0,40,0,0,0,0,0,8,2007,175000
1456,1457,20,13175,6,6,1978,1988,119.0,790,163,...,349,0,0,0,0,0,0,2,2010,210000
1457,1458,70,9042,7,9,1941,2006,0.0,275,0,...,0,60,0,0,0,0,2500,5,2010,266500
1458,1459,20,9717,5,6,1950,1996,0.0,49,1029,...,366,0,112,0,0,0,0,4,2010,142125


In [31]:
# Substituindo os valores vazios por -1
base2 = base2.fillna(-1)
base2

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,9550,7,5,1915,1970,0.0,216,0,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,14260,8,5,2000,2000,350.0,655,0,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,7917,6,5,1999,2000,0.0,0,0,...,0,40,0,0,0,0,0,8,2007,175000
1456,1457,20,13175,6,6,1978,1988,119.0,790,163,...,349,0,0,0,0,0,0,2,2010,210000
1457,1458,70,9042,7,9,1941,2006,0.0,275,0,...,0,60,0,0,0,0,2500,5,2010,266500
1458,1459,20,9717,5,6,1950,1996,0.0,49,1029,...,366,0,112,0,0,0,0,4,2010,142125


In [32]:
# Removendo outliers usando Z-score
z_scores = np.abs(stats.zscore(base2))
threshold = 3
base2 = base2[(z_scores < threshold).all(axis=1)]

In [39]:
# Criando novas features
base2['TotalSF'] = base2['TotalBsmtSF'] + base2['1stFlrSF'] + base2['2ndFlrSF']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base2['TotalSF'] = base2['TotalBsmtSF'] + base2['1stFlrSF'] + base2['2ndFlrSF']


In [34]:
# Selecionando X e y
X = base2.drop(['SalePrice'], axis=1)
y = base2.SalePrice

# Dividindo os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [35]:
# Padronizando os dados
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [36]:
# Usando um modelo de Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Avaliando o erro da regressão
print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')

Mean Squared Error: 434883222.7210818


In [37]:
# Validação cruzada
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
print(f'Cross-Validated MSE: {-cv_scores.mean()}')

Cross-Validated MSE: 492002832.29014283


In [38]:
# Importando a base de teste
teste = pd.read_csv('test.csv')

# Eliminando as mesmas colunas da base de treino
teste = teste.drop(eliminar, axis=1)

# Selecionando apenas as colunas numéricas
colunas2 = teste.columns[teste.dtypes != 'object']
teste = teste.loc[:, colunas2]

# Substituindo os valores vazios por -1
teste = teste.fillna(-1)

# Criando novas features para o teste
teste['TotalSF'] = teste['TotalBsmtSF'] + teste['1stFlrSF'] + teste['2ndFlrSF']

# Padronizando a base de teste
teste_scaled = scaler.transform(teste)

# Fazendo a previsão para a base de teste
y_pred_test = model.predict(teste_scaled)

# Adicionando a coluna de previsão na base de teste
teste['SalePrice'] = y_pred_test

# Extraindo apenas as colunas Id e SalePrice
resultado = teste[['Id', 'SalePrice']]
print(resultado.head(3))
# Podemos então exportar essa base
resultado.to_csv('resultado.csv', index=False)

     Id  SalePrice
0  1461  129756.50
1  1462  149308.37
2  1463  187954.35
