In [9]:
#import necessary packages and functions
import os
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# load train and test data
# The directory holding the competition CSVs is defined once. It can be
# overridden with the HOUSE_PRICES_DATA_DIR environment variable so the
# notebook runs on other machines; the default is the original local folder.
DATA_DIR = Path(os.environ.get(
    'HOUSE_PRICES_DATA_DIR',
    'C:/Users/lukec/Downloads/house-prices-advanced-regression-techniques',
))
train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')

In [6]:
# preview the training data: show its first five rows
train_data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# preview the test data: show its first five rows
test_data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [11]:
# columns the model is allowed to look at
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'TotRmsAbvGrd',
]

# prediction target, taken from the training data
y = train_data.SalePrice

# restrict the train and test data to the chosen feature columns
X = train_data[features]
X_test = test_data[features]

# carve a validation set out of the training data
# (no explicit test_size is passed; random_state is fixed so the split is repeatable)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=1,
)

In [12]:
# define object for Simple Imputer (constructed with its default settings)
data_imputer = SimpleImputer()

# impute values for train and val data
# The imputer is fitted on the training split only and then re-used, unchanged,
# on the validation split. The transformed arrays are wrapped back into
# DataFrames with the original column labels and row index; a bare
# pd.DataFrame(array) would replace both with 0..n-1, so the frames would no
# longer line up with y_train / y_valid by label.
# NOTE(review): this assumes the imputer returns as many columns as it was
# given (i.e. no feature column is entirely missing) - confirm if the feature
# list changes.
X_train = pd.DataFrame(
    data_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_valid = pd.DataFrame(
    data_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

In [20]:
# define model to be used (120 trees, fixed seed so results are repeatable)
model = RandomForestRegressor(n_estimators=120, random_state=1)

# fit model to train data
model.fit(X_train, y_train)

# check accuracy of model on validation data using mean_absolute_error
val_preds = model.predict(X_valid)
print("The MAE for the validation data is:")
# mean_absolute_error is documented as (y_true, y_pred): pass the true values
# first. The absolute error is symmetric, so the printed number is unchanged,
# but the call now follows the documented argument order.
print(mean_absolute_error(y_valid, val_preds))

The MAE for the validation data is:
22677.58986138291


In [21]:
# now train the model for the entire train data, to improve accuracy
# The full training data goes through the same preprocessing step that was used
# during validation (imputation) before the model sees it. A fresh imputer is
# fitted on all of X so the fill values come from the entire training set
# rather than only the earlier train split.
final_imputer = SimpleImputer()
X_full = pd.DataFrame(
    final_imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
model.fit(X_full, y)

# preprocess the test data in the same way as the train data
# Column labels and row index are kept so the model predicts on a frame with
# the same feature names it was fitted with.
# NOTE(review): assumes the imputer keeps every feature column (none is
# entirely missing in X) - confirm if the feature list changes.
X_test = pd.DataFrame(
    final_imputer.transform(X_test),
    columns=X.columns,
    index=X_test.index,
)

# make predictions for test data
test_preds = model.predict(X_test)

In [22]:
# output data to csv file for competition submission
# The identifier column is written as 'Id', the same spelling as the Id column
# in the loaded data (the header was previously 'ID').
# NOTE(review): the competition's sample submission is expected to use the
# header "Id,SalePrice" - confirm against sample_submission.csv.
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)