In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Import training and test data.
# Both files are indexed by their 'Id' column so the row index carries the
# house Id. The submission cell builds its 'Id' column from X_test.index, so
# the test file must be loaded with index_col='Id' as well; otherwise the
# submission would contain a 0-based RangeIndex instead of the real Ids.
X_full = pd.read_csv("housing_prices_competition/train.csv", index_col='Id')
X_test_full = pd.read_csv("housing_prices_competition/test.csv", index_col='Id')

In [22]:
# Split the training frame into the target column and the chosen predictors,
# and select the same predictor columns from the competition test frame.
features = [
    'LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF',
    'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
]
y = X_full['SalePrice']
X = X_full.loc[:, features].copy()
X_test = X_test_full.loc[:, features].copy()

In [4]:
# Hold out 20% of the training rows as a validation set.
# Note: x_test / y_test are this validation split; they are distinct from
# X_test (capital X), which holds the competition test features.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=33,
)

In [6]:
# Preview the first 10 rows of the training split
x_train.head(10)

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
821,6000,1953,936,0,1,2,4
856,10970,1978,1026,0,1,3,5
849,9600,1976,1094,761,2,3,7
854,17920,1955,1779,0,1,3,6
649,1936,1970,630,0,1,1,3
318,9900,1993,1372,1274,2,4,9
1363,8499,2006,616,796,2,3,6
176,10029,1988,1164,896,2,4,8
16,11241,1970,1004,0,1,2,5
1000,10206,1952,944,0,1,2,4


In [9]:
# Define candidate Random Forest models

import sklearn
from sklearn.ensemble import RandomForestRegressor

# The mean-absolute-error split criterion was named 'mae' before
# scikit-learn 1.0; it was renamed to 'absolute_error' in 1.0 and the old
# name was removed in 1.2. Pick the spelling the installed version accepts so
# this cell runs on both old and current releases.
mae_criterion = 'absolute_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mae'

model_1 = RandomForestRegressor(n_estimators=50, random_state=33)
model_2 = RandomForestRegressor(n_estimators=100, random_state=33)
model_3 = RandomForestRegressor(n_estimators=100, criterion=mae_criterion, random_state=33)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=33)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=33)

# Candidates in the order they are numbered when scored
models = [model_1, model_2, model_3, model_4, model_5]

In [12]:
# Set up scoring of resulting models for comparison
from sklearn.metrics import mean_absolute_error

def score_model(model, x_t=None, x_v=None, y_t=None, y_v=None):
    """Fit ``model`` on a training split and return its MAE on a validation split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    x_t, y_t : training features / target; default to the notebook's
        ``x_train`` / ``y_train``.
    x_v, y_v : validation features / target; default to the notebook's
        ``x_test`` / ``y_test``.

    Returns
    -------
    The value of ``mean_absolute_error(y_v, model.predict(x_v))``.
    """
    # Resolve the defaults at call time rather than in the signature: defaults
    # written as ``x_t=x_train`` are evaluated once, when the function is
    # defined, so re-running the split cell would leave this function scoring
    # against the old split until it is redefined.
    x_t = x_train if x_t is None else x_t
    x_v = x_test if x_v is None else x_v
    y_t = y_train if y_t is None else y_t
    y_v = y_test if y_v is None else y_v

    model.fit(x_t, y_t)
    return mean_absolute_error(y_v, model.predict(x_v))


In [13]:
# Fit each candidate and report its validation MAE, numbering models from 1
for number, candidate in enumerate(models, start=1):
    mae = score_model(candidate)
    print(f"Model {number} has MAE equal to {mae}")

Model 1 has MAE equal to 22274.536621004565
Model 2 has MAE equal to 22058.229965753428
Model 3 has MAE equal to 21892.431301369863
Model 4 has MAE equal to 22316.858275244977
Model 5 has MAE equal to 22073.215404626728


In [14]:
# Keep model_3: it had the lowest validation MAE in the recorded comparison run
best_model = model_3

In [15]:
# Alias for the selected model (same object, so fitting my_model also refits best_model)
my_model = best_model

In [16]:
# Retrain the selected model on the complete training set (X, y),
# i.e. including the rows previously held out for validation
my_model.fit(X, y)

RandomForestRegressor(criterion='mae', random_state=33)

In [23]:
# Predict using test data and save output to csv
# Predict sale prices for the competition test features
preds_test = my_model.predict(X_test)

# Assemble the two-column submission frame.
# NOTE(review): the 'Id' column is taken from X_test.index, which only holds
# the real house Ids if the test CSV was loaded with index_col='Id' — confirm.
output = pd.DataFrame(
    {
        'Id': X_test.index,
        'SalePrice': preds_test,
    }
)

# Write the submission without the frame's own row index
output.to_csv(
    'housing_prices_competition/submission.csv',
    index=False,
)