In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [24]:
# Load the competition's training and test tables, using each house's Id as the row index
X_full, X_test_full = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in (
        "housing_prices_competition/train.csv",
        "housing_prices_competition/test.csv",
    )
)

In [25]:
# Separate the target (sale price) from a hand-picked subset of predictor columns
y = X_full["SalePrice"]
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
# .copy() yields independent frames instead of selections tied to the full tables
X, X_test = (frame[features].copy() for frame in (X_full, X_test_full))

In [4]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
# Note: x_test / y_test here are the validation split, not the competition test set (X_test).
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=33,
)

In [6]:
# Peek at the first ten rows of the training split
x_train.head(n=10)

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
821,6000,1953,936,0,1,2,4
856,10970,1978,1026,0,1,3,5
849,9600,1976,1094,761,2,3,7
854,17920,1955,1779,0,1,3,6
649,1936,1970,630,0,1,1,3
318,9900,1993,1372,1274,2,4,9
1363,8499,2006,616,796,2,3,6
176,10029,1988,1164,896,2,4,8
16,11241,1970,1004,0,1,2,5
1000,10206,1952,944,0,1,2,4


In [9]:
# Define candidate Random Forest models to compare on the validation split

import sklearn
from sklearn.ensemble import RandomForestRegressor

# scikit-learn 1.0 renamed the 'mae' split criterion to 'absolute_error', and 1.2
# removed the old spelling, so use whichever name the installed release accepts.
mae_criterion = 'absolute_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mae'

# All candidates share random_state=33 so differences come from the hyperparameters only
model_1 = RandomForestRegressor(n_estimators=50, random_state=33)  # fewer trees
model_2 = RandomForestRegressor(n_estimators=100, random_state=33)  # 100 trees
model_3 = RandomForestRegressor(n_estimators=100, criterion=mae_criterion, random_state=33)  # absolute-error splits
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=33)  # more trees, coarser splits
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=33)  # depth-limited trees

# Order matters: the comparison loop reports them as "Model 1" .. "Model 5"
models = [model_1, model_2, model_3, model_4, model_5]

In [12]:
# Scoring helper used to compare the candidate models on the hold-out split
from sklearn.metrics import mean_absolute_error


def score_model(model, x_t=x_train, x_v=x_test, y_t=y_train, y_v=y_test):
    """Fit ``model`` and return its mean absolute error on the validation data.

    The default arguments are the split variables as they exist when this cell
    runs (Python evaluates defaults once, at definition time), so re-run this
    cell after re-creating the split. ``model`` is fitted in place.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        x_t: predictors used for fitting.
        x_v: predictors used for scoring.
        y_t: target values matching ``x_t``.
        y_v: target values matching ``x_v``.

    Returns:
        Mean absolute error between ``y_v`` and the predictions for ``x_v``.
    """
    model.fit(x_t, y_t)
    validation_preds = model.predict(x_v)
    return mean_absolute_error(y_v, validation_preds)


In [13]:
# Report every candidate's validation MAE, numbering the models from 1
for number, candidate in enumerate(models, start=1):
    print(f"Model {number} has MAE equal to {score_model(candidate)}")

Model 1 has MAE equal to 22274.536621004565
Model 2 has MAE equal to 22058.229965753428
Model 3 has MAE equal to 21892.431301369863
Model 4 has MAE equal to 22316.858275244977
Model 5 has MAE equal to 22073.215404626728


In [14]:
# model_3 had the lowest validation MAE in the recorded comparison run above;
# the choice is hard-coded, so revisit it if the scores change on a re-run.
best_model = model_3

In [15]:
# Alias, not a copy: my_model, best_model and model_3 are the same estimator object,
# so fitting my_model also refits the others.
my_model = best_model

In [26]:
# Refit the chosen model on all labelled rows (X, y) rather than only the training split;
# fit() is the cell's last expression, so the fitted estimator is what gets displayed.
my_model.fit(X,y)

RandomForestRegressor(criterion='mae', random_state=33)

In [27]:
# Predict sale prices for the competition test set and write the submission file
preds_test = my_model.predict(X_test)

output = pd.DataFrame(
    {
        'Id': X_test.index,
        'SalePrice': preds_test,
    }
)

# index=False: Id is already a regular column, so the positional index is not written
output.to_csv('housing_prices_competition/submission.csv', index=False)

## Dealing with Missing Values

In [35]:
# Re-read the raw tables so this section starts again from the full column set
X_full = pd.read_csv('housing_prices_competition/train.csv', index_col='Id')
X_test_full = pd.read_csv('housing_prices_competition/test.csv', index_col='Id')

# Discard rows that have no target, then pull the target out of the predictors
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# To keep things simple, keep only the non-object (numerical) predictors
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# Hold out 20% of the rows as a validation set
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [36]:
# (rows, columns) of the training split
X_train.shape

(1168, 36)

In [37]:
# Count the missing entries in each column of the training split
missing_val_count_by_column = X_train.isna().sum()

In [38]:
# Show only the columns that actually contain missing values
missing_val_count_by_column[missing_val_count_by_column.gt(0)]

LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64

In [45]:
# define the scoring function

import sklearn
from sklearn.metrics import mean_absolute_error


def score_model(x_t, x_v, y_t, y_v):
    """Train a fixed random forest on one dataset variant and return its validation MAE.

    This definition replaces the earlier ``score_model(model, ...)`` from the
    model-comparison section; here the model is always the same, so scores differ
    only because of how the input data was prepared.

    Args:
        x_t: predictors used for fitting.
        x_v: predictors used for scoring.
        y_t: target values matching ``x_t``.
        y_v: target values matching ``x_v``.

    Returns:
        Mean absolute error between ``y_v`` and the predictions for ``x_v``.
    """
    # scikit-learn 1.0 renamed the 'mae' criterion to 'absolute_error' and 1.2 removed
    # the old spelling, so use whichever name the installed release accepts.
    mae_criterion = 'absolute_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mae'
    model = RandomForestRegressor(n_estimators=100, criterion=mae_criterion, random_state=33)
    model.fit(x_t, y_t)
    return mean_absolute_error(y_v, model.predict(x_v))

In [46]:
# Approach 1: drop every column that has a missing value in the training split
cols_with_missing = X_train.columns[X_train.isnull().any().values].tolist()

# Remove the same columns from both splits so their feature sets stay identical
X_train_reduced = X_train.drop(columns=cols_with_missing)
X_valid_reduced = X_valid.drop(columns=cols_with_missing)

In [47]:
# Validation MAE when the incomplete columns are simply removed
print(
    "Score with dropped columns:",
    score_model(x_t=X_train_reduced, x_v=X_valid_reduced, y_t=y_train, y_v=y_valid),
)

Score with dropped columns: 17612.083510273973


In [48]:
# Imputation
from sklearn.impute import SimpleImputer

# Approach 2: fill the gaps with SimpleImputer's default strategy (the column mean).
# The imputer is fitted on the training split only and then reused for validation,
# so no statistics from the validation rows leak into the fill values.
imputer = SimpleImputer()

# The imputer returns bare arrays; restore the column names and the original Id index
# so the frames stay aligned with y_train / y_valid.
imputed_X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
imputed_X_valid = pd.DataFrame(
    imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index
)

In [49]:
# Validation MAE with the default-strategy imputation
print(
    "Score with just imputation:",
    score_model(x_t=imputed_X_train, x_v=imputed_X_valid, y_t=y_train, y_v=y_valid),
)

Score with just imputation: 17879.109417808217


In [51]:
# Approach 3: same as above but filling gaps with each column's median.
# Fitted on the training split only, then reused for the other datasets.
final_imputer = SimpleImputer(strategy='median')

# Restore the column names and the original Id index that the imputer's array output drops,
# so the frames stay aligned with y_train / y_valid.
final_imp_X_train = pd.DataFrame(
    final_imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
final_imp_X_valid = pd.DataFrame(
    final_imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index
)

In [52]:
# Validation MAE with median imputation (the printed label is the same as for the
# default-strategy run; this score is for final_imputer, strategy='median')
print(
    "Score with just imputation:",
    score_model(x_t=final_imp_X_train, x_v=final_imp_X_valid, y_t=y_train, y_v=y_valid),
)

Score with just imputation: 17797.396352739725


In [56]:
import sklearn

# scikit-learn 1.0 renamed the 'mae' criterion to 'absolute_error' and 1.2 removed
# the old spelling, so use whichever name the installed release accepts.
mae_criterion = 'absolute_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mae'

# Train the submission model on the median-imputed training split
model = RandomForestRegressor(n_estimators=100, criterion=mae_criterion, random_state=33)
model.fit(final_imp_X_train, y_train)

# Fill the test set with the medians learned from the training split, keeping the
# column names and the Id index that the imputer's array output would otherwise drop
final_imp_X_test = pd.DataFrame(
    final_imputer.transform(X_test), columns=X_test.columns, index=X_test.index
)
preds_test = model.predict(final_imp_X_test)

# Write the submission; this overwrites the file produced by the first section
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('housing_prices_competition/submission.csv', index=False)