In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file under the Kaggle input directory and echo each full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the training data (path is relative to the working directory) into a DataFrame.
TRAIN_CSV_PATH = 'train.csv'
df_train = pd.read_csv(TRAIN_CSV_PATH)

## Initial exploration observations:
* The dataset is about 11 years out of date (relative to 2021; the newest home was built in 2010)
* Houses were sold between 2006 and 2010 (during the housing crash)

In [3]:
# Fixed year that home ages are measured against.
REFERENCE_YEAR = 2021

lot_areas = df_train['LotArea']
build_years = df_train['YearBuilt']

avg_lot_size = lot_areas.mean()
newest_home_age = REFERENCE_YEAR - build_years.max()

# %d drops the fractional part of the mean lot size for display.
summary_template = "Average Lot Size: %d \t\t Newest Home Age: %d"
print(summary_template % (avg_lot_size, newest_home_age))

Average Lot Size: 10516 		 Newest Home Age: 11


In [4]:
# Earliest and latest values of the build-year and sale-year columns.
year_built = df_train['YearBuilt']
year_sold = df_train['YrSold']

min_year_built, max_year_built = year_built.min(), year_built.max()
min_year_sold, max_year_sold = year_sold.min(), year_sold.max()

year_range_template = "Min Year Built: %d \t\t Max Year Built: %d\n Min Year Sold: %d \t\t Max Year Sold: %d"
year_range_values = (min_year_built, max_year_built, min_year_sold, max_year_sold)
print(year_range_template % year_range_values)

Min Year Built: 1872 		 Max Year Built: 2010
 Min Year Sold: 2006 		 Max Year Sold: 2010


## Build Models (Decision Tree, then Random Forest)

## Building a Model - Decision Tree
* Select prediction target *y*
* Choose features (set of *X*)
* Build model
* Validate model using mean absolute error (MAE)
* Compare different tree sizes

In [5]:
# Prediction target: the sale price column.
y = df_train['SalePrice']

# Predictor columns used as model inputs.
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = df_train[features]

# Summary statistics of the selected features (shown as the cell output).
X.describe()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,1971.267808,1162.626712,346.992466,1.565068,2.866438,6.517808
std,9981.264932,30.202904,386.587738,436.528436,0.550916,0.815778,1.625393
min,1300.0,1872.0,334.0,0.0,0.0,0.0,2.0
25%,7553.5,1954.0,882.0,0.0,1.0,2.0,5.0
50%,9478.5,1973.0,1087.0,0.0,2.0,3.0,6.0
75%,11601.5,2000.0,1391.25,728.0,2.0,3.0,7.0
max,215245.0,2010.0,4692.0,2065.0,3.0,8.0,14.0


In [6]:
# Hold out a validation set; the seed is fixed so the split is repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Instantiate a decision tree with default settings (seed fixed) and fit it on the training split.
train_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)

# Score the validation split; max_leaf_nodes is left at its default here.
preds_val = train_model.predict(val_X)
baseline_val_mae = mean_absolute_error(val_y, preds_val)
print("Validation MAE when not specifying max_leaf_nodes: ", baseline_val_mae)

Validation MAE when not specifying max_leaf_nodes:  29652.931506849316


In [7]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor built with the given ``max_leaf_nodes`` and
    ``random_state=1`` is fitted on ``train_X``/``train_y``; its predictions
    for ``val_X`` are then scored against ``val_y``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    tree.fit(train_X, train_y)
    val_predictions = tree.predict(val_X)
    val_mae = mean_absolute_error(val_y, val_predictions)
    return val_mae

In [8]:
# Candidate values of max_leaf_nodes to compare.
candidates_max_leaf_nodes = [5, 50, 500, 5000]

# Validation MAE for each candidate, keyed by its max_leaf_nodes value.
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidates_max_leaf_nodes}

# The best tree size is the key with the lowest validation MAE.
best_tree_size = min(scores, key=scores.get)

# Show every candidate's score, then name the winning size explicitly.
print("Validation MAE for each value of max_leaf_nodes: \n", scores)
print("Best value of max_leaf_nodes: %d (validation MAE: %f)" % (best_tree_size, scores[best_tree_size]))

Validation MAE for best value of max_leaf_nodes: 
 {5: 35044.51299744237, 50: 27405.930473214907, 500: 28380.917944156296, 5000: 29001.372602739724}


In [10]:
# Fit a random forest (default hyperparameters, fixed seed) on the training split.
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)

# Score on the validation split. Arguments follow the (y_true, y_pred) order
# of mean_absolute_error, consistent with the decision-tree cells.
rf_preds_val = rf_model.predict(val_X)
rf_mae_val = mean_absolute_error(val_y, rf_preds_val)
print("Validation MAE for Random Forest Model: ", rf_mae_val)

Validation MAE for Random Forest Model:  21857.15912981083


## Competition Model

In [14]:
# Fit a separate random forest on all of X and y (not the train_X/train_y
# split); this is the model used for the competition predictions.
rf_model_on_full_data = RandomForestRegressor(random_state=1)
# Left as the cell's final expression, so the notebook echoes the estimator repr.
rf_model_on_full_data.fit(X, y)

RandomForestRegressor(random_state=1)

In [18]:
# Load the competition test set (path is relative to the working directory).
TEST_CSV_PATH = 'test.csv'
test_data = pd.read_csv(TEST_CSV_PATH)

# Select the same feature columns the model was fitted on.
test_X = test_data[features]

# Predict sale prices with the forest fitted on the full training data.
test_preds = rf_model_on_full_data.predict(test_X)

In [17]:
# Competition submission format. Uncomment the lines below to write submission.csv; keep them commented out otherwise.

#output = pd.DataFrame({'Id': test_data.Id,
#                       'SalePrice': test_preds})
#output.to_csv('submission.csv', index=False)