In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the training data (train.csv) into a dataframe
df_train = pd.read_csv('train.csv')

# Summary statistics of the dataframe's columns
df_train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


## Initial examination:
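* The newest home in the dataset was built in 2010, so the data is about 11 years old relative to this analysis (2021)
* Houses were sold between 2006 and 2010 (a period that includes the housing crash)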

In [3]:
# initial exploration
# Reference year used to express home age; presumably the year this analysis
# was written. Kept as a fixed constant so re-runs give the same numbers.
REFERENCE_YEAR = 2021

avg_lot_size = df_train['LotArea'].mean()

# Min/Max of YearBuilt and YrSold
min_year_built = df_train['YearBuilt'].min()
max_year_built = df_train['YearBuilt'].max()
min_year_sold = df_train['YrSold'].min()
max_year_sold = df_train['YrSold'].max()

# Age of the most recently built home, reusing the maximum computed above
newest_home_age = REFERENCE_YEAR - max_year_built

# Only the last expression of a cell is rendered, so these two values are
# printed explicitly instead of being left as a bare mid-cell expression.
print(f"avg_lot_size={avg_lot_size}, newest_home_age={newest_home_age}")

(min_year_built, max_year_built),(min_year_sold, max_year_sold)

((1872, 2010), (2006, 2010))

In [4]:
# Columns holding year values; GarageYrBlt has fewer non-null entries than the others
years_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
years_data = df_train[years_features]
# Summary statistics for the year columns only
years_data.describe()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt,YrSold
count,1460.0,1460.0,1379.0,1460.0
mean,1971.267808,1984.865753,1978.506164,2007.815753
std,30.202904,20.645407,24.689725,1.328095
min,1872.0,1950.0,1900.0,2006.0
25%,1954.0,1967.0,1961.0,2007.0
50%,1973.0,1994.0,1980.0,2008.0
75%,2000.0,2004.0,2002.0,2009.0
max,2010.0,2010.0,2010.0,2010.0


In [5]:
# Show the first rows of the year columns
years_data.head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt,YrSold
0,2003,2003,2003.0,2008
1,1976,1976,1976.0,2007
2,2001,2002,2001.0,2008
3,1915,1970,1998.0,2006
4,2000,2000,2000.0,2008


## Building a Model - Decision Tree
* Select prediction target *y*
* Choose features (set of *X*)
* Build model
* Validate model using mean absolute error (MAE)
* Compare different tree sizes

In [6]:
# Prediction target: the SalePrice column
y = df_train.SalePrice

# First, deliberately small feature set (three columns)
feature_names = ['BedroomAbvGr', 'FullBath', 'LotArea']
X = df_train[feature_names]
X.describe()

Unnamed: 0,BedroomAbvGr,FullBath,LotArea
count,1460.0,1460.0,1460.0
mean,2.866438,1.565068,10516.828082
std,0.815778,0.550916,9981.264932
min,0.0,0.0,1300.0
25%,2.0,1.0,7553.5
50%,3.0,2.0,9478.5
75%,3.0,2.0,11601.5
max,8.0,3.0,215245.0


In [7]:
# Decision tree with a fixed random_state, fitted on every row of X
# (no validation data is held out at this point)
train_model = DecisionTreeRegressor(random_state = 1)
train_model.fit(X, y)

DecisionTreeRegressor(random_state=1)

In [8]:
# Predict on the first five rows of X; these rows were part of the data
# the model was fitted on
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are:")
print(train_model.predict(X.head()))

Making predictions for the following 5 houses:
   BedroomAbvGr  FullBath  LotArea
0             3         2     8450
1             3         2     9600
2             3         2    11250
3             3         1     9550
4             4         2    14260
The predictions are:
[195625. 167125. 223500. 140000. 250000.]


In [9]:
# Actual sale prices of the first five rows, for comparison with the predictions
print("Actual prices:")
print(df_train.SalePrice.head())

Actual prices:
0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64


In [10]:
# Wider feature set (seven columns); rebinds feature_names and X used by later cells
feature_names = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = df_train[feature_names]
X.describe()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,1971.267808,1162.626712,346.992466,1.565068,2.866438,6.517808
std,9981.264932,30.202904,386.587738,436.528436,0.550916,0.815778,1.625393
min,1300.0,1872.0,334.0,0.0,0.0,0.0,2.0
25%,7553.5,1954.0,882.0,0.0,1.0,2.0,5.0
50%,9478.5,1973.0,1087.0,0.0,2.0,3.0,6.0
75%,11601.5,2000.0,1391.25,728.0,2.0,3.0,7.0
max,215245.0,2010.0,4692.0,2065.0,3.0,8.0,14.0


In [11]:
# Fit the same DecisionTreeRegressor instance again, now on the seven-feature X
train_model.fit(X, y)

DecisionTreeRegressor(random_state=1)

In [12]:
# Each print receives its label and its value as separate arguments, so the
# dataframe is rendered as plain text right after the label
print("Making predictions for the following 5 houses: ", X.head())
print("The predictions are: ", train_model.predict(X.head()))
print("Actual target values for those homes: ", y.head().tolist())

Making predictions for the following 5 houses:     LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
0     8450       2003       856       854         2             3   
1     9600       1976      1262         0         2             3   
2    11250       2001       920       866         2             3   
3     9550       1915       961       756         1             3   
4    14260       2000      1145      1053         2             4   

   TotRmsAbvGrd  
0             8  
1             6  
2             6  
3             7  
4             9  
The predictions are:  [208500. 181500. 223500. 140000. 250000.]
Actual target values for those homes:  [208500, 181500, 223500, 140000, 250000]


In [13]:
# Calculate the in-sample mean absolute error: predictions are made on the
# same X the model was fitted on, so this is not a validation score
predicted_home_prices = train_model.predict(X)
mean_absolute_error(y, predicted_home_prices)

62.35433789954339

In [14]:
# Split into validation and training data; no split size is passed, so
# train_test_split's default proportions apply. random_state is fixed.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Fresh tree fitted on the training portion only (rebinds train_model)
train_model = DecisionTreeRegressor(random_state=1)
train_model.fit(train_X, train_y)

# Make validation predictions and report the MAE on the held-out rows
val_predictions = train_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

32966.449315068494


In [15]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes``.

    A new DecisionTreeRegressor (random_state=1) is fitted on
    ``train_X``/``train_y``; its predictions for ``val_X`` are scored
    against ``val_y`` with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(random_state=1, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [25]:
# Leaf-count limits to try
candidate_max_leaf_nodes = [5, 50, 500, 5000]

# Validation MAE for each candidate, keyed by the candidate value
scores = dict(
    (leaf_size, get_mae(leaf_size, train_X, val_X, train_y, val_y))
    for leaf_size in candidate_max_leaf_nodes
)

# The candidate whose validation MAE is lowest
best_tree_size = min(scores, key=lambda leaf_size: scores[leaf_size])
scores

{5: 35190.33670788684,
 50: 27825.888386265695,
 500: 32345.501889541203,
 5000: 32677.51506849315}

In [21]:
# Using the best value found for max_leaf_nodes (best_tree_size); the model
# is fitted on all of X and y rather than on the training split only
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=50, random_state=1)

## Building a Model - Random Forests
* Select prediction target *y*
* Choose features (set of *X*)
* Build model
* Validate model using mean absolute error (MAE)
* Compare different tree sizes

In [23]:
# Random forest with only random_state set; fitted on the training split and
# scored by MAE on the validation split (the same train_X/val_X variables
# used for the decision trees)
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, preds))

23009.206570906717
