In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import pandas as pd

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """
    Fit a decision tree capped at `max_leaf_nodes` leaves and score it.

    The tree is trained on (train_X, train_y), used to predict val_X, and the
    mean absolute error of those predictions against val_y is returned.
    """
    # max_leaf_nodes caps how many leaves (final prediction groups) the tree
    # may have; random_state=0 keeps repeated runs reproducible.
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    predicted = tree.predict(val_X)
    return mean_absolute_error(val_y, predicted)

# Load data
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
iowa_file_path = '/Users/duey/CodingProjects/KaggleIntroML/data/train.csv'
home_data = pd.read_csv(iowa_file_path)

# Choose target and features
y = home_data.SalePrice  # target column the models predict
features = [  # input columns fed to the models
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = home_data[features]  # subset of home_data restricted to the feature columns

# Split data into training and validation sets, for both features and target.
# random_state fixes the shuffle so the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Baseline model: a decision tree with no max_leaf_nodes limit
iowa_model = DecisionTreeRegressor(random_state=1)

# Fit the baseline on the training split only
iowa_model.fit(train_X, train_y)

# Predict the held-out validation rows and measure the mean absolute error.
# mean_absolute_error is documented as (y_true, y_pred), so the true values go
# first -- the same order get_mae uses.
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
# {:,.0f} formats the number with thousands separators and no decimal places
print("Validation MAE: {:,.0f}".format(val_mae))

candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Validation MAE for each candidate tree size, keyed by its max_leaf_nodes value
my_mae = {
    n_leaves: get_mae(n_leaves, train_X, val_X, train_y, val_y)
    for n_leaves in candidate_max_leaf_nodes
}
# Values from a recorded run:
# 5: 35190.33670788684
# 25: 28501.887126575195
# 50: 27825.888386265695
# 100: 28653.10992820276
# 250: 31738.366204184345
# 500: 32662.00407479887

# Get the best tree size: walk the (size, MAE) pairs of my_mae and let min()
# compare them by the MAE component, yielding the size with the lowest error
# together with that error.
best_tree_size, best_mae = min(my_mae.items(), key=lambda pair: pair[1])
print(f"Best tree size: {best_tree_size}")
print("Best MAE: {:,.0f}".format(best_mae))

# Fit the final model with the best tree size, using all rows (no hold-out).
# NOTE: random_state=1 here, whereas get_mae scored the candidates with
# random_state=0.
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
final_model.fit(X, y)

# Predict the very rows the model was just fitted on. The resulting figure is
# in-sample (training) error, not an estimate of error on unseen data.
# mean_absolute_error is documented as (y_true, y_pred), so y goes first.
final_predictions = final_model.predict(X)
final_mae = mean_absolute_error(y, final_predictions)
print("Final MAE: {:,.0f}".format(final_mae))
# Recorded run:
#   Validation MAE: 32,966
#   Final MAE: 20,289
# The final model was $20,289 off on average on the data it was trained on.
# That is $12,677 lower than the baseline validation MAE, but the two numbers
# are not directly comparable: one is measured on held-out rows, the other on
# training rows.





Validation MAE: 32,966
Best tree size: 50
Best MAE: 27,826
Final MAE: 20,289
