## Import Required Dependencies
This solution relies on several external libraries (pandas, scikit-learn and joblib). The following cell imports them; they must already be installed in the active environment.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib as jl

Load the housing dataset from its CSV file into a pandas DataFrame.

In [22]:
# Read the housing data CSV (path is relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="ml_house_data_set.csv")

## Data Cleaning and Manipulation Steps

In [23]:
# Fields from the data set that we don't want to include in our model
excluded_columns = ['house_number', 'unit_number', 'street_name', 'zip_code']

# Build the feature frame from a copy of `df` without the excluded fields.
# `df` itself is left untouched, so this cell can be re-run without raising
# KeyError and `df` keeps matching the file that was loaded.
# The categorical columns are replaced with one-hot encoded columns.
features_df = pd.get_dummies(
    df.drop(columns=excluded_columns),
    columns=['garage_type', 'city'],
)

In [24]:
# 'sale_price' is the target the model is trained to predict, so it must not
# remain among the input features
features_df = features_df.drop(columns=['sale_price'])

In [30]:
# Create arrays with both the features (X) and the target values (y).
# dtype=float is requested explicitly: recent pandas versions emit boolean
# one-hot columns from get_dummies, and converting a frame that mixes boolean
# and numeric columns without a dtype produces an object array instead of a
# numeric matrix.
X = features_df.to_numpy(dtype=float)
y = df['sale_price'].to_numpy()

# Split the data set in a training set (70%) and a test set (30%);
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Show the shape of each split as a quick sanity check
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(29892, 63)
(12811, 63)
(29892,)
(12811,)


## Prepare and Train the Model

In [31]:
# Use scikit-learn's ensemble gradient boosting regressor as the machine
# learning model and fit it on the training split in one step
# (fit() returns the fitted estimator itself).
model = ensemble.GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.1,
    n_estimators=2000,
    max_depth=6,
    min_samples_leaf=9,
    max_features=0.1,
    random_state=0,
).fit(X_train, y_train)

# Persist the trained model to disk so other programs can load and reuse it
jl.dump(model, 'trained_house_classifier_model.pkl')

['trained_house_classifier_model.pkl']

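### Perform Evaluations
Evaluate the model on both the training and the test datasets. Comparing the two mean absolute errors shows how well the model generalizes to data it was not trained on.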

In [32]:
# Mean absolute error on the training set (data the model was fitted on).
# The result is named for the metric it actually holds (MAE, not MSE).
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % train_mae)

# Mean absolute error on the test set (data held out from training)
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % test_mae)

Training Set Mean Absolute Error: 43659.7847
Test Set Mean Absolute Error: 58612.9516
