In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn import tree

In [None]:
# Step 1 : Load the input data describing the houses
file_path = './houses.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows to check that the file parsed as expected
data.head(n=5)

In [None]:
# Summary statistics of the loaded data (pandas' describe() on the whole frame);
# a quick way to spot odd value ranges before choosing features
data.describe()

In [None]:
# Columns used as model inputs (data.columns lists every available column).
# NOTE: 'Lattitude' / 'Longtitude' are kept exactly as written - presumably they
# mirror the CSV headers; confirm against houses.csv before "fixing" the spelling.
features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]
# Inputs: the selected feature columns; target: the Price column
full_input_data = data[features]
full_output_data = data['Price']

In [None]:
# Step 2 : Hold out part of the data for validation (80% training, 20% validation).
# random_state is fixed so the same rows land in each part on every run.
(
    training_input,
    validation_input,
    training_output,
    validation_output,
) = train_test_split(
    full_input_data,
    full_output_data,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Step 3 : Create a simple model (decision tree limited to 10 leaves).
# random_state is pinned (as in the train/validation split and in get_mae) so
# that re-running the notebook rebuilds the same tree and reports the same error.
model = DecisionTreeRegressor(max_leaf_nodes=10, random_state=0)

In [None]:
# Step 4 : Fit the model on the training part (80% of the rows)
model.fit(X=training_input, y=training_output)

In [None]:
# Step 5 : Validate the model - predict the target for the held-out 20% and put
# the predictions next to the true values to see how close they are
predictions = model.predict(validation_input)
df = pd.DataFrame(
    {
        'Actual': validation_output,
        'Predicted': predictions,
    }
)
# (Predictions for every row would be model.predict(full_input_data).)
df.head(n=10)

In [None]:
# Mean absolute gap between true and predicted values on the validation part
print(mean_absolute_error(y_true=validation_output, y_pred=predictions))

In [None]:
# Step 6 : Visualize our tree/model
# tree.export_graphviz(model, out_file='tree_data' , feature_names=features, filled = True)

In [None]:
# Step 7 : Create similar models with different configs and validate them
def get_mae(max_leaf_nodes, train_X, validation_X, train_y, validation_y):
    """Return the validation MAE of a decision tree capped at ``max_leaf_nodes``.

    A fresh ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``train_X`` / ``train_y``; its predictions for ``validation_X`` are then
    scored against ``validation_y`` with ``mean_absolute_error``.
    """
    tree_model = DecisionTreeRegressor(
        random_state=0,
        max_leaf_nodes=max_leaf_nodes,
    ).fit(train_X, train_y)
    return mean_absolute_error(validation_y, tree_model.predict(validation_X))


candidate_max_leaf_nodes = [4, 5, 25, 50, 100, 250, 500, 700, 1000]
# Score every candidate tree size on the validation data and remember each
# result, so the best size is actually selected rather than read off by eye.
mae_by_max_leaf_nodes = {}
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, training_input, validation_input, training_output, validation_output)
    mae_by_max_leaf_nodes[max_leaf_nodes] = my_mae
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

# The ideal tree size is the candidate with the lowest validation error
# (on ties, min() keeps the first candidate, i.e. the smallest tree).
best_tree_size = min(mae_by_max_leaf_nodes, key=mae_by_max_leaf_nodes.get)
print("Best max leaf nodes: %d" % best_tree_size)

In [None]:
# Step 8 : Try a different kind of model (linear regression) on the same split.
# NOTE: this rebinds `model`, so from here on `model` is the linear regression,
# not the decision tree created earlier.
model = LinearRegression().fit(training_input, training_output)
predictions_for_linear_regression = model.predict(validation_input)
# Score it on the validation part, the same way as the decision trees
mae = mean_absolute_error(
    validation_output,
    predictions_for_linear_regression,
)
print("Linear regression - Mean Absolute Error:  %d" % mae)

In [None]:
# Step 9 : Understand the coefficients of the linear regression model
# One row per feature, so each weight can be read next to the column it multiplies.
coeff_df = pd.DataFrame(model.coef_, index=features, columns=['Coefficient'])
print(model.coef_)
# LinearRegression fits an intercept by default, so it is part of the formula too.
print("Intercept: %s" % model.intercept_)
# predicted_price = intercept + coff1 * Rooms + coff2 * Bathroom + coff3 * Landsize
#                   + coff4 * Lattitude + coff5 * Longtitude
# Display the labelled table (it was previously built but never shown).
coeff_df