In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Location of the houses CSV, given relative to the notebook's working directory.
file_path = './houses.csv'

In [None]:
# Load the CSV into a DataFrame; the feature matrix and the target are both taken from `data`.
data = pd.read_csv(file_path)

In [None]:
# Preview the first six rows to sanity-check the load.
data.head(6)

In [None]:
# Summary statistics (count, mean, std, quartiles); by default pandas reports the numeric columns.
data.describe()

In [None]:
# List the available column names; the feature list used for modelling must match these exactly.
data.columns

In [None]:
# Predictor columns fed to the models.
# NOTE(review): 'Lattitude' and 'Longtitude' look misspelled; presumably they mirror
# the CSV header verbatim -- confirm against data.columns before "fixing" them.
features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
# Feature matrix: only the columns named above, in that order.
X = data[features]
X.head()

In [None]:
# Summary statistics of the selected features; the count row shows how many non-missing values each has.
X.describe()

In [None]:
# Prediction target: the Price column of the loaded data.
y = data['Price']

In [None]:
# Hold out 20% of the rows for validation and train on the remaining 80%.
# random_state=0 fixes the shuffle so the same split is produced on every run.
train_X, validation_X, train_y, validation_y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [None]:
# Peek at the first rows of the training features.
train_X.head()

In [None]:
# Peek at the first training targets; X and y were split together, so the index should line up with train_X.
train_y.head()

In [None]:
# Summary statistics for the training features.
train_X.describe()

In [None]:
# Peek at the first rows of the held-out validation features.
validation_X.head()

In [None]:
# Summary statistics for the validation features, for comparison with the training split.
validation_X.describe()

In [None]:
# Small decision tree, capped at 10 leaves.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so equally good splits could otherwise be resolved differently
# between runs. The value 0 matches the seed already used by train_test_split
# and get_mae in this notebook.
model = DecisionTreeRegressor(max_leaf_nodes=10, random_state=0)

In [None]:
# Fit on the training split only, so the validation rows stay unseen by the model.
model.fit(train_X, train_y)

In [None]:
# Predictions for every row of X. X includes the rows the model was trained on,
# so this is only a quick look at the output, not an unbiased evaluation.
print(model.predict(X))

In [None]:
# Predictions for the held-out rows; these are what the error metric is computed on.
validation_predictions = model.predict(validation_X)

In [None]:
# Put the actual and predicted values for the validation rows side by side for a quick visual check.
df = pd.DataFrame({'Actual': validation_y, 'Predicted': validation_predictions})
df.head(10)

In [None]:
# Mean absolute error on the validation split: the average of |actual - predicted|,
# expressed in the same units as the target.
print(mean_absolute_error(validation_y, validation_predictions))

In [None]:
from sklearn import tree

In [None]:
# Export the fitted tree to a file named 'tree_data' (no extension) for Graphviz,
# labelling nodes with the feature names and colour-filling them (filled=True).
# NOTE: `model` must still be the decision tree here; the linear-regression cell
# further down rebinds `model`, so run this cell before that one.
tree.export_graphviz(model, out_file='tree_data' , feature_names=features, filled = True)

In [None]:
# Helper for comparing the same model under different hyper-parameters,
# here the max_leaf_nodes setting of a decision tree.
def get_mae(max_leaf_nodes, train_X, validation_X, train_y, validation_y):
    """Return the validation MAE of a decision tree limited to `max_leaf_nodes` leaves.

    A fresh DecisionTreeRegressor (random_state=0) is fitted on
    (train_X, train_y), used to predict validation_X, and scored against
    validation_y with mean_absolute_error.
    """
    regressor = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    regressor.fit(train_X, train_y)
    return mean_absolute_error(validation_y, regressor.predict(validation_X))

In [None]:
candidate_max_leaf_nodes = [4, 5, 25, 50, 100, 250, 500, 700, 1000]
# Score every candidate tree size on the validation split and remember the
# result, so the ideal size can be selected in code instead of read off the printout.
mae_by_leaf_nodes = {}
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, validation_X, train_y, validation_y)
    mae_by_leaf_nodes[max_leaf_nodes] = my_mae
    # %d truncates the MAE to a whole number for display only; the stored value keeps full precision.
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

# The ideal tree size is the candidate with the lowest validation MAE.
# On a tie, min() keeps the first candidate in list order, i.e. the smaller tree.
best_tree_size = min(mae_by_leaf_nodes, key=mae_by_leaf_nodes.get)
print("Best max leaf nodes: %d" % best_tree_size)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline for comparison: ordinary least squares on the same train/validation split.
# NOTE: this rebinds `model`; from here on it is the linear model, not the decision tree.
model = LinearRegression().fit(train_X, train_y)  # fit() returns the fitted estimator itself
preds_val = model.predict(validation_X)
mae = mean_absolute_error(validation_y, preds_val)
print("Linear regression - Mean Absolute Error:  %d" % mae)

In [None]:
# One row per feature, showing the coefficient the linear model learned for it.
coeff_df = pd.DataFrame(
    data=model.coef_,
    index=features,
    columns=['Coefficient'],
)
coeff_df

In [None]:
# Raw coefficient array of the linear model, one value per entry of `features`, in that order.
print(model.coef_)

In [None]:
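# predicted_price = intercept + coef1 * Rooms + coef2 * Bathroom + coef3 * Landsize + coef4 * Lattitude + coef5 * Longtitude
# (coef1..coef5 are the entries of model.coef_ in feature order; intercept is model.intercept_)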