In [None]:
import pandas as pd

In [None]:
# Load melb_data.csv (resolved relative to the working directory) into a DataFrame.
melb_data = pd.read_csv(filepath_or_buffer="melb_data.csv")
# Preview the first five rows as the cell's output.
melb_data.head(n=5)

In [None]:
# Summary statistics of the numerical data, transposed so each statistic becomes a column:
#   "count"             - how many rows have non-missing values
#   "mean"              - the average
#   "std"               - the standard deviation
#   "min" / "max"       - the minimum and maximum values
#   "25%", "50%", "75%" - the 25th, 50th and 75th percentiles
melb_data.describe().T

In [None]:
# View the labels of all columns in the dataset
melb_data.columns

In [None]:
# Keep only complete rows: dropna along the index axis discards every row that
# has a missing value (think of "na" as "not available").
melb_data_new = melb_data.dropna(axis="index")

In [None]:
# Summary statistics again, this time for the rows that survived dropna.
melb_data_new.describe().T

In [None]:
# A single column can be pulled out by attribute access or by bracket lookup;
# bracket lookup is used here.
# y is what we call the "Prediction Target": the Price column.
y = melb_data_new["Price"]

In [None]:
# Independent features used to predict y.
# NOTE(review): "Lattitude" / "Longtitude" presumably mirror the CSV's own
# header spelling - confirm before "correcting" them.
melb_features = [
    "Rooms",
    "Bathroom",
    "Landsize",
    "Lattitude",
    "Longtitude",
]
X = melb_data_new[melb_features]

In [None]:
# Summary statistics for the selected feature columns only.
X.describe().T

In [None]:
# First five feature rows, transposed so each house is shown as a column.
X.head().T

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Build a decision-tree regressor; random_state=1 fixes the estimator's seed
# so that re-running the cell is repeatable.
melb_model = DecisionTreeRegressor(random_state=1)
# Fit on all of X / y - no rows are held out at this stage.
melb_model.fit(X, y)

In [None]:
# Show the first five feature rows, then the model's predictions for those same rows.
# sep="\n" puts each heading and its value on separate lines.
print("Making predictions for the following 5 houses:", X.head(), sep="\n")
print("The predictions are", melb_model.predict(X.head()), sep="\n")

In [None]:
# With the MAE metric, we take the absolute value of each error. This converts each error to a positive number. 
# We then take the average of those absolute errors. This is our measure of model quality.
from sklearn.metrics import mean_absolute_error

In [None]:
# Predict on the same feature matrix X that melb_model was fitted on, so these
# are in-sample predictions; store them for the error calculation.
predicted_melb_model = melb_model.predict(X)

In [None]:
# Mean Absolute Error of the in-sample predictions against the actual prices.
mean_absolute_error(y_true=y, y_pred=predicted_melb_model)

In [None]:
# Since a model's practical value comes from making predictions on new data, we measure performance on data 
# that wasn't used to build the model. The most straightforward way to do this is to exclude some data from 
# the model-building process, and then use that data to test the model's accuracy on examples it hasn't seen before. 
# This held-out data is called validation data.

In [None]:
# The scikit-learn library has a function train_test_split to break up the data into two pieces. 
# We'll use some of that data as training data to fit the model, and we'll use the other data as 
# validation data to calculate mean_absolute_error.
from sklearn.model_selection import train_test_split

In [None]:
# Split the features and target into a training part and a held-out part.
# No test_size/train_size is passed, so scikit-learn's default proportion
# applies; random_state=101 makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=101)

In [None]:
    melb_model2 = DecisionTreeRegressor(random_state=101)
melb_model2.fit(train_X, train_y)

In [None]:
# Predict prices for the held-out rows, which melb_model2 was not fitted on.
test_prediction = melb_model2.predict(test_X)

In [None]:
# Mean Absolute Error on the held-out rows (out-of-sample error).
print(mean_absolute_error(y_true=test_y, y_pred=test_prediction))

In [None]:
# This phenomenon is called overfitting: a model matches the training data almost perfectly, 
# but does poorly on validation data and other new data.

In [None]:
# When a model fails to capture important distinctions and patterns in the data, so that it performs poorly 
# even on the training data, that is called underfitting.

In [None]:
def get_mae(max_leaf_nodes, train_X, train_y, test_X, test_y):
    """Return the hold-out Mean Absolute Error of a size-limited decision tree.

    A fresh DecisionTreeRegressor (random_state=101) capped at
    ``max_leaf_nodes`` leaves is fitted on ``train_X`` / ``train_y``; its
    predictions for ``test_X`` are then scored against ``test_y``.

    Args:
        max_leaf_nodes: passed straight through as the regressor's
            ``max_leaf_nodes`` argument.
        train_X: features used for fitting.
        train_y: targets used for fitting.
        test_X: features to predict on.
        test_y: true targets for ``test_X``.

    Returns:
        The value of ``mean_absolute_error(test_y, predictions)``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=101)
    # fit() returns the fitted estimator, so predict() can be chained onto it.
    held_out_predictions = tree.fit(train_X, train_y).predict(test_X)
    return mean_absolute_error(test_y, held_out_predictions)

In [None]:
# Compare hold-out MAE across tree sizes, from very small to very large.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes=max_leaf_nodes,
        train_X=train_X,
        train_y=train_y,
        test_X=test_X,
        test_y=test_y,
    )
    # %d truncates the floating-point MAE to a whole number for display.
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

In [None]:
# Here's the takeaway: Models can suffer from either:
# Overfitting: capturing spurious patterns that won't recur in the future, leading to less accurate predictions, or
# Underfitting: failing to capture relevant patterns, again leading to less accurate predictions.
# We use validation data, which isn't used in model training, to measure a candidate model's accuracy. 
# This lets us try many candidate models and keep the best one.

In [None]:
# Submitted my entry to a House Prediction Competition, using a Random Forest Regressor as my selected model.

In [None]:
# Shell escapes: stage everything under the current directory, commit it with
# a fixed message, then push.
# NOTE(review): these commands run every time the notebook is executed end to
# end - confirm that publishing from inside the notebook is intended.
!git add .
!git commit -m "Handling Overfitting and Underfitting"
!git push