This notebook is a summary of ML courses.
https://www.kaggle.com/learn/intro-to-machine-learning

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the input directory, then print
# one path per line (same order os.walk yields them).
input_file_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/melbourne-housing-snapshot/melb_data.csv


Some conventions:
* the prediction target is called y
* the feature set is called X

# 1. ML model building

In [2]:
# Step: load the dataset and drop every row that contains a missing ('na') value
melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path).dropna(axis=0)

# Show the available column names as the cell output
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [3]:
# Step: select the prediction target
y = melbourne_data['Price']

# Step: choose the features.
# 'Lattitude' and 'Longtitude' are spelled exactly as the dataset's columns are.
# Wider alternative feature list, currently disabled:
#melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]
X = melbourne_data[melbourne_features]

# Preview the first rows of the feature frame
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [4]:
# Summary statistics for each feature column (count, mean, std, min, quartiles, max)
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [5]:
from sklearn.tree import DecisionTreeRegressor

# Building the ML model
# 1. Define: random_state is fixed so that repeated runs build the same tree
melbourne_model = DecisionTreeRegressor(random_state=1)

# 2. Fit on the full feature frame X and target y (nothing is held out at this stage)
melbourne_model.fit(X,y)

DecisionTreeRegressor(random_state=1)

In [6]:
# 3. Predict: run the fitted model on the first rows of X
melbourne_sample = X.head()
melbourne_predict = melbourne_model.predict(melbourne_sample)

# One print call; sep='\n' puts each item on its own line
print(
    'Prediction for sample houses:',
    melbourne_sample,
    'are:',
    melbourne_predict,
    sep='\n',
)

Prediction for sample houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
are:
[1035000. 1465000. 1600000. 1876000. 1636000.]


# 2. ML model validation

In [7]:
# MAE - Mean Absolute Error: the average absolute difference between actual and predicted prices
from sklearn.metrics import mean_absolute_error

# Predict on X, the same feature frame the model was fitted on
predicted_home_prices = melbourne_model.predict(X)

# "In-sample" score: the error is measured on the training rows themselves,
# so it is far more optimistic than the error to expect on unseen houses
mean_absolute_error(y,predicted_home_prices)


1115.7467183128902

In [8]:
# Fix for the misleading "in-sample" score: hold out part of the data and
# measure the error on rows the model never saw during fitting ("out-of-sample").
from sklearn.model_selection import train_test_split

# random_state pins the shuffle, so the same rows land in each split on every run
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state = 0)

# Seed the tree as well: scikit-learn permutes the features at every split, so an
# unseeded tree can differ between runs and the reported MAE would not be
# reproducible. The value matches the seed used for the first melbourne_model.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit on the training split only
melbourne_model.fit(train_X, train_y)

# Predict on the held-out split
test_predictions = melbourne_model.predict(test_X)

print(mean_absolute_error(test_y, test_predictions))



276495.49494297395


# 3. Controlling tree depth
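- **overfitting**: the tree has too many leaves, so each leaf holds only a few samples -> the model matches the training data almost perfectly -> poor results on validation and new data
- **underfitting**: the tree has too few leaves, so each leaf holds many samples -> the model misses important patterns and matches even the training data poorly -> poor results on validation data as well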

In [9]:
#function to help compare MAE from different values of max_leaf_nodes
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y):
    """Return the mean absolute error of a size-limited decision tree.

    A ``DecisionTreeRegressor`` restricted to ``max_leaf_nodes`` leaves (seeded
    with ``random_state=0``) is fitted on ``train_X`` / ``train_y``. Its
    predictions for ``test_X`` are then scored against ``test_y``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(test_y, tree.predict(test_X))

# Compare the validation MAE across several tree sizes.
# %d renders the float MAE as a whole number (the fractional part is dropped).
leaf_node_candidates = [10, 100, 500, 1000, 10000]
report_template = "Max leaf nodes: %d  \t\t Mean Absolute Error:  %d"
for max_leaf_nodes in leaf_node_candidates:
    my_mae = get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y)
    print(report_template % (max_leaf_nodes, my_mae))

Max leaf nodes: 10  		 Mean Absolute Error:  333908
Max leaf nodes: 100  		 Mean Absolute Error:  269191
Max leaf nodes: 500  		 Mean Absolute Error:  261718
Max leaf nodes: 1000  		 Mean Absolute Error:  262426
Max leaf nodes: 10000  		 Mean Absolute Error:  271996


# 4. Using Random Forest Model

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Same train/validation split as the decision trees above; only the estimator changes.
# fit() returns the fitted estimator itself, so define and fit can be chained.
melbourne_f_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Score the forest on the held-out rows
f_test_predictions = melbourne_f_model.predict(test_X)
f_mae = mean_absolute_error(y_true=test_y, y_pred=f_test_predictions)

print(f_mae)

207190.6873773146
