In [1]:
# Import pandas and load the Melbourne housing data into a DataFrame.
# The CSV path is relative, so 'melb_data.csv' must be in the working directory.
import pandas as pd 
df = pd.read_csv('melb_data.csv')

In [2]:
# Pick the prediction target, then inspect the available columns.
# The target is assigned first so that `df.columns` is the cell's last
# expression and is therefore actually displayed by the notebook.
y = df.Price
df.columns

In [3]:
# Columns used as model inputs (spelled exactly as they appear in the data).
feature_models = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

In [4]:
# Select the columns listed in feature_models as the feature matrix X
X = df[feature_models]

In [5]:
# Preview the first 5 rows. A bare last expression uses the notebook's rich
# DataFrame display instead of the plain-text print() output.
X.head(5)

   Rooms  Bathroom  Landsize  Lattitude  Longtitude
0      2       1.0     202.0   -37.7996    144.9984
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
3      3       2.0      94.0   -37.7969    144.9969
4      4       1.0     120.0   -37.8072    144.9941


In [6]:
# Build a decision-tree regressor with scikit-learn and fit it on the full
# feature matrix X and target y (no hold-out split at this point).
# random_state=1 pins the estimator's seed.
from sklearn.tree import DecisionTreeRegressor
melbourne_model = DecisionTreeRegressor(random_state=1)
melbourne_model.fit(X,y)

DecisionTreeRegressor(random_state=1)

In [7]:
# Predict prices for the same rows the model was fitted on (in-sample predictions)
predictions = melbourne_model.predict(X)
print(predictions)

[1480000. 1035000. 1465000. ... 1170000. 2500000. 1285000.]


In [8]:
# Show the first 10 actual prices from the target series y for comparison
y.head(10)

0    1480000.0
1    1035000.0
2    1465000.0
3     850000.0
4    1600000.0
5     941000.0
6    1876000.0
7    1636000.0
8     300000.0
9    1097000.0
Name: Price, dtype: float64

In [9]:
# Compare predictions and actual prices for the first five homes.
# These are in-sample predictions (the model was fitted on these same rows),
# so agreement here says little about accuracy on unseen data.
print("First in-sample predictions:", melbourne_model.predict(X.head()))
print("Actual target values for those homes:", y.head().tolist())

First in-sample predictions: [1480000. 1035000. 1465000.  850000. 1600000.]
Actual target values for those homes: [1480000.0, 1035000.0, 1465000.0, 850000.0, 1600000.0]


In [10]:
# Split features and target into training and validation sets.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state=1 pins the shuffle seed.
from sklearn.model_selection import train_test_split
train_X,val_X, train_y, val_y = train_test_split(X,y, random_state=1)

In [11]:
# Re-create the model and fit it on the training split only.
# Note: this rebinds melbourne_model, replacing the earlier model fitted on all of X, y.
melbourne_model = DecisionTreeRegressor(random_state=1)
melbourne_model.fit(train_X, train_y)

DecisionTreeRegressor(random_state=1)

In [12]:
# Predict prices for every row of the validation split
val_predictions = melbourne_model.predict(val_X)

In [13]:
# Print the first five validation predictions
print(val_predictions[:5])
# Print the first five actual prices from the validation split for comparison
print(val_y.head())

[2070000.  733000. 2810000.  820000. 2130000.]
321      1640000.0
4003      675000.0
13348    2800000.0
2697      615000.0
12600    2700000.0
Name: Price, dtype: float64


In [14]:
# Compute the mean absolute error (MAE) of the predictions on the validation split
from sklearn.metrics import mean_absolute_error
val_mae = mean_absolute_error(val_y, val_predictions)
# Print the validation MAE
print(val_mae)

241632.16966126655


In [15]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y, random_state=0):
    """Return the validation MAE of a decision tree capped at ``max_leaf_nodes`` leaves.

    A fresh ``DecisionTreeRegressor`` is fitted on the training data and then
    scored on the validation data.

    Parameters
    ----------
    max_leaf_nodes : value passed to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
    train_X, train_y : features and target the tree is fitted on.
    val_X, val_y : features and target the fitted tree is evaluated on.
    random_state : seed forwarded to the estimator. Defaults to 0, the value
        that used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    The mean absolute error between ``val_y`` and the tree's predictions for ``val_X``.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=random_state)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)

In [16]:
# Score every candidate tree size on the validation split and keep the size
# with the lowest MAE.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
scores = dict(
    zip(
        candidate_max_leaf_nodes,
        (get_mae(n_leaves, train_X, val_X, train_y, val_y) for n_leaves in candidate_max_leaf_nodes),
    )
)
best_tree_size = min(scores.items(), key=lambda entry: entry[1])[0]
print(best_tree_size)

500


In [17]:
# Build the final tree with the best max_leaf_nodes found above and fit it on
# all of the data (X, y) rather than only the training split.
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
final_model.fit(X,y)

DecisionTreeRegressor(max_leaf_nodes=500, random_state=1)

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
# Create a random forest regressor with otherwise default settings; random_state=1 pins the seed
rf_model = RandomForestRegressor(random_state=1)

In [20]:
# Fit the random forest on the training split
rf_model.fit(train_X, train_y)

RandomForestRegressor(random_state=1)

In [21]:
# Score the random forest on the validation split.
# mean_absolute_error expects (y_true, y_pred); the arguments are passed in that
# order to match the other MAE computations in this notebook.
rf_pred = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, rf_pred)
print('Validation MAE for Random Forest Model: {}'.format(rf_val_mae))

Validation MAE for Random Forest Model: 180544.06532524488


In [22]:
# Fit a second random forest on all of the data (X, y); this is the model
# used for the test-set predictions further down.
rf_model_on_full_data = RandomForestRegressor(random_state=1)
rf_model_on_full_data.fit(X,y)

RandomForestRegressor(random_state=1)

In [23]:
# Load the test data and drop incomplete rows in a single, non-mutating step.
# The subset covers every column the model consumes (Rooms, Bathroom, Landsize,
# Lattitude, Longtitude) plus YearBuilt and Price, which were already filtered on,
# so no NaN can reach predict().
test_data = pd.read_csv('Melbourne_housing_FULL.csv').dropna(
    subset=['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'YearBuilt', 'Price']
)
# Remaining missing values per column
test_data.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car               154
Landsize            0
BuildingArea     1593
YearBuilt           0
CouncilArea         0
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [24]:
# Use exactly the feature columns the models were trained on, so the test
# matrix cannot drift from the training matrix, then list the columns
# available in the test data.
features = list(feature_models)
test_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [25]:
# Select the feature columns from the test data
test_X = test_data[features]

In [26]:
# Predict prices for the test rows with the forest fitted on all of the data
test_preds = rf_model_on_full_data.predict(test_X)

In [27]:
# Write the predictions to submission.csv as a single 'SalePrice' column, without the index.
# NOTE(review): no row identifier is written, so predictions can only be matched
# back to test_data by position — confirm this is the intended format.
output = pd.DataFrame({'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)