<a href="https://colab.research.google.com/github/kenscwong/Property_DecisionTree/blob/master/HKProperty_DecisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
#Data Source- https://www.housingauthority.gov.hk/en/home-ownership/hos-secondary-market/transaction-records/index.html#
#Model Code Reference - https://www.python-course.eu/Regression_Trees.php

"""
Make the imports of python packages needed
"""
import pandas as pd
import numpy as np
import math
from pprint import pprint


import matplotlib.pyplot as plt
from matplotlib import style
style.use("fivethirtyeight")
#Import the dataset and define the feature and target columns#
dataset = pd.read_csv("data.csv",usecols=['Location','Court','Size','Floor','A_S','Price_per_size']).sample(frac=1)
mean_data = np.mean(dataset.iloc[:,-1])
###########################################################################################################
###########################################################################################################
"""
Calculate the varaince of a dataset
This function takes three arguments.
1. data = The dataset for whose feature the variance should be calculated
2. split_attribute_name = the name of the feature for which the weighted variance should be calculated
3. target_name = the name of the target feature. The default for this example is "Price_per_size"
"""    
def var(data,split_attribute_name,target_name="Price_per_size"):
    
    feature_values = np.unique(data[split_attribute_name])
    feature_variance = 0
    for value in feature_values:
        #Create the data subsets --> Split the original data along the values of the split_attribute_name feature
        # and reset the index to not run into an error while using the df.loc[] operation below
        subset = data.query('{0}=={1}'.format(split_attribute_name,value)).reset_index()
        #Calculate the weighted variance of each subset            
        value_var = (len(subset)/len(data))*np.var(subset[target_name],ddof=1)
        #Calculate the weighted variance of the feature
        feature_variance+=value_var
    return feature_variance
    
###########################################################################################################
###########################################################################################################
def Classification(data,originaldata,features,min_instances,target_attribute_name,parent_node_class = None):
    """
    Classification Algorithm: This function takes the same 5 parameters as the original classification algorithm in the
    previous chapter plus one parameter (min_instances) which defines the number of minimal instances
    per node as early stopping criterion.
    """   
    #Define the stopping criteria --> If one of this is satisfied, we want to return a leaf node#
    
    #########This criterion is new########################
    #If all target_values have the same value, return the mean value of the target feature for this dataset
    if len(data) <= int(min_instances):
        return np.mean(data[target_attribute_name])
    #######################################################
    
    #If the dataset is empty, return the mean target feature value in the original dataset
    elif len(data)==0:
        return np.mean(originaldata[target_attribute_name])
    
    #If the feature space is empty, return the mean target feature value of the direct parent node --> Note that
    #the direct parent node is that node which has called the current run of the algorithm and hence
    #the mean target feature value is stored in the parent_node_class variable.
    
    elif len(features) ==0:
        return parent_node_class
    
    #If none of the above holds true, grow the tree!
    
    else:
        #Set the default value for this node --> The mean target feature value of the current node
        parent_node_class = np.mean(data[target_attribute_name])
        #Select the feature which best splits the dataset
        item_values = [var(data,feature) for feature in features] #Return the variance for features in the dataset
        best_feature_index = np.argmin(item_values)
        best_feature = features[best_feature_index]
        
        #Create the tree structure. The root gets the name of the feature (best_feature) with the minimum variance.
        tree = {best_feature:{}}
        
        
        #Remove the feature with the lowest variance from the feature space
        features = [i for i in features if i != best_feature]
        
        #Grow a branch under the root node for meach possible value of the root node feature
        
        for value in np.unique(data[best_feature]):
            value = value
            #Split the dataset along the value of the feature with the lowest variance and therewith create sub_datasets
            sub_data = data.where(data[best_feature] == value).dropna()
            
            #Call the Calssification algorithm for each of those sub_datasets with the new parameters --> Here the recursion comes in!
            subtree = Classification(sub_data,originaldata,features,min_instances,'Price_per_size',parent_node_class = parent_node_class)
            
            #Add the sub tree, grown from the sub_dataset to the tree under the root node
            tree[best_feature][value] = subtree
            
        return tree   
    
    
###########################################################################################################
###########################################################################################################
 
"""
Predict query instances
"""
    
def predict(query,tree,default = mean_data):
    for key in list(query.keys()):
        if key in list(tree.keys()):
            try:
                result = tree[key][query[key]] 
            except:
                return default
            if isinstance(result,dict):
                return predict(query,result)
            else:
                return result
        
###########################################################################################################
###########################################################################################################
"""
Create a training as well as a testing set
"""
def train_test_split(dataset):
    training_data = dataset.iloc[:int(0.7*len(dataset))].reset_index(drop=True)#We drop the index respectively relabel the index
    #starting form 0, because we do not want to run into errors regarding the row labels / indexes
    testing_data = dataset.iloc[int(0.7*len(dataset)):].reset_index(drop=True)
    return training_data,testing_data
training_data = train_test_split(dataset)[0]
testing_data = train_test_split(dataset)[1] 
###########################################################################################################
###########################################################################################################
"""
Compute the RMSE 
"""
def test(data,tree):
    #Create new query instances by simply removing the target feature column from the original dataset and 
    #convert it to a dictionary
    queries = data.iloc[:,:-1].to_dict(orient = "records")
    
    #Create a empty DataFrame in whose columns the prediction of the tree are stored
    predicted = []
    #Calculate the RMSE
    for i in range(len(data)):
        predicted.append(predict(queries[i],tree,mean_data)) 
    RMSE = np.sqrt(np.sum(((data.iloc[:,-1]-predicted)**2)/len(data)))
    return RMSE
###########################################################################################################
###########################################################################################################  
    
"""
Train the tree, Print the tree and predict the accuracy
"""





tree = Classification(training_data,training_data,training_data.columns[:-1],10,'Price_per_size')
pprint(tree)
print('#'*50)
print('Root mean square error (RMSE): ',test(testing_data,tree))





{'Location': {1: {'Size': {'300': 6925.0,
                           '400': {'Court': {87.0: 7607.888888888889,
                                             150.0: 6889.2}},
                           '500': {'Court': {4.0: 12317.6,
                                             87.0: 8085.666666666667,
                                             115.0: 10327.0,
                                             150.0: 6598.0}},
                           '600': 10718.857142857143}},
              2: 9025.0,
              3: {'Size': {'400': 6867.666666666667,
                           '500': {'Court': {7.0: 8680.6,
                                             28.0: 8134.5,
                                             39.0: 9671.0,
                                             40.0: 8330.666666666666,
                                             61.0: 8782.0,
                                             89.0: 9105.25,
                                             140.0: 6722.666666666667}},
  

# Make Decision

In [0]:
###########################################################################################################
########################################################################################################### 


def roundup(x):
     return int(math.ceil(x / 100.0)) * 100


def decide(location, court, size, floor, agent, your_price, tree, mean_data):
  
  if floor < 14: 
    floor = 1
    s_floor = 'L-1'
  elif floor >=27: 
    floor = 3 
    s_floor = 'H-3'
  else: 
    floor = 2
    s_floor = 'M-2'
  
  size = roundup(size)
  s_size = str(size)
  
  print ('Location       : {:d}'.format(location));
  print ('Court          : {:d}'.format(court));
  print ('Size           : {:d}-{:d}'.format(size-99, size));
  print ('floor          : {:s}'.format(s_floor));
  print ('Agent          : {:d}'.format(agent));
  print ('' );
  
  predict_data = pd.DataFrame(columns = ['Location','Court','Size','Floor','A_S'])
  predict_data.loc[0] = [location, court, s_size, floor, agent]
  
  queries1 = predict_data.to_dict(orient = "records")

  predict_price= predict(queries1[0],tree,mean_data)
 
  print('Your price     : {:f}'.format(your_price))
  print('Estimated price: {:f}'.format(predict_price))
    
  if (predict_price >= your_price):
    decison = 'YES'
  else:
    decison = 'NO'
   
 
  return decison
###########################################################################################################
########################################################################################################### 





# Case 1

In [19]:
#	Location : Ma On Shan - 20
#	Court : Kam Tai Court - 64
#	Size :  650
#	Floor: 18/F
#	Agent(1) or Self Negotiation(2) : Agent - 1

#	To compare Your Price per sqft: 10000

decide(20,64,650,18,1,9700, tree, mean_data)

Location       : 20
Court          : 64
Size           : 601-700
floor          : M-2
Agent          : 1

Your price     : 9700.000000
Estimated price: 9818.333333


'YES'

# Case 2

In [20]:
#	Location : Ma On Shan -20
#	Court : Kam Fung Court - 62
#	Size :  530
#	Floor: 9/F
#	Agent(1) or Self Negotiation(2) : Agent - 1

# To Compare Your Price per sqft: 8000

decide(20,62,530,9,1,8000, tree, mean_data)

Location       : 20
Court          : 62
Size           : 501-600
floor          : L-1
Agent          : 1

Your price     : 8000.000000
Estimated price: 8409.750000


'YES'