# numpy, pandas, sklearn; the bootstrap function np.random.choice with repetition.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def sub_samples(dataset, k, bootstrap = True):
    m, n = dataset.shape
    if bootstrap:
        indexes = np.random.choice(m, k)
        sub_samples = dataset[indexes]
    else:
        indexes = np.random.permutation(m)[:k]
        sub_samples = dataset[indexes]
    return sub_samples

# decision stump function

In [89]:
def split_dataset(dataset, split_feature, split_value):
    right_sub = dataset[np.nonzero(dataset[:, split_feature] > split_value)[0], :]
    left_sub = dataset[np.nonzero(dataset[:, split_feature] < split_value)[0], :]
    return right_sub, left_sub

# variance: the regression score

In [90]:
def regression_error_leaf(sub):
    return np.var(sub[:, -1])*sub.shape[0]

# mean value for each leaf

In [91]:
def leaf_value(sub):
    return np.mean(sub[:, -1])

# select the best feature: select NO. feature_num of features from the total NO. of features; if scores are too close, return no feature but the leaf mean value.

In [92]:
def select_best_feature(dataset, feature_num, key = 'regression'):
    n = dataset.shape[1]
    best_score = inf
    best_feature = 0
    best_value = 0
    indexes = []
    if key == 'regression':
        S = regression_error_leaf(dataset)
    for num in range(feature_num):
        indexes.append(np.random.randint(n - 1))
    for i in indexes:
        for j in set(dataset[:, i]):
            right_sub, left_sub = split_dataset(dataset, i, j)
            if key == 'regression':
                S_temp = regression_error_leaf(right_sub) + regression_error_leaf(left_sub)
            if S_temp < best_score:
                best_score = S_temp
                best_feature = i
                best_value = j
    if (S - best_score) < 0.001 and key == 'regression':
        return leaf_value(dataset), None
    return best_value, best_feature

# create a single tree, with max depth 20

In [93]:
def create_single_tree(dataset, feature_num, key = 'regression', max_depth = 20):
    best_value, best_feature = select_best_feature(dataset, feature_num, key = key)
    if best_feature == None:
        return best_value
    tree = {}
    max_depth -= 1
    if max_depth < 0:
        return leaf_value(dataset)
    right_sub, left_sub = split_dataset(dataset, best_feature, best_value)
    tree['best_value'] = best_value
    tree['best_feature'] = best_feature
    tree['right'] = create_single_tree(right_sub, feature_num, key, max_depth)
    tree['left'] = create_single_tree(left_sub, feature_num, key, max_depth)
    return tree

# create a random forest with n trees, k times of bootstrap, feature_num features.

In [94]:
def randomforest(dataset, n, k, feature_num, key = 'regression', bootstrap = True):
    forest = []
    for i in range(n):
        sub_dataset = sub_samples(dataset, k, bootstrap = bootstrap)
        forest.append(create_single_tree(sub_dataset, feature_num, key = key, max_depth = 20))
    return forest

# predict single tree using single data

In [95]:
def single_tree_predict(data, tree, key = 'regression'):
    if key == 'regression':
        if not isinstance(tree, dict):
            return float(tree)
        if data[tree['best_feature']] > tree['best_value']:
            if type(tree['right']) == 'float':
                return tree['right']
            else:
                return single_tree_predict(data, tree['right'], key)
        else:
            if type(tree['left']) == 'float':
                return tree['left']
            else:
                return single_tree_predict(data, tree['left'], key) 

# predict datasets using single tree; predict datasets using rf.

In [96]:
def single_tree_predict_dataset(dataset, tree, key = 'regression'):
    m, n = dataset.shape
    yhat = np.zeros((m, 1))
    for i in range(m):
        yhat[i, :] = single_tree_predict(dataset[i, :], tree, key = key)
    return yhat

def forest_predict_dataset(dataset, forest, key = 'regression'):
    m, n = dataset.shape
    yhat = np.zeros((m, 1))
    for tree in forest:
        yhat += single_tree_predict_dataset(dataset, tree, key = 'regression')
    yhat /= len(forest)
    return yhat        

# read in the training dataset, split it into training and cross validation.

In [97]:
training = pd.read_csv('D:/Program Files/machine learning/Kaggle Real Projects/house price prediction/train.csv')
features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'SalePrice']
Training = training[features].values
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Training[:, :-1], Training[:, -1], test_size=0.33, random_state=42)
training_dataset = np.concatenate((Xtrain, Ytrain.reshape((-1, 1))), axis = 1)
cv_dataset = np.concatenate((Xtest, Ytest.reshape((-1, 1))), axis = 1)

# create the forsest and get the correct rate on cv dataset.

In [98]:
forest = randomforest(training_dataset, 100, len(training_dataset), 20, key = 'regression', bootstrap = True)
Yhat = forest_predict_dataset(cv_dataset, forest, key = 'regression')
np.sum(abs(Yhat[0] - Ytest) < 30000)/len(Ytest)

0.43775933609958506

# predict the price on testing dataset; write to csv final file.

In [105]:
testing = pd.read_csv('D:/Program Files/machine learning/Kaggle Real Projects/house price prediction/test.csv')
features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt']
Testing = testing[features].values
Ypred = forest_predict_dataset(Testing, forest, key = 'regression')
final = pd.concat([testing['Id'], pd.DataFrame(Ypred)], axis = 1)
final.to_csv('D:/Program Files/machine learning/Kaggle Real Projects/house price prediction/prediction2.csv', index = False)