In [1]:
# Standard library
import os
from datetime import datetime

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from jupyterthemes import jtplot

# Notebook-wide configuration: plot theme, TensorFlow log level, and the
# float format pandas uses when displaying frames.
jtplot.style()
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.float_format = '{:,.2f}'.format

# Show which input files are available in the local ./csv directory.
print(os.listdir("./csv"))

['test.csv', 'train.csv', 'sample_submission.csv']


In [2]:
# Based on the Kaggle competition "House Prices: Advanced Regression Techniques":
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [3]:
# Read the training and test tables from the local ./csv directory.
trainData, testData = (
    pd.read_csv(csv_path, sep = ',')
    for csv_path in ("./csv/train.csv", "./csv/test.csv")
)

In [4]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# training frame. Columns whose count is below the row count contain missing
# values.
trainData.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.9,70.05,10516.83,6.1,5.58,1971.27,1984.87,103.69,443.64,...,94.24,46.66,21.95,3.41,15.06,2.76,43.49,6.32,2007.82,180921.2
std,421.61,42.3,24.28,9981.26,1.38,1.11,30.2,20.65,181.07,456.1,...,125.34,66.26,61.12,29.32,55.76,40.18,496.12,2.7,1.33,79442.5
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [6]:
def preprocessData(dataSet, test = False):
    """Return a copy of `dataSet` restricted to the columns the model uses.

    Args:
        dataSet: DataFrame containing at least the eight feature columns
            listed in the body, plus 'SalePrice' unless `test` is true.
        test: When true, the 'SalePrice' target column is left out of the
            result (use this for data that has no target).

    Returns:
        A new DataFrame on the same index as `dataSet`, holding the feature
        columns in the order listed below, followed by 'SalePrice' when
        `test` is false.

    Raises:
        KeyError: If a required column is missing from `dataSet`.
    """
    columns = [
        'OverallQual',
        'GrLivArea',
        'TotalBsmtSF',
        'GarageCars',
        'PoolArea',
        'YearBuilt',
        'FullBath',
        'TotRmsAbvGrd',
    ]
    if not test:
        columns.append('SalePrice')
    # Explicit copy so later edits to the result never touch `dataSet`.
    return dataSet[columns].copy()

In [7]:
# Keep only the modelling columns, then put the rows into a random order by
# reindexing on a permutation of the existing index labels.
newTrainData = preprocessData(trainData).pipe(
    lambda frame: frame.reindex(np.random.permutation(frame.index))
)

In [8]:
# Sample 80% of the rows (without replacement) for training.
training_set = newTrainData.sample(frac = 0.8, replace = False)
# The validation set is every row that was not sampled. Dropping the sampled
# labels replaces the earlier `.loc[set(...) - set(...)]` lookup, because
# recent pandas releases no longer accept a Python set as an indexer.
validate_set = newTrainData.drop(training_set.index)

In [9]:
def SelectedFeatures():
    """Build the list of numeric feature columns passed to the estimator.

    Returns:
        A list with one `tf.feature_column.numeric_column` per feature name,
        in the order the names are listed below.
    """
    numeric_names = (
        'OverallQual',
        'GrLivArea',
        'TotalBsmtSF',
        'GarageCars',
        'PoolArea',
        'YearBuilt',
        'FullBath',
        'TotRmsAbvGrd',
    )
    return [
        tf.feature_column.numeric_column(column_name)
        for column_name in numeric_names
    ]

In [16]:
def training(training_set, validate_set, periods, steps, name):
    """Train a DNNRegressor on the selected features and return it.

    Training is split into `periods` rounds of `steps // periods` steps each.
    After every round the model is evaluated once over `validate_set`.

    Args:
        training_set: DataFrame with a 'SalePrice' target column; all other
            columns are passed to the estimator as features.
        validate_set: DataFrame with the same layout, used only for
            evaluation.
        periods: Number of train/evaluate rounds.
        steps: Total number of training steps across all rounds. Any
            remainder of `steps // periods` is not trained.
        name: Directory passed to the estimator as `model_dir`.

    Returns:
        The trained `tf.estimator.DNNRegressor`.
    """
    my_optimizer = tf.train.ProximalAdagradOptimizer(
      learning_rate = 0.003,
      l1_regularization_strength = 0.001
    )
    estimator = tf.estimator.DNNRegressor(
        feature_columns = SelectedFeatures(),
        hidden_units = [1024, 128, 32],
        optimizer = my_optimizer,
        model_dir = name
    )

    training_target = training_set['SalePrice']
    training_features = training_set.drop('SalePrice', axis = 1)
    validate_target = validate_set['SalePrice']
    validate_features = validate_set.drop('SalePrice', axis = 1)

    # Repeats indefinitely (num_epochs = None); the `steps` argument of
    # estimator.train decides when each round stops.
    train_fn = tf.estimator.inputs.pandas_input_fn(
        x = training_features,
        y = training_target,
        batch_size = 30,
        num_epochs = None,
        shuffle = True,
    )
    # Exactly one pass over the validation rows; shuffling is not needed
    # because evaluation does not update the model.
    evalu_fn = tf.estimator.inputs.pandas_input_fn(
        x = validate_features,
        y = validate_target,
        batch_size = 1,
        num_epochs = 1,
        shuffle = False,
    )

    # Integer division: the step count handed to the estimator must be a
    # whole number, and `/` would produce a float on Python 3.
    step_period = steps // periods
    print ("Period: ", end = ' ')
    for period in range(periods):
        # Train first so a checkpoint exists before anything is evaluated.
        estimator.train(input_fn = train_fn, steps = step_period)
        # Evaluate on the held-out rows. The returned metrics are not used
        # here. NOTE(review): the estimator presumably also records them
        # under model_dir - confirm against the TensorFlow docs.
        estimator.evaluate(input_fn = evalu_fn)
        print (period + 1, end = ' ')
    print ("\nDone !")
    return estimator

In [17]:
# Train on the 80% split in 10 periods covering 10000 steps in total, with
# the model directory fixed to ./DNNRegressor/Naive_model.
# NOTE(review): with a fixed model_dir, re-running this cell presumably
# continues from checkpoints already stored there - confirm. The commented-out
# suffix on the `name` line would give each run its own time-stamped directory.
estimator = training(
    training_set = training_set,
    validate_set = validate_set,
    periods = 10,
    steps = 10000,
    name = "./DNNRegressor/Naive_model" # + datetime.now().strftime("%H_%M")
)

Period:  1 2 3 4 5 6 7 8 9 10 
Done !


In [12]:
# Reduce the test data to the model's feature columns (no 'SalePrice').
testSet = preprocessData(testData, test = True)
# Feed the test rows once, one row per batch, in their original order.
predict_fn = tf.estimator.inputs.pandas_input_fn(
    x = testSet,
    y = None,
    batch_size = 1,
    num_epochs = 1,
    shuffle = False,
)
# NOTE(review): `prediction` is never consumed in this cell, and
# Estimator.predict presumably returns a lazy iterable, so no predictions are
# actually computed here - confirm against the TensorFlow docs.
prediction = estimator.predict(input_fn = predict_fn)
# Disabled export code, kept for reference.
# NOTE(review): the last disabled line reads item['predictions'][0], which
# suggests each predicted item is a dict; if so, the zip below would pair each
# Id with the whole dict rather than a price - verify before re-enabling.
# # output = pd.DataFrame(
# #     [(Id, SalePrice) for Id, SalePrice in zip(testData['Id'], prediction)],
# #     columns = ['Id', 'SalePrice']
# # )
# # output.to_csv("./output_0.csv")
# # print(np.array([item['predictions'][0] for item in prediction]))