In [None]:
import itertools

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
#from pylab import rcParams

import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 500)

#from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
#tf.logging.set_verbosity(tf.logging.INFO)
#sess = tf.InteractiveSession()

# Load inputs

In [None]:
# Read the training table, report its size, then discard the 'Id' column
# before previewing the first rows.
X_train = pd.read_csv('train.csv')
print('Shape of the train data:', X_train.shape)
X_train = X_train.drop('Id', axis='columns')
X_train.head()

In [None]:
# Separate the regression target (as a float64 frame) from the predictors.
target_feature = pd.Index(['SalePrice'])
Y_train = X_train.loc[:, target_feature].astype('float64')
X_train = X_train.drop(target_feature, axis='columns')

In [None]:
# Read the test table, report its size, and set the 'Id' column aside
# before removing it from the predictors.
X_test = pd.read_csv('test.csv')
print('Shape of the test data:', X_test.shape)
X_test_id = X_test['Id']
X_test = X_test.drop('Id', axis='columns')
X_test.head()

# Handle missing values

In [None]:
#print(X_train.shape, X_train.isnull().values.sum())
# Split the predictors by dtype and impute missing values:
#   * object (string) columns -> the literal category 'NONE'
#   * every other column      -> 0, then cast to float64
# The string alias 'object' replaces `np.object`, which NumPy deprecated in
# 1.20 and removed in 1.24.
X_train_categorical = X_train.select_dtypes(include='object').fillna('NONE')
X_train_numeric = X_train.select_dtypes(exclude='object').fillna(0).astype('float64')

# Recombine so that categorical columns come first, numeric columns second.
X_train = pd.concat([X_train_categorical, X_train_numeric], axis=1)
cat_features = X_train_categorical.columns
num_features = X_train_numeric.columns

assert not X_train.isnull().values.any(), 'missing values remain after imputation'

In [None]:
# Sanity check: the numeric training columns can be selected from the test
# frame; print the type and shape of both selections.
dummy = X_test.loc[:, X_train_numeric.columns]
for frame in (X_train_numeric, dummy):
    print(type(frame), frame.shape)

In [None]:
# Apply the same imputation to the test set, using the column partition
# derived from the training set so both frames share one layout.
X_test_categorical = X_test[X_train_categorical.columns].fillna('NONE')
# Cast to float64 as well, so test numerics match the dtype of the training numerics.
X_test_numeric = X_test[X_train_numeric.columns].fillna(0).astype('float64')
X_test = pd.concat([X_test_categorical, X_test_numeric], axis=1)

assert not X_test.isnull().values.any(), 'missing values remain after imputation'

# Outliers

In [None]:
def remove_outliers(df_numeric, methods=None, parameters=None):
    '''
    Compute a boolean mask that is True for the rows to keep (inliers).

    https://scikit-learn.org/stable/modules/outlier_detection.html#isolation-forest

    Parameters
    ----------
    df_numeric : pandas.DataFrame or 2-D array-like
        Numeric samples, one row per sample.
    methods : {'IsolationForest', 'normal', None}
        'IsolationForest' fits sklearn's IsolationForest and keeps rows it
        labels +1. 'normal' keeps rows whose per-column absolute z-score is
        below the threshold in every column. Any other value keeps all rows.
    parameters : dict, optional
        Overrides for the chosen method. IsolationForest reads 'max_samples',
        'random_state', 'contamination' and (where supported) 'behaviour';
        'normal' reads 'threshold' (default 2).

    Returns
    -------
    numpy.ndarray
        Boolean array with one entry per row of ``df_numeric``.
    '''
    # A fresh dict per call avoids sharing one mutable default between calls.
    parameters = {} if parameters is None else parameters

    if methods == 'IsolationForest':
        from sklearn.ensemble import IsolationForest
        options = {
            'max_samples': parameters.get('max_samples', 'auto'),
            'random_state': parameters.get('random_state', 42),
            'contamination': parameters.get('contamination', 'auto'),
        }
        # `behaviour` exists only in older scikit-learn releases (removed in
        # 0.24); pass it only when the installed estimator still accepts it.
        if 'behaviour' in IsolationForest().get_params():
            options['behaviour'] = parameters.get('behaviour', 'new')
        clf = IsolationForest(**options)
        return clf.fit_predict(df_numeric) == 1
    elif methods == 'normal':
        from scipy import stats
        values = np.asarray(df_numeric, dtype='float64')
        threshold = parameters.get('threshold', 2)
        # Standardise each column (axis=0), then reduce across columns
        # (axis=1) to obtain one flag per row. Constant columns produce NaN
        # z-scores; NaN compares False, so they never mark a row as an outlier.
        with np.errstate(divide='ignore', invalid='ignore'):
            z_scores = np.abs(stats.zscore(values, axis=0))
            is_outlier = (z_scores >= threshold).any(axis=1)
        return ~is_outlier
    else:
        # No filtering requested: keep every row.
        return np.ones(df_numeric.shape[0], dtype=bool)

In [None]:
# Row count of the training set prior to outlier filtering.
print("Number of samples before removing outliner in train set:", len(X_train))

In [None]:
# Flag inliers using the numeric features, then apply the same row mask to
# every training frame. Indices are renumbered from 0 (drop=True keeps the old
# index from being added as a column) so the frames stay aligned.
isoforest_mask = remove_outliers(X_train_numeric, 'IsolationForest')
X_train = X_train.loc[isoforest_mask].reset_index(drop=True)
X_train_numeric = X_train_numeric.loc[isoforest_mask].reset_index(drop=True)
X_train_categorical = X_train_categorical.loc[isoforest_mask].reset_index(drop=True)
Y_train = Y_train.loc[isoforest_mask].reset_index(drop=True)
print("Number of samples after removing outliner in train set:", X_train.shape[0])

# Scale (normalize or standardize) BOTH features and TARGET

An excellent discussion of data scaling:
https://machinelearningmastery.com/how-to-improve-neural-network-stability-and-modeling-performance-with-data-scaling/
It concludes that in neural-network regression we should scale not only the features but also the target.


The following post also has a nice illustration of the effect of data scaling. However, its conclusion seems WRONG: although it is safe to say "Normalizing the output will not affect shape of 𝑓", a large target y can produce large gradients, so the parameters are updated by large amounts and training may diverge (exploding gradients).
https://stats.stackexchange.com/questions/111467/is-it-necessary-to-scale-the-target-value-in-addition-to-scaling-features-for-re

In [None]:
# Select how numeric features and the target are rescaled.
# 'MinMaxScaler' and 'StandardScaler' are supported; any other value disables scaling.
scaling_type = 'MinMaxScaler' # 'StandardScaler'
if scaling_type == 'MinMaxScaler':
    from sklearn.preprocessing import MinMaxScaler
    input_scaler = MinMaxScaler()
    output_scaler = MinMaxScaler()
elif scaling_type == 'StandardScaler':
    from sklearn.preprocessing import StandardScaler
    input_scaler = StandardScaler()
    output_scaler = StandardScaler()
else: #not scale input/output
    input_scaler = None
    output_scaler = None

if input_scaler is not None:
    # The scaler is fitted on the training rows only and then applied to both sets.
    input_scaler.fit(X_train_numeric)
    # Keep the original row index so the scaled frames stay aligned with the
    # categorical frames when they are concatenated below.
    X_train_numeric = pd.DataFrame(input_scaler.transform(X_train_numeric),
                                   columns=num_features, index=X_train_numeric.index)
    X_test_numeric = pd.DataFrame(input_scaler.transform(X_test_numeric),
                                  columns=num_features, index=X_test_numeric.index)
    # X_train / X_test were assembled before scaling; rebuild them so the
    # scaled numeric columns are the ones later cells actually consume.
    X_train = pd.concat([X_train_categorical, X_train_numeric], axis=1)
    X_test = pd.concat([X_test_categorical, X_test_numeric], axis=1)

if output_scaler is not None:
    # Fit on the training target, then replace it with its scaled version.
    output_scaler.fit(Y_train)
    Y_train = pd.DataFrame(output_scaler.transform(Y_train),
                           columns=target_feature, index=Y_train.index)
    # inverse transform: output = output_scaler.inverse_transform(scaled_output)


# Set-up the network

In [None]:
import tensorflow as tf


In [None]:
# Numeric columns enter the network as real-valued inputs.
tf_features = list(map(tf.contrib.layers.real_valued_column, X_train_numeric.columns))

# Each categorical column is hashed into 1000 buckets and then mapped to a
# 16-dimensional embedding combined with "sum".
tf_features.extend(
    tf.contrib.layers.embedding_column(
        sparse_id_column=tf.contrib.layers.sparse_column_with_hash_bucket(
            categorical_feature, hash_bucket_size=1000),
        dimension=16, combiner="sum")
    for categorical_feature in X_train_categorical.columns)

In [None]:
# split the original training data into train and dev
from sklearn.model_selection import train_test_split
x_train, x_dev, y_train, y_dev = train_test_split(X_train.values, Y_train.values, 
                                                  test_size=0.33, random_state=42)
# train
y_train = pd.DataFrame(y_dev, columns = Y_train.columns)
xy_train = pd.DataFrame(x_train, columns = X_train.columns).merge(y_train, left_index = True, right_index = True)
# dev
y_dev = pd.DataFrame(y_dev, columns = Y_train.columns)
xy_dev = pd.DataFrame(x_dev, columns = X_train.columns).merge(y_dev, left_index = True, right_index = True)

In [None]:
# Disabled snippet kept as a bare string literal: nothing inside it is executed.
# It mentions names (training_set, test, testing_set, FEATURES, FEATURES_CAT)
# that no visible cell of this notebook defines.
'''
# Training for submission
training_sub = training_set[FEATURES + FEATURES_CAT]
testing_sub = test[FEATURES + FEATURES_CAT]

training_set[FEATURES_CAT] = training_set[FEATURES_CAT].applymap(str)
testing_set[FEATURES_CAT] = testing_set[FEATURES_CAT].applymap(str)
'''

In [None]:
def input_fn_new(data_set, training = True):
    '''
    Convert a DataFrame into the tensors expected by the estimator.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Must contain every column named in the notebook-level `num_features`
        and `cat_features`; when `training` is true it must also contain the
        target column named by `target_feature`.
    training : bool
        If true, also return the label tensor.

    Returns
    -------
    dict, or (dict, tf.Tensor)
        Mapping from column name to tensor (dense constants for numeric
        columns, one-value-per-row SparseTensors for categorical columns),
        followed by the label tensor when `training` is true.
    '''
    continuous_cols = {k: tf.constant(data_set[k].values) for k in num_features}

    # Every column has one value per row, so the sparse indices [[row, 0], ...]
    # and the dense shape are identical for all categorical columns.
    n_rows = len(data_set)
    row_indices = [[i, 0] for i in range(n_rows)]
    categorical_cols = {k: tf.SparseTensor(
        indices = row_indices,
        values = data_set[k].values,
        dense_shape = [n_rows, 1]) for k in cat_features}

    # Merges the two dictionaries
    feature_cols = dict(list(continuous_cols.items()) + list(categorical_cols.items()))

    if training:
        # The label is the single target column defined with the training data
        # (the previously referenced name `LABEL` is not defined in this notebook).
        label = tf.constant(data_set[target_feature[0]].values)
        return feature_cols, label

    return feature_cols

# Learn and apply the model

In [None]:
# Model
regressor = tf.contrib.learn.DNNRegressor(feature_columns = tf_features, 
                                          activation_fn = tf.nn.relu, hidden_units=[200, 100, 50, 25, 12])

In [None]:
# learn the network given training data
regressor.fit(input_fn = lambda: input_fn_new(xy_train) , steps=2000)

In [None]:
# Evaluate on the dev split in a single step; labels are needed, hence training=True.
ev = regressor.evaluate(
    input_fn=lambda: input_fn_new(xy_dev, training=True),
    steps=1,
)

In [None]:
# Pull the loss out of the evaluation results and report it.
loss_dev = ev["loss"]
report = "Loss on the dev set: {0:f}".format(loss_dev)
print(report)

In [None]:
# Predict on the held-out dev split: the next cell compares predictions with
# known targets, and only the dev split has them. Features only, so training=False.
yhat_testing = regressor.predict(input_fn=lambda: input_fn_new(xy_dev, training=False))
# `predict` is consumed lazily here; take exactly one prediction per dev row.
predictions = list(itertools.islice(yhat_testing, xy_dev.shape[0]))
# Column vector sized from the data instead of a hard-coded row count.
predictions = np.array(predictions).reshape(-1, 1)
if output_scaler is not None:
    # Map the scaled network output back to the original target units.
    predictions = output_scaler.inverse_transform(predictions)
predictions = pd.DataFrame(predictions, columns=target_feature)

In [None]:
# Apply the style before the figure exists; a style set after creation does
# not restyle axes that were already built.
plt.style.use('ggplot')
# matplotlib is imported as `mpl` in this notebook.
mpl.rc('xtick', labelsize=30)
mpl.rc('ytick', labelsize=30)

# Ground truth for the dev split, mapped back to the original target units so
# it is comparable with the inverse-transformed predictions.
reality = np.asarray(y_dev, dtype='float64').reshape(-1, 1)
if output_scaler is not None:
    reality = output_scaler.inverse_transform(reality)
reality = pd.DataFrame(reality, columns=target_feature)
assert len(reality) == len(predictions), 'predictions and targets must cover the same rows'

fig, ax = plt.subplots(figsize=(50, 40))

ax.plot(predictions.values, reality.values, 'ro')
ax.set_xlabel('Predictions', fontsize = 30)
ax.set_ylabel('Reality', fontsize = 30)
ax.set_title('Predictions x Reality on dataset Test', fontsize = 30)
# Dashed diagonal = perfect prediction.
lowest, highest = float(reality.values.min()), float(reality.values.max())
ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
plt.show()

In [None]:
# Predict on the test-set features prepared earlier (`X_test`); the previously
# referenced `testing_sub` is not defined by any cell of this notebook.
# The test set carries no target column, so only features are requested.
y_predict = regressor.predict(input_fn=lambda: input_fn_new(X_test, training = False))

In [None]:
def to_submit(pred_y, name_out):
    '''
    Write test-set predictions to '<name_out>.csv'.

    Parameters
    ----------
    pred_y : iterable
        Predictions for the rows of `X_test`, in row order (scaled units when
        an output scaler is in use).
    name_out : str
        Output file name without the '.csv' extension.

    The file has an 'Id' column taken from `X_test_id`, followed by the target
    column in its original units.
    '''
    # Take exactly one prediction per test row and shape them as a column vector.
    y_hat = np.array(list(itertools.islice(pred_y, X_test.shape[0]))).reshape(-1, 1)
    if output_scaler is not None:
        y_hat = output_scaler.inverse_transform(y_hat)
    submission = pd.DataFrame(y_hat, columns=target_feature)
    submission.insert(0, 'Id', X_test_id.values)
    submission.to_csv(name_out + '.csv', index=False)


to_submit(y_predict, "submission_cont_categ")

# Conclusion

In [None]:
# Only one model is trained and evaluated in this notebook, so the comparison
# has a single entry: its dev-set loss. (`loss_score1`..`loss_score5` and
# `list_model` are not defined by any cell here.)
list_model = ['DNNRegressor (numeric + categorical)']
list_score = [loss_dev]
assert len(list_model) == len(list_score)

In [None]:
# Horizontal bar chart of the loss per model. rc settings are reset to their
# defaults first, then the 'ggplot' style is applied.
import matplotlib.pyplot as plt; plt.rcdefaults()

plt.style.use('ggplot')
objects, performance = list_model, list_score
y_pos = np.arange(len(objects))

plt.barh(y_pos, performance, align='center', alpha=0.9)
plt.yticks(y_pos, objects)
plt.title('Model compared without hypertuning')
plt.xlabel('Loss ')

plt.show()