In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import dataset
data = pd.read_csv("housing.csv")

# shuffle dataset
data = data.sample(n=len(data))

# calculate ln of certain variables
data['total_rooms']= np.log(data['total_rooms']+1)
data['total_bedrooms']= np.log(data['total_bedrooms']+1)
data['population']= np.log(data['population']+1)
data['households']= np.log(data['households']+1)

# scale data using standard scaler
from sklearn.preprocessing import StandardScaler
data = pd.DataFrame(data)
columns_to_exclude = ['ocean_proximity']
scaler = StandardScaler()
scaled_data = data.copy()
scaled_data.loc[:, data.columns.difference(columns_to_exclude)] = scaler.fit_transform(data.loc[:, data.columns.difference(columns_to_exclude)])

# one hot vector encoding
scaled_data=scaled_data.join(pd.get_dummies(scaled_data.ocean_proximity)).drop(['ocean_proximity'], axis=1)

# fill missing values with median
scaled_data['total_bedrooms'] = scaled_data['total_bedrooms'].fillna(scaled_data['total_bedrooms'].median())


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# build the neural network model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
model = Sequential()



# kfold cross validation(10 folds)
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# mae and mse
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae_train_list, mse_train_list = [], []
mae_test_list, mse_test_list = [], []

# X y split
X = scaled_data.drop('median_house_value', axis=1)
y = scaled_data['median_house_value']

count = 1

# loop through each fold
for train_index, test_index in kf.split(X):
    print("Fold", count, ":")
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # convert bool values to float64
    X_train, X_test = X_train.astype('float64'), X_test.astype('float64')
    
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


    
    # create model layers
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))
    # compile the model
    model.compile(optimizer='adam', loss='mean_absolute_error')
    # train the model
    model.fit(X_train, y_train, epochs=2, batch_size=32, validation_split=0.1, verbose=1)


    
    # predict on the training set
    predictions_train = model.predict(X_train.values)
    # calculate mean absolute error
    mae_train = mean_absolute_error(y_train, predictions_train)
    print("Mean Absolute Error for TRAIN set:", mae_train)
    mae_train_list.append(mae_train)
    # calculate mean squared error
    mse_train = mean_squared_error(y_train, predictions_train)
    print("Mean Sqared Error for TRAIN set:", mse_train)
    mse_train_list.append(mse_train)
    # predict on the test set

    predictions_test = model.predict(X_test.values)
    # calculate mean absolute error
    mae_test = mean_absolute_error(y_test, predictions_test)
    print("Mean Absolute Error for TEST set:", mae_test)
    mae_test_list.append(mae_test)
    # calculate mean squared error
    mse_test = mean_squared_error(y_test, predictions_test)
    print("Mean Sqared Error for TEST set:", mse_test, "\n")
    mse_test_list.append(mse_test)
    
    count += 1

# print average values
print("Average MAE for TRAIN set:", np.mean(mae_train_list))
print("Average MSE for TRAIN set:", np.mean(mse_train_list))
print("Average MAE for TEST set:", np.mean(mae_test_list))
print("Average MSE for TEST set:", np.mean(mse_test_list))



Fold 1 :

Epoch 1/2

Epoch 2/2
Mean Absolute Error for TRAIN set: 0.3440267086993762
Mean Sqared Error for TRAIN set: 0.2700615113097561
Mean Absolute Error for TEST set: 0.33621228971201444
Mean Sqared Error for TEST set: 0.2515668261014695 

Fold 2 :
Epoch 1/2
Epoch 2/2
Mean Absolute Error for TRAIN set: 0.33173152109941867
Mean Sqared Error for TRAIN set: 0.24255549932780746
Mean Absolute Error for TEST set: 0.34924920098837536
Mean Sqared Error for TEST set: 0.25613842705092066 

Fold 3 :
Epoch 1/2
Epoch 2/2
Mean Absolute Error for TRAIN set: 0.3314987766123484
Mean Sqared Error for TRAIN set: 0.2589490553200706
Mean Absolute Error for TEST set: 0.3296317617340684
Mean Sqared Error for TEST set: 0.26490648302425013 

Fold 4 :
Epoch 1/2
Epoch 2/2
Mean Absolute Error for TRAIN set: 0.31115348585417907
Mean Sqared Error for TRAIN set: 0.22537691801556625
Mean Absolute Error for TEST set: 0.31297818454455295
Mean Sqared Error for TEST set: 0.23012722376250358 

Fold 5 :
Epoch 1/2
Epo