In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy import stats
from scipy.stats import linregress
np.random.seed(1265)
from functools import reduce
from mpl_toolkits.basemap import Basemap
import geopandas as gpd
from shapely.geometry import Point, MultiPolygon, Polygon
from shapely.ops import unary_union
import os

###Machine Learning
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

##Supervised Learning
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb

##Deep learning session
import tensorflow as tf
from keras.layers import Dropout, BatchNormalization, Activation
from tensorflow.keras.layers import Dense, Input, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adagrad

np.random.seed(123)
tf.random.set_seed(123)

### Prepare the data

In [2]:
string_brackets_columns = ['resoFacts.patioAndPorchFeatures', 'resoFacts.waterSource', 'resoFacts.exteriorFeatures',
                           'resoFacts.interiorFeatures', 'resoFacts.communityFeatures', 'resoFacts.constructionMaterials',
                           'resoFacts.foundationDetails', 'resoFacts.utilities', 'resoFacts.appliances', 
                           'resoFacts.flooring', 'resoFacts.parkingFeatures',
                           'resoFacts.lotFeatures', 'resoFacts.fireplaceFeatures', 'resoFacts.laundryFeatures', 
                           'resoFacts.propertySubType', 'resoFacts.securityFeatures', 
                           'resoFacts.cooling', 'resoFacts.windowFeatures', 'resoFacts.heating', 'resoFacts.sewer', 
                           'resoFacts.poolFeatures']

string_comma_columns = ['resoFacts.architecturalStyle', 
                        'resoFacts.fencing', 'resoFacts.roofType']

t = string_brackets_columns + string_comma_columns
Selected_features = list(set(t) - set(['resoFacts.utilities', 'resoFacts.propertySubType']))

Year_list2022 = ['2022']
Month_list2022 = ['July', 'August', 'September', 'October', 'November', 'December']
Year_list2023 = ['2023']
Month_list2023 = ['January', 'February', 'March']

##This is a dictionary that stores the original data everymonth
original_data = dict()

##This is a dictionary that store the datasets for every month
monthly_data_dict1 = dict()

##This is a dictionary that store datasets to run analysis
monthly_data_dict2 = dict()
path_abs = "C:\\Users\\Khoatruong\\DATA365\\House Capstone Project\\Data cleaning\\Houses_categorical_property\\"

##Remove zipcode, city, county, SchoolDistrict since longitude and latitude will do the part

def read_house_data(Monthlist, Yearlist, original_data, monthly_data_dict1, monthly_data_dict2):
    for Year in Yearlist:
        for Month in Monthlist:
            main_df = pd.read_csv("C:\\Users\\Khoatruong\\DATA365\\House Capstone Project\\Data cleaning\\Clean_data_House_properties\\" + Month + '_' + Year + '\\houses_properties_' + Month + '.csv')
            cols = main_df.columns.tolist()

            ##Rename some columns
            for c in cols:
                if 'resoFacts.' in c:
                    feature = c.split('.')[1]
                    main_df[feature] = main_df[c]
                    main_df.drop(labels = [c], axis = 1, inplace = True)

            ##Drop unimportant features 
            main_df.drop(labels = ['streetAddress', 'countyId', 'hasCooling', 'hasHeating'], axis = 1, inplace = True)
            main_df.reset_index(drop = True, inplace = True)

            ##Join the categorical dataframes
            for category in Selected_features:
                ft = category.split('.')[1]
                name = 'houses' + '_' + ft + '.csv'
                df_cat = pd.read_csv(path_abs + Month + '_' + Year + '\\' + name)
                main_df = pd.merge(main_df, df_cat, how = 'left', on = 'zpid')

            string_list = ['city', 'county', 'SchoolDistrict', 'levels']
            for string in string_list:
                main_df[string] = main_df[string].str.lower()
            original_data[Month + '_' + Year] = main_df

            ##Need the zpid (zillow id) to keep track of the data
            #main_df.drop(labels = ['zpid'], axis = 1, inplace = True)
            monthly_data_dict1[Month + '_' + Year] = main_df

            remove_features = ['zipcode', 'city', 'SchoolDistrict', 'annualHomeownersInsurance']
            main_df2 = main_df.copy()
            main_df2.drop(labels = remove_features, axis = 1, inplace = True)
            monthly_data_dict2[Month + '_' + Year] = main_df2

In [3]:
read_house_data(Month_list2022, Year_list2022, original_data, monthly_data_dict1, monthly_data_dict2)
read_house_data(Month_list2023, Year_list2023, original_data, monthly_data_dict1, monthly_data_dict2)

In [4]:
for Month, df in monthly_data_dict1.items():
    monthly_data_dict1[Month]['zipcode'] = monthly_data_dict1[Month]['zipcode'].astype('int64')

for Month, df in monthly_data_dict1.items():
    monthly_data_dict1[Month]['price/livingsqft'] = monthly_data_dict1[Month]['price'] / monthly_data_dict1[Month]['livingAreaValue']
    monthly_data_dict1[Month]['price/lotsqft'] = monthly_data_dict1[Month]['price'] / monthly_data_dict1[Month]['lotArea']
    monthly_data_dict1[Month]['liv/lot_ratio'] = monthly_data_dict1[Month]['livingAreaValue'] / monthly_data_dict1[Month]['lotArea']
    monthly_data_dict1[Month]['living_price'] = df.apply(lambda row: row['price'] if row['liv/lot_ratio'] >= 1 else row['liv/lot_ratio']*row['price'], axis=1)
    levels_df = pd.get_dummies(monthly_data_dict1[Month]['levels'], prefix = 'levels')
    df = pd.concat([df.copy(), levels_df], axis=1)
    df = df.copy().drop('levels', axis = 1)
    ###Looking at a more dense location
    df = df.copy().drop(df[(df['lotArea'] == 0) | (df['lotArea'] > 100000) | (df['price/lotsqft'] > 1000) 
                           | (df['liv/lot_ratio'] > 2)].index)
    df = df.reset_index(drop = True)
    monthly_data_dict1[Month] = df

In [5]:
##Need to categorize levels
True_False_features = ['isSeniorCommunity', 'hasAssociation', 'hasPrivatePool', 'hasGarage',
                       'hasAttachedGarage', 'hasCarport', 'hasSpa', 'hasFireplace', 'isNewConstruction']

Numerical_features = ['monthlyHoaFee', 'liv/lot_ratio', 'bedrooms', 'carportSpaces', 'garageSpaces', 
                      'coveredSpaces', 'parking', 'bathroomsHalf', 'bathroomsFull', 'fireplaces']

##levels, laundry features, flooring features, construction features, exterior features, interior features, appliances,
##foundations, lotFeatures, securityFeatures, sewer of the house.

Categorical_features = ['levels_one', 'levels_one and one half', 'levels_two', 'levels_three or more',
                        'levels_multi/split', 'laundryFeatures_none', 'laundryFeatures_electricdryerhookup',
                        'laundryFeatures_fullsizew/darea', 'laundryFeatures_washerhookup', 'laundryFeatures_utilityroom',
                        'laundryFeatures_other', 'flooring_carpet', 'flooring_laminate', 'flooring_vinyl',
                        'flooring_ceramictile', 'flooring_hardwood', 'flooring_other', 'flooring_luxuryvinylplank',
                        'flooring_tile', 'flooring_wood', 'constructionMaterials_siding', 'constructionMaterials_wood',
                        'constructionMaterials_brick', 'constructionMaterials_other', 'constructionMaterials_frame',
                        'constructionMaterials_fibercement', 'constructionMaterials_rock/stone', 'exteriorFeatures_coveredpatio/porch',
                        'exteriorFeatures_storage', 'exteriorFeatures_other', 'exteriorFeatures_raingutters',
                        'exteriorFeatures_lighting', 'exteriorFeatures_privateyard', 'interiorFeatures_granitecounters',
                        'interiorFeatures_highspeedinternetavailable', 'interiorFeatures_other', 'interiorFeatures_cabletvavailable',
                        'interiorFeatures_eat-inkitchen', 'interiorFeatures_pantry', 'interiorFeatures_openfloorplan',
                        'interiorFeatures_kitchenisland', 'interiorFeatures_vaultedceiling(s)', 
                        'interiorFeatures_walk-incloset(s)', 'interiorFeatures_built-infeatures', 'interiorFeatures_decorativelighting',
                        'interiorFeatures_smarthomesystem', 'interiorFeatures_doublevanity', 'interiorFeatures_flatscreenwiring',
                        'interiorFeatures_chandelier', 'interiorFeatures_soundsystemwiring', 'interiorFeatures_wetbar',
                        'appliances_electricrange', 'appliances_refrigerator', 'appliances_gasrange', 'appliances_dishwasher',
                        'appliances_gasoven', 'appliances_gaswaterheater', 'appliances_electricoven', 'appliances_microwave',
                        'appliances_electriccooktop', 'appliances_electricwaterheater', 'appliances_ventedexhaustfan',
                        'appliances_disposal', 'appliances_plumbedforgasinkitchen', 'appliances_tanklesswaterheater',
                        'appliances_gascooktop', 'appliances_doubleoven', 'appliances_convectionoven', 'appliances_built-ingasrange', 
                        'appliances_other', 'foundationDetails_pillar/post/pier', 'foundationDetails_slab',
                        'foundationDetails_other', 'lotFeatures_lrg.backyardgrass', 'lotFeatures_acreage',
                        'lotFeatures_cornerlot', 'lotFeatures_fewtrees', 'lotFeatures_none', 'lotFeatures_interiorlot',
                        'lotFeatures_subdivided', 'lotFeatures_cul-de-sac', 'lotFeatures_landscaped',
                        'lotFeatures_sprinklersystem', 'lotFeatures_other', 'lotFeatures_manytrees',
                        'securityFeatures_none', 'securityFeatures_smokedetector(s)', 'securityFeatures_carbonmonoxidedetector(s)',
                        'securityFeatures_firealarm', 'securityFeatures_securitysystem',
                        'securityFeatures_securitysystemowned', 'securityFeatures_burglar', 'securityFeatures_prewired',
                        'securityFeatures_other', 'sewer_citysewer', 'sewer_aerobicseptic', 'sewer_septic',
                        'sewer_other']

## Redesign the location data
redesign the location data due to new parameters

In [6]:
location_cols = ['zpid', 'latitude', 'longitude', 'county', 'SchoolDistrict', 'zipcode', 
                 'liv/lot_ratio', 'price/lotsqft', 'price/livingsqft']
monthly_location2 = dict()
for Month, df in monthly_data_dict1.items():
    ##Store the location data into this dictionary
    df_location = df.copy()[location_cols]
    monthly_location2[Month] = df_location

In [7]:
shape_df2 = gpd.read_file('ALL_US_county/tl_rd22_us_county.shp')
shape_df2['NAME'] = shape_df2['NAME'].str.lower()
shape_df2['INTPTLAT'] = shape_df2['INTPTLAT'].astype('float64')
shape_df2['INTPTLON'] = shape_df2['INTPTLON'].astype('float64')

shape_df3 = gpd.read_file('Texas_SchoolDistrict/tl_rd22_48_unsd.shp')
shape_df3['NAME'] = shape_df3['NAME'].str.lower().replace('independent school district', 'isd', regex = True)
shape_df3['INTPTLAT'] = shape_df3['INTPTLAT'].astype('float64')
shape_df3['INTPTLON'] = shape_df3['INTPTLON'].astype('float64')

shape_df4 = gpd.read_file('ALL_US_zipcode/tl_rd22_us_zcta520.shp')
shape_df4['ZCTA5CE20'] = shape_df4['ZCTA5CE20'].astype('int64')
shape_df4['GEOID20'] = shape_df4['GEOID20'].astype('int64')
shape_df4['INTPTLAT20'] = shape_df4['INTPTLAT20'].astype('float64')
shape_df4['INTPTLON20'] = shape_df4['INTPTLON20'].astype('float64')
shape_df4['INTPTLAT'] = shape_df4['INTPTLAT20']
shape_df4['INTPTLON'] = shape_df4['INTPTLON20']

In [8]:
for Month, df in monthly_location2.items():
    all_counties_list = np.sort(df['county'].unique())
    counties_location_df = shape_df2.copy()
    counties_location_df = counties_location_df[(counties_location_df['NAME'].isin(all_counties_list)) & 
                                                (counties_location_df['GEOID'].str.contains('48'))].reset_index(drop = True)
    counties_location_df['county_lat'] = counties_location_df['INTPTLAT']
    counties_location_df['county_long'] = counties_location_df['INTPTLON']
    counties_location_df['county'] = counties_location_df['NAME']
    
    all_SD_list = np.sort(df['SchoolDistrict'].unique())
    SD_location_df = shape_df3.copy()
    SD_location_df = SD_location_df[SD_location_df['NAME'].isin(all_SD_list) &
                                   (SD_location_df['INTPTLON'] >= -98.5) & 
                                   (SD_location_df['INTPTLON'] <= -95.8) &
                                   (SD_location_df['INTPTLAT'] >= 32) &
                                   (SD_location_df['INTPTLAT'] <= 33.70)].reset_index(drop = True)
    SD_location_df['SD_lat'] = SD_location_df['INTPTLAT']
    SD_location_df['SD_long'] = SD_location_df['INTPTLON']
    SD_location_df['SchoolDistrict'] = SD_location_df['NAME']
    
    all_zipcode_list = np.sort(df['zipcode'].unique())
    zipcode_location_df = shape_df4.copy()
    zipcode_location_df = zipcode_location_df[zipcode_location_df['ZCTA5CE20'].isin(all_zipcode_list) &
                                             (zipcode_location_df['INTPTLON20'] >= -98.5) & 
                                             (zipcode_location_df['INTPTLON20'] <= -95.8) &
                                             (zipcode_location_df['INTPTLAT20'] >= 32) &
                                             (zipcode_location_df['INTPTLAT20'] <= 33.70)].reset_index(drop = True)
    zipcode_location_df['zipcode_lat'] = zipcode_location_df['INTPTLAT']
    zipcode_location_df['zipcode_long'] = zipcode_location_df['INTPTLON']
    zipcode_location_df['zipcode'] = zipcode_location_df['ZCTA5CE20']
    
    merge_df = pd.merge(df, counties_location_df.copy()[['county', 'county_lat', 'county_long']], on=['county'], how='left')
    merge_df = pd.merge(merge_df, SD_location_df.copy()[['SchoolDistrict', 'SD_lat', 'SD_long']], on=['SchoolDistrict'], how='left')
    merge_df = pd.merge(merge_df, zipcode_location_df.copy()[['zipcode', 'zipcode_lat', 'zipcode_long']], on=['zipcode'], how='left')
    
    new_df = merge_df.copy()[['zpid', 'latitude', 'longitude', 'county_lat', 
                              'county_long', 'SD_lat', 'SD_long', 'zipcode_lat', 'zipcode_long', 
                              'price/livingsqft', 'price/lotsqft', 'liv/lot_ratio']]
    
    ###Check for null coords, if they are null, fill them with the coordinates of the houses
    new_df['SD_lat'].fillna(new_df['latitude'], inplace = True)
    new_df['SD_long'].fillna(new_df['longitude'], inplace = True)
    
    new_df['zipcode_lat'].fillna(new_df['latitude'], inplace = True)
    new_df['zipcode_long'].fillna(new_df['longitude'], inplace = True)
    
    new_df['county_lat'].fillna(new_df['latitude'], inplace = True)
    new_df['county_long'].fillna(new_df['longitude'], inplace = True)
    Monthly_path = 'Location_data3/' + Month
    if not os.path.exists(Monthly_path):
        os.makedirs(Monthly_path)
    
    new_df.to_csv(Monthly_path + '/' + Month + '_houses_location.csv', index = False)

In [9]:
###Location df contains more samples since the rule lotArea < 100000 is not applied here
Month_list = ['July_2022', 'August_2022', 'September_2022', 'October_2022', 'November_2022',
              'December_2022', 'January_2023', 'February_2023', 'March_2023']
monthly_location_dict1 = dict()

for month in Month_list:
    ###Read in all the datasets
    Month_path = 'Location_data3/' + month + '/' + month + '_houses_location.csv'
    df_location = pd.read_csv(Month_path)
    monthly_location_dict1[month] = df_location

In [10]:
def transform_data(df, True_False_features, Numerical_features, Categorical_features):
    df_transformed = df.copy()
    for ft in True_False_features:
        df_transformed.loc[df_transformed[ft] == True, ft] = 1
        df_transformed.loc[df_transformed[ft] == False, ft] = 0
        df_transformed[ft] = df_transformed[ft].astype(int)
    return df_transformed[True_False_features + Numerical_features + Categorical_features + ['living_price']]

In [11]:
def prep_combination_data(splits, scaler, original_df, df_model1, df_model2, Numerical_features):
    data1 = df_model1.to_numpy()
    data2 = df_model2.to_numpy()
    cv = KFold(n_splits = splits, shuffle = True, random_state = 0)
    cv.split(original_df)

    train_set_dict1 = dict()
    train_target_dict1 = dict()
    scaled_train_set1 = dict()
    scaled_train_target1 = dict()

    test_set_dict1 = dict()
    test_target_dict1 = dict()
    scaled_test_set1 = dict()
    scaled_test_target1 = dict()
    
    train_set_dict2 = dict()
    train_target_dict2 = dict()
    scaled_train_set2 = dict()
    scaled_train_target2 = dict()

    test_set_dict2 = dict()
    test_target_dict2 = dict()
    scaled_test_set2 = dict()
    scaled_test_target2 = dict()

    ###Create 2 different unique dictionary to track the prediction of real price
    original_train = dict()
    original_test = dict()
    
    ###Store K folds data into dictionary
    for i, (train_index, test_index) in enumerate(cv.split(original_df)):
        ###1 means location data
        train_set1 = df_model1.iloc[train_index].iloc[:, :-1]
        train_target1 = df_model1.iloc[train_index].iloc[:, -1]
        train_set_dict1['Fold ' + str(i)] = train_set1.copy()
        train_target_dict1['Fold ' + str(i)] = train_target1.copy()

        test_set1 = df_model1.iloc[test_index].iloc[:, :-1]
        test_target1 = df_model1.iloc[test_index].iloc[:, -1]
        test_set_dict1['Fold ' + str(i)] = test_set1.copy()
        test_target_dict1['Fold ' + str(i)] = test_target1.copy()
        
        ###2 means properties data
        train_set2 = df_model2.iloc[train_index].iloc[:, :-1]
        train_target2 = df_model2.iloc[train_index].iloc[:, -1]
        train_set_dict2['Fold ' + str(i)] = train_set2.copy()
        train_target_dict2['Fold ' + str(i)] = train_target2.copy()

        test_set2 = df_model2.iloc[test_index].iloc[:, :-1]
        test_target2 = df_model2.iloc[test_index].iloc[:, -1]
        test_set_dict2['Fold ' + str(i)] = test_set2.copy()
        test_target_dict2['Fold ' + str(i)] = test_target2.copy()
        
        ###Keeping the liv/lot_ratio, lot_area, living area
        original_train['Fold ' + str(i)] = original_df.iloc[train_index][['zpid', 'livingAreaValue', 'lotArea', 'liv/lot_ratio']]
        original_test['Fold ' + str(i)] = original_df.iloc[test_index][['zpid', 'livingAreaValue', 'lotArea', 'liv/lot_ratio']]
        
        ###for location data
        the_scaler1 = scaler
        scaled_train_set1['Fold ' + str(i)] = the_scaler1.fit_transform(train_set1)
        scaled_train_target1['Fold ' + str(i)] = np.log(train_target_dict1['Fold ' + str(i)]).to_numpy().reshape(-1, 1)
        scaled_test_set1['Fold ' + str(i)] = the_scaler1.transform(test_set1)
        scaled_test_target1['Fold ' + str(i)] = np.log(test_target_dict1['Fold ' + str(i)]).to_numpy().reshape(-1, 1)
        
        ###for properties data
        the_scaler2 = scaler
        scaled_train2 = train_set2.copy()
        scaled_test2 = test_set2.copy()
        num_transformer = ColumnTransformer(transformers=[('num', MinMaxScaler(), Numerical_features)])
        scaled_train2[Numerical_features] = num_transformer.fit_transform(train_set2.copy()[Numerical_features])
        scaled_test2[Numerical_features] = num_transformer.transform(test_set2.copy()[Numerical_features])
        scaled_train_set2['Fold ' + str(i)] = scaled_train2
        scaled_train_target2['Fold ' + str(i)] = np.log(train_target_dict2['Fold ' + str(i)]).to_numpy().reshape(-1, 1)
        scaled_test_set2['Fold ' + str(i)] = scaled_test2
        scaled_test_target2['Fold ' + str(i)] = np.log(test_target_dict2['Fold ' + str(i)]).to_numpy().reshape(-1, 1)
        
    return (train_set_dict1, train_target_dict1, scaled_train_set1, scaled_train_target1,
            test_set_dict1, test_target_dict1, scaled_test_set1, scaled_test_target1,
            train_set_dict2, train_target_dict2, scaled_train_set2, scaled_train_target2,
            test_set_dict2, test_target_dict2, scaled_test_set2, scaled_test_target2, original_train, original_test)

## Try this on February dataset

In [12]:
df_feb2023 = monthly_data_dict1['February_2023']
original_df = df_feb2023.copy()
df_model1 = monthly_location_dict1['February_2023'].copy()[['latitude', 'longitude', 'county_lat', 'county_long', 
                                                            'liv/lot_ratio', 'SD_lat', 'SD_long', 'zipcode_lat', 
                                                            'zipcode_long', 'price/lotsqft']]
df_model2 = transform_data(df_feb2023.copy(), True_False_features, Numerical_features, Categorical_features)

splits = 5
scaler = MinMaxScaler()
all_dicts = prep_combination_data(splits, scaler, original_df, df_model1, df_model2, Numerical_features)
train_set_dict1 = all_dicts[0]
train_target_dict1 = all_dicts[1]
scaled_train_set1 = all_dicts[2]
scaled_train_target1 = all_dicts[3]
test_set_dict1 = all_dicts[4]
test_target_dict1 = all_dicts[5]
scaled_test_set1 = all_dicts[6]
scaled_test_target1 = all_dicts[7]

train_set_dict2 = all_dicts[8]
train_target_dict2 = all_dicts[9]
scaled_train_set2 = all_dicts[10]
scaled_train_target2 = all_dicts[11]
test_set_dict2 = all_dicts[12]
test_target_dict2 = all_dicts[13]
scaled_test_set2 = all_dicts[14]
scaled_test_target2 = all_dicts[15]

original_train = all_dicts[16]
original_test = all_dicts[17]

#### Location prediction

In [13]:
def ann_location_generator(input_shape):
    T = input_shape
    i = Input(shape = [T,])
    x = Dense(2000)(i)
    x = Activation('relu')(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
    output = Dense(1)(x)
    model = Model(inputs = i, outputs = output)
    
    return model

In [14]:
Test_R2_list = list()
Test_adjusted_R2_list = list()
Train_R2_list = list()
rmse_list = list()
mae_list = list()

np.random.seed(123)
tf.random.set_seed(123)

input_shape = train_set_dict1['Fold 0'].shape[1]
initial_learning_rate = 0.001
decay_steps = 990
decay_rate = 0.85
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate, decay_steps, 
                                                             decay_rate, staircase=True)
Adam = tf.keras.optimizers.Adam(learning_rate = lr_schedule)
model1 = ann_location_generator(input_shape)
model1.compile(loss = 'mean_squared_error', optimizer = Adam, metrics = ['mse'])

for i in range(5):
    train_set1 = scaled_train_set1['Fold ' + str(i)]
    train_target1 = scaled_train_target1['Fold ' + str(i)]
    test_set1 = scaled_test_set1['Fold ' + str(i)]
    test_target1 = scaled_test_target1['Fold ' + str(i)]

    check_point = ModelCheckpoint('Location_ANN_vars/model1_adam_ann_' + 'Fold ' + str(i) + '.h5', 
                                   monitor = 'val_mse', save_best_only = True)
    model1.fit(train_set1, train_target1, epochs = 80, validation_data = (test_set1, test_target1), 
               batch_size = 32, callbacks = [check_point], verbose = 0)
    model_allvars = tf.keras.models.load_model('Location_ANN_vars/model1_adam_ann_' + 'Fold ' + str(i) + '.h5')
    train_observed = train_target_dict1['Fold ' + str(i)].to_numpy().reshape(-1, 1)
    train_predicted = np.exp(model_allvars.predict(train_set1))
    original_train['Fold ' + str(i)]['predicted_price/lotsqft'] = train_predicted
    original_train['Fold ' + str(i)]['actual_price/lotsqft'] = train_observed
    original_train['Fold ' + str(i)]['actual_price'] =  original_train['Fold ' + str(i)]['lotArea'] * original_train['Fold ' + str(i)]['actual_price/lotsqft']
    original_train['Fold ' + str(i)]['predicted_price1'] = original_train['Fold ' + str(i)]['lotArea'] * original_train['Fold ' + str(i)]['predicted_price/lotsqft']
    train_r_squared = r2_score(original_train['Fold ' + str(i)]['actual_price'], original_train['Fold ' + str(i)]['predicted_price1'])
    
    ###For the test set
    test_observed = test_target_dict1['Fold ' + str(i)].to_numpy().reshape(-1, 1)
    test_predicted = np.exp(model_allvars.predict(test_set1))
    original_test['Fold ' + str(i)]['predicted_price/lotsqft'] = test_predicted
    original_test['Fold ' + str(i)]['actual_price/lotsqft'] = test_observed
    original_test['Fold ' + str(i)]['actual_price'] =  original_test['Fold ' + str(i)]['lotArea'] * original_test['Fold ' + str(i)]['actual_price/lotsqft']
    original_test['Fold ' + str(i)]['predicted_price1'] = original_test['Fold ' + str(i)]['lotArea'] * original_test['Fold ' + str(i)]['predicted_price/lotsqft']    
    test_r_squared = r2_score(original_test['Fold ' + str(i)]['actual_price'], original_test['Fold ' + str(i)]['predicted_price1'])
    
    r2 = test_r_squared
    n = test_observed.shape[0]
    p = train_set1.shape[1]
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
    mse = mean_squared_error(original_test['Fold ' + str(i)]['actual_price'], original_test['Fold ' + str(i)]['predicted_price1'])
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(original_test['Fold ' + str(i)]['actual_price'], original_test['Fold ' + str(i)]['predicted_price1'])

    Test_R2_list.append(r2)
    Test_adjusted_R2_list.append(adjusted_r2)
    Train_R2_list.append(train_r_squared)
    rmse_list.append(rmse)
    mae_list.append(mae)
    
    print("Fold " + str(i + 1) + " Test R-squared score:", np.round(r2, 4))
    print("Fold " + str(i + 1) + " Test Adjusted R-squared score:", np.round(adjusted_r2, 4))
    print("Fold " + str(i + 1) + " RMSE:", np.round(rmse, 4))
    print("Fold " + str(i + 1) + " MAE:", np.round(mae, 4))
    print()
    
dif = np.round(sum(Train_R2_list)/len(Train_R2_list), 4) - np.round(sum(Test_R2_list)/len(Test_R2_list), 4)
dif = np.round(dif * 100, 2)
print("The average Train R-squared : ", np.round(sum(Train_R2_list)/len(Train_R2_list), 4))
print("The average Test R-squared : ", np.round(sum(Test_R2_list)/len(Test_R2_list), 4))
print("The average Test adjusted R-squared: ", np.round(sum(Test_adjusted_R2_list)/len(Test_adjusted_R2_list), 4))
print("The average deviation from actual value (RMSE): ", np.round(sum(rmse_list)/len(rmse_list), 4))
print("The average deviation from actual value (MAE): ", np.round(sum(mae_list)/len(mae_list), 4))
print("Difference between Train vs. Test R-squared: ", dif, "%")

Fold 1 Test R-squared score: 0.8031
Fold 1 Test Adjusted R-squared score: 0.8022
Fold 1 RMSE: 119494.2698
Fold 1 MAE: 70722.5553

Fold 2 Test R-squared score: 0.8228
Fold 2 Test Adjusted R-squared score: 0.8221
Fold 2 RMSE: 112191.4445
Fold 2 MAE: 66855.8954

Fold 3 Test R-squared score: 0.8397
Fold 3 Test Adjusted R-squared score: 0.839
Fold 3 RMSE: 109303.4241
Fold 3 MAE: 64832.9568

Fold 4 Test R-squared score: 0.8339
Fold 4 Test Adjusted R-squared score: 0.8332
Fold 4 RMSE: 109745.4366
Fold 4 MAE: 65940.5357

Fold 5 Test R-squared score: 0.8028
Fold 5 Test Adjusted R-squared score: 0.802
Fold 5 RMSE: 123023.6377
Fold 5 MAE: 71196.3075

The average Train R-squared :  0.8194
The average Test R-squared :  0.8205
The average Test adjusted R-squared:  0.8197
The average deviation from actual value (RMSE):  114751.6425
The average deviation from actual value (MAE):  67909.6501
Difference between Train vs. Test R-squared:  -0.11 %


#### Properties prediction

In [16]:
def ann_properties_generator(input_shape):
    T = input_shape
    i = Input(shape = [T,])
    x = Dense(80)(i)
    x = Activation('relu')(x)
    x = Dense(60)(x)
    x = Activation('relu')(x)
    x = Dense(20)(x)
    x = Activation('relu')(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
    output = Dense(1)(x)
    model = Model(inputs = i, outputs = output)
    
    return model

In [17]:
Test_R2_list = list()
Test_adjusted_R2_list = list()
Train_R2_list = list()
rmse_list = list()
mae_list = list()

np.random.seed(123)
tf.random.set_seed(123)

input_shape = train_set_dict2['Fold 0'].shape[1]
model2 = ann_properties_generator(input_shape)
model2.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['mse'])

for i in range(5):
    train_set2 = scaled_train_set2['Fold ' + str(i)]
    train_target2 = scaled_train_target2['Fold ' + str(i)]
    test_set2 = scaled_test_set2['Fold ' + str(i)]
    test_target2 = scaled_test_target2['Fold ' + str(i)]

    check_point = ModelCheckpoint('Properties_ANN_vars/model2_adam_ann_' + 'Fold ' + str(i) + '.h5', 
                                   monitor = 'val_mse', save_best_only = True)
    model2.fit(train_set2, train_target2, epochs = 80, validation_data = (test_set2, test_target2), 
               batch_size = 32, callbacks = [check_point], verbose = 0)
    model_allvars = tf.keras.models.load_model('Properties_ANN_vars/model2_adam_ann_' + 'Fold ' + str(i) + '.h5')
    train_observed = train_target_dict2['Fold ' + str(i)].to_numpy().reshape(-1, 1)
    train_predicted = np.exp(model_allvars.predict(train_set2))
    
    ###We have calculated the actual price already with lot * price/lot
    original_train['Fold ' + str(i)]['actual_living_price'] = train_observed
    original_train['Fold ' + str(i)]['predicted_living_price'] = train_predicted
    original_train['Fold ' + str(i)]['lot/liv_ratio'] = original_train['Fold ' + str(i)]['liv/lot_ratio'].apply(lambda x: 1/x)
    original_train['Fold ' + str(i)]['predicted_price2'] = original_train['Fold ' + str(i)].apply(lambda row: row['predicted_living_price'] if row['liv/lot_ratio'] >= 1 
                                                                                                                                            else row['lot/liv_ratio'] * row['predicted_living_price'], axis=1)
    train_r_squared = r2_score(original_train['Fold ' + str(i)]['actual_price'], original_train['Fold ' + str(i)]['predicted_price2'])
    
    ###For the test set
    test_observed = test_target_dict2['Fold ' + str(i)].to_numpy().reshape(-1, 1)
    test_predicted = np.exp(model_allvars.predict(test_set2))
    original_test['Fold ' + str(i)]['actual_living_price'] = test_observed
    original_test['Fold ' + str(i)]['predicted_living_price'] = test_predicted
    original_test['Fold ' + str(i)]['lot/liv_ratio'] = original_test['Fold ' + str(i)]['liv/lot_ratio'].apply(lambda x: 1/x)
    original_test['Fold ' + str(i)]['predicted_price2'] = original_test['Fold ' + str(i)].apply(lambda row: row['predicted_living_price'] if row['liv/lot_ratio'] >= 1 
                                                                                                                                          else row['lot/liv_ratio'] * row['predicted_living_price'], axis=1)    
    test_r_squared = r2_score(original_test['Fold ' + str(i)]['actual_price'], original_test['Fold ' + str(i)]['predicted_price2'])
    
    r2 = test_r_squared
    n = test_observed.shape[0]
    p = train_set2.shape[1]
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
    mse = mean_squared_error(original_test['Fold ' + str(i)]['actual_price'], original_test['Fold ' + str(i)]['predicted_price2'])
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(original_test['Fold ' + str(i)]['actual_price'], original_test['Fold ' + str(i)]['predicted_price2'])

    Test_R2_list.append(r2)
    Test_adjusted_R2_list.append(adjusted_r2)
    Train_R2_list.append(train_r_squared)
    rmse_list.append(rmse)
    mae_list.append(mae)
    
    print("Fold " + str(i + 1) + " Test R-squared score:", np.round(r2, 4))
    print("Fold " + str(i + 1) + " Test Adjusted R-squared score:", np.round(adjusted_r2, 4))
    print("Fold " + str(i + 1) + " RMSE:", np.round(rmse, 4))
    print("Fold " + str(i + 1) + " MAE:", np.round(mae, 4))
    print()
    
dif = np.round(sum(Train_R2_list)/len(Train_R2_list), 4) - np.round(sum(Test_R2_list)/len(Test_R2_list), 4)
dif = np.round(dif * 100, 2)
print("The average Train R-squared : ", np.round(sum(Train_R2_list)/len(Train_R2_list), 4))
print("The average Test R-squared : ", np.round(sum(Test_R2_list)/len(Test_R2_list), 4))
print("The average Test adjusted R-squared: ", np.round(sum(Test_adjusted_R2_list)/len(Test_adjusted_R2_list), 4))
print("The average deviation from actual value (RMSE): ", np.round(sum(rmse_list)/len(rmse_list), 4))
print("The average deviation from actual value (MAE): ", np.round(sum(mae_list)/len(mae_list), 4))
print("Difference between Train vs. Test R-squared: ", dif, "%")

Fold 1 Test R-squared score: 0.5652
Fold 1 Test Adjusted R-squared score: 0.5395
Fold 1 RMSE: 177571.4983
Fold 1 MAE: 107376.6296

Fold 2 Test R-squared score: 0.8183
Fold 2 Test Adjusted R-squared score: 0.8075
Fold 2 RMSE: 113634.0453
Fold 2 MAE: 71772.0958

Fold 3 Test R-squared score: 0.8674
Fold 3 Test Adjusted R-squared score: 0.8595
Fold 3 RMSE: 99417.2532
Fold 3 MAE: 67334.0395

Fold 4 Test R-squared score: 0.9124
Fold 4 Test Adjusted R-squared score: 0.9072
Fold 4 RMSE: 79704.8538
Fold 4 MAE: 55698.3833

Fold 5 Test R-squared score: 0.9249
Fold 5 Test Adjusted R-squared score: 0.9205
Fold 5 RMSE: 75904.1603
Fold 5 MAE: 51268.9058

The average Train R-squared :  0.8918
The average Test R-squared :  0.8176
The average Test adjusted R-squared:  0.8069
The average deviation from actual value (RMSE):  109246.3622
The average deviation from actual value (MAE):  70690.0108
Difference between Train vs. Test R-squared:  7.42 %


In [18]:
n_folds = 5
for i in range(n_folds):
    path_train = 'Cross_val_df/Train/'
    path_test = 'Cross_val_df/Test/'
    original_train['Fold ' + str(i)].to_csv(path_train + 'original_train_result_Fold '+ str(i) + '.csv', index = True)
    original_test['Fold ' + str(i)].to_csv(path_test + 'original_test_result_Fold ' + str(i) + '.csv', index = True)

In [19]:
original_train = dict()
original_test = dict()

for i in range(5):
    original_train['Fold ' + str(i)] = pd.read_csv('Cross_val_df/Train/original_train_result_Fold '+ str(i) + '.csv', index_col = 0)
    original_test['Fold ' + str(i)] = pd.read_csv('Cross_val_df/Test/original_test_result_Fold '+ str(i) + '.csv', index_col = 0)

##### Price prediction 

In [20]:
def price_ml(model, original_train, original_test, verbose = 0):
    Test_R2_list = list()
    Test_adjusted_R2_list = list()
    Train_R2_list = list()
    rmse_list = list()
    mae_list = list()
    
    columns = ['livingAreaValue', 'lotArea', 'liv/lot_ratio', 'predicted_price/lotsqft', 'predicted_living_price',
               'predicted_price1', 'predicted_price2']
    for i in range(5):
        train_set = original_train['Fold ' + str(i)].copy()[columns].to_numpy()
        train_target = original_train['Fold ' + str(i)].copy()['actual_price'].to_numpy()
        test_set = original_test['Fold ' + str(i)].copy()[columns].to_numpy()
        test_target = original_test['Fold ' + str(i)].copy()['actual_price'].to_numpy()
        model.fit(train_set, train_target)
        
        train_predicted = model.predict(train_set)
        original_train['Fold ' + str(i)]['predicted_price_comb'] = train_predicted
        train_r_squared = model.score(train_set, train_target)
        
        test_observed = test_target
        test_predicted = model.predict(test_set)
        original_test['Fold ' + str(i)]['predicted_price_comb'] = test_predicted
        r2 = r2_score(test_observed, test_predicted)
        n = test_observed.shape[0]
        p = train_set.shape[1]
        adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
        mse = mean_squared_error(test_observed, test_predicted)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(test_observed, test_predicted)

        Test_R2_list.append(r2)
        Test_adjusted_R2_list.append(adjusted_r2)
        Train_R2_list.append(train_r_squared)
        rmse_list.append(rmse)
        mae_list.append(mae)
        
        if verbose == 1:
            print("Fold " + str(i + 1) + " Test R-squared score:", np.round(r2, 4))
            print("Fold " + str(i + 1) + " Test Adjusted R-squared score:", np.round(adjusted_r2, 4))
            print("Fold " + str(i + 1) + " RMSE:", np.round(rmse, 4))
            print("Fold " + str(i + 1) + " MAE:", np.round(mae, 4))
            print()
    dif = np.round(sum(Train_R2_list)/len(Train_R2_list), 4) - np.round(sum(Test_R2_list)/len(Test_R2_list), 4)
    dif = np.round(dif * 100, 2)
    if verbose == 1:
        print("The average Train R-squared : ", np.round(sum(Train_R2_list)/len(Train_R2_list), 4))
        print("The average Test R-squared : ", np.round(sum(Test_R2_list)/len(Test_R2_list), 4))
        print("The average Test adjusted R-squared: ", np.round(sum(Test_adjusted_R2_list)/len(Test_adjusted_R2_list), 4))
        print("The average deviation from actual value (RMSE): ", np.round(sum(rmse_list)/len(rmse_list), 4))
        print("The average deviation from actual value (MAE): ", np.round(sum(mae_list)/len(mae_list), 4))
        print("Difference between Train vs. Test R-squared: ", dif, "%")
    avg_Train_R2 = np.round(sum(Train_R2_list)/len(Train_R2_list), 4)
    avg_Test_R2 = np.round(sum(Test_R2_list)/len(Test_R2_list), 4)
    avg_Test_adjusted_R2 = np.round(sum(Test_adjusted_R2_list)/len(Test_adjusted_R2_list), 4)
    avg_Test_RMSE = np.round(sum(rmse_list)/len(rmse_list), 4)
    avg_Test_MAE = np.round(sum(mae_list)/len(mae_list), 4)
    return (avg_Train_R2, avg_Test_R2, avg_Test_adjusted_R2, avg_Test_RMSE, avg_Test_MAE, dif)

In [21]:
hgbr = HistGradientBoostingRegressor()
_ = price_ml(hgbr, original_train, original_test, verbose = 1)

Fold 1 Test R-squared score: 0.7719
Fold 1 Test Adjusted R-squared score: 0.7712
Fold 1 RMSE: 128603.0007
Fold 1 MAE: 74207.3023

Fold 2 Test R-squared score: 0.9102
Fold 2 Test Adjusted R-squared score: 0.9099
Fold 2 RMSE: 79865.955
Fold 2 MAE: 50231.9413

Fold 3 Test R-squared score: 0.933
Fold 3 Test Adjusted R-squared score: 0.9328
Fold 3 RMSE: 70643.9851
Fold 3 MAE: 45052.2246

Fold 4 Test R-squared score: 0.9572
Fold 4 Test Adjusted R-squared score: 0.9571
Fold 4 RMSE: 55698.3013
Fold 4 MAE: 36052.894

Fold 5 Test R-squared score: 0.9623
Fold 5 Test Adjusted R-squared score: 0.9622
Fold 5 RMSE: 53766.8087
Fold 5 MAE: 34021.8296

The average Train R-squared :  0.9706
The average Test R-squared :  0.9069
The average Test adjusted R-squared:  0.9066
The average deviation from actual value (RMSE):  77715.6102
The average deviation from actual value (MAE):  47913.2384
Difference between Train vs. Test R-squared:  6.37 %


### Tuning the hgbr model

In [47]:
hgbr = HistGradientBoostingRegressor(learning_rate = 0.1, l2_regularization = 180, max_iter = 130)
_ = price_ml(hgbr, original_train, original_test, verbose = 1)

Fold 1 Test R-squared score: 0.7868
Fold 1 Test Adjusted R-squared score: 0.786
Fold 1 RMSE: 124353.574
Fold 1 MAE: 72669.4375

Fold 2 Test R-squared score: 0.9111
Fold 2 Test Adjusted R-squared score: 0.9108
Fold 2 RMSE: 79484.2227
Fold 2 MAE: 49865.6236

Fold 3 Test R-squared score: 0.9341
Fold 3 Test Adjusted R-squared score: 0.9338
Fold 3 RMSE: 70098.5645
Fold 3 MAE: 44797.5931

Fold 4 Test R-squared score: 0.9609
Fold 4 Test Adjusted R-squared score: 0.9608
Fold 4 RMSE: 53261.4921
Fold 4 MAE: 35737.6749

Fold 5 Test R-squared score: 0.9627
Fold 5 Test Adjusted R-squared score: 0.9626
Fold 5 RMSE: 53491.3803
Fold 5 MAE: 33675.7578

The average Train R-squared :  0.9585
The average Test R-squared :  0.9111
The average Test adjusted R-squared:  0.9108
The average deviation from actual value (RMSE):  76137.8467
The average deviation from actual value (MAE):  47349.2174
Difference between Train vs. Test R-squared:  4.74 %


### How about using an ann model for prediction?

In [62]:
def ann_price_generator(input_shape):
    T = input_shape
    i = Input(shape = [T,])
    x = Dense(400)(i)
    x = Activation('relu')(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dropout(0.25)(x)
    x = Dense(2000)(x)
    x = Activation('relu')(x)
    x = Dropout(0.25)(x)
    x = Dense(400)(x)
    x = Activation('relu')(x)
    output = Dense(1)(x)
    model = Model(inputs = i, outputs = output)
    
    return model

In [63]:
###Transform the dataset into the range 0-1 nad price to np.log(price)
columns = ['livingAreaValue', 'lotArea', 'liv/lot_ratio', 'predicted_price/lotsqft', 'predicted_living_price',
           'predicted_price1', 'predicted_price2']
target = ['actual_price']

scaler = MinMaxScaler()
scaled_train_price = dict()
scaled_train_predict = dict()
scaled_test_price = dict()
scaled_test_predict = dict()

n_folds = 5
for i in range(n_folds):
    train_set = original_train['Fold ' + str(i)][columns].copy().to_numpy()
    train_predict = original_train['Fold ' + str(i)][target].copy().to_numpy()
    test_set = original_test['Fold ' + str(i)][columns].copy().to_numpy()
    test_predict = original_test['Fold ' + str(i)][target].copy().to_numpy()
    
    the_scaler1 = scaler
    scaled_train_price['Fold ' + str(i)] = the_scaler1.fit_transform(train_set)
    scaled_train_predict['Fold ' + str(i)] = np.log(train_predict).reshape(-1, 1)
    scaled_test_price['Fold ' + str(i)] = the_scaler1.transform(test_set)
    scaled_test_predict['Fold ' + str(i)] = np.log(test_predict).reshape(-1, 1)

In [64]:
Test_R2_list = list()
Test_adjusted_R2_list = list()
Train_R2_list = list()
rmse_list = list()
mae_list = list()

np.random.seed(123)
tf.random.set_seed(123)

input_shape = original_train['Fold 0'][columns].shape[1]
model_price = ann_price_generator(input_shape)
model_price.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['mse'])

for i in range(5):
    train = scaled_train_price['Fold ' + str(i)]
    target_train = scaled_train_predict['Fold ' + str(i)]
    test = scaled_test_price['Fold ' + str(i)]
    target_test = scaled_test_predict['Fold ' + str(i)]

    check_point = ModelCheckpoint('Monthly_combination_ANN/ANN_price/model_price_adam_ann_' + 'Fold ' + str(i) + '.h5', 
                                   monitor = 'val_mse', save_best_only = True)
    model_price.fit(train, target_train, epochs = 80, validation_data = (test, target_test), 
                    batch_size = 32, callbacks = [check_point], verbose = 0)
    model_allvars = tf.keras.models.load_model('Monthly_combination_ANN/ANN_price/model_price_adam_ann_' + 'Fold ' + str(i) + '.h5')
    train_observed = original_train['Fold ' + str(i)][target].to_numpy().reshape(-1, 1)
    train_predicted = np.exp(model_allvars.predict(train))
    
    ###We have calculated the actual price already with lot * price/lot
    train_r_squared = r2_score(train_observed, train_predicted)
    
    ###For the test set
    test_observed = original_test['Fold ' + str(i)][target].to_numpy().reshape(-1, 1)
    test_predicted = np.exp(model_allvars.predict(test))   
    test_r_squared = r2_score(test_observed, test_predicted)
    
    r2 = test_r_squared
    n = test_observed.shape[0]
    p = train_set2.shape[1]
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
    mse = mean_squared_error(test_observed, test_predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test_observed, test_predicted)

    Test_R2_list.append(r2)
    Test_adjusted_R2_list.append(adjusted_r2)
    Train_R2_list.append(train_r_squared)
    rmse_list.append(rmse)
    mae_list.append(mae)
    
    print("Fold " + str(i + 1) + " Test R-squared score:", np.round(r2, 4))
    print("Fold " + str(i + 1) + " Test Adjusted R-squared score:", np.round(adjusted_r2, 4))
    print("Fold " + str(i + 1) + " RMSE:", np.round(rmse, 4))
    print("Fold " + str(i + 1) + " MAE:", np.round(mae, 4))
    print()
    
dif = np.round(sum(Train_R2_list)/len(Train_R2_list), 4) - np.round(sum(Test_R2_list)/len(Test_R2_list), 4)
dif = np.round(dif * 100, 2)
print("The average Train R-squared : ", np.round(sum(Train_R2_list)/len(Train_R2_list), 4))
print("The average Test R-squared : ", np.round(sum(Test_R2_list)/len(Test_R2_list), 4))
print("The average Test adjusted R-squared: ", np.round(sum(Test_adjusted_R2_list)/len(Test_adjusted_R2_list), 4))
print("The average deviation from actual value (RMSE): ", np.round(sum(rmse_list)/len(rmse_list), 4))
print("The average deviation from actual value (MAE): ", np.round(sum(mae_list)/len(mae_list), 4))
print("Difference between Train vs. Test R-squared: ", dif, "%")

Fold 1 Test R-squared score: 0.796
Fold 1 Test Adjusted R-squared score: 0.784
Fold 1 RMSE: 121627.8765
Fold 1 MAE: 71353.1529

Fold 2 Test R-squared score: 0.914
Fold 2 Test Adjusted R-squared score: 0.909
Fold 2 RMSE: 78155.8432
Fold 2 MAE: 48984.1934

Fold 3 Test R-squared score: 0.9369
Fold 3 Test Adjusted R-squared score: 0.9331
Fold 3 RMSE: 68597.086
Fold 3 MAE: 43601.4252

Fold 4 Test R-squared score: 0.9623
Fold 4 Test Adjusted R-squared score: 0.9601
Fold 4 RMSE: 52284.1369
Fold 4 MAE: 34558.841

Fold 5 Test R-squared score: 0.9649
Fold 5 Test Adjusted R-squared score: 0.9628
Fold 5 RMSE: 51928.9208
Fold 5 MAE: 33018.9664

The average Train R-squared :  0.9479
The average Test R-squared :  0.9148
The average Test adjusted R-squared:  0.9098
The average deviation from actual value (RMSE):  74518.7727
The average deviation from actual value (MAE):  46303.3158
Difference between Train vs. Test R-squared:  3.31 %


# We will stick this architecture to predict house price
1. Prep the data for DL
2. Run ANN_location on location data to generate predicted price/lotArea
3. Run ANN_properties on properties data to generate predicted living_price
4. Run ANN_price to predict the price of the house with 2 predictions above