## Housing Price Predictor
- Utilizes a dual model approach to predict the pricing of housing. 
- Created using data from H. Ahmed E. and Moustafa M. (2016). House Price Estimation from Visual and Textual Features. In Proceedings of the 8th International Joint Conference on Computational Intelligence (IJCCI 2016), ISBN 978-989-758-201-1, pages 62-68. DOI: 10.5220/0006040700620068


## Helper Methods
- Tools we may use later for data pre-processing and graphing

In [21]:
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import seaborn as sns
import os
import pandas as pd
import matplotlib.pyplot as plt

import csv
import numpy as np
%matplotlib inline

import cv2
import glob
from IPython.display import Image


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """One-hot encode column `name` of `df` in place.

    For every column produced by ``pd.get_dummies`` a new column named
    "<name>-<value>" is added to `df`; the original column is then dropped.
    Nothing is returned.
    """
    one_hot = pd.get_dummies(df[name])
    for category in one_hot.columns:
        df["{}-{}".format(name, category)] = one_hot[category]
    df.drop(columns=name, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    """Replace column `name` of `df` in place with label-encoded values.

    A fresh ``preprocessing.LabelEncoder`` is fitted on the column and its
    transformed output overwrites the column. The encoder's ``classes_``
    attribute is returned.
    """
    encoder = preprocessing.LabelEncoder()
    encoded = encoder.fit_transform(df[name])
    df[name] = encoded
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardize column `name` of `df` in place as (value - mean) / sd.

    `mean` and `sd` default to the column's own ``mean()`` and ``std()``
    when not supplied. Nothing is returned.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill missing entries of column `name` in `df` with the column median.

    The column is reassigned on `df`; nothing is returned.
    """
    df[name] = df[name].fillna(df[name].median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill missing entries of column `name` in `df` with `default_value`.

    The column is reassigned on `df`; nothing is returned.
    """
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split `df` into a feature matrix and a target array, both float32.

    Every column other than `target` becomes a feature. When the target
    dtype is int64 or int32 the target is one-hot encoded via
    ``pd.get_dummies`` (classification); otherwise the raw target values
    are returned (regression).

    Returns:
        A tuple ``(x, y)`` of float32 NumPy arrays.
    """
    feature_cols = [col for col in df.columns if col != target]

    # Find out the type of the target column.
    target_type = df[target].dtypes
    if isinstance(target_type, Sequence):
        target_type = target_type[0]

    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        labels = pd.get_dummies(df[target])  # classification
    else:
        labels = df[target]  # regression
    return df[feature_cols].values.astype(np.float32), labels.values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration in seconds as "H:MM:SS.ss".

    Hours are not padded, minutes are zero-padded to two digits, and seconds
    are shown with two decimals, zero-padded to a width of five.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)


# Regression chart.
def chart_regression(pred, y, sort=True):
    """Plot expected versus predicted values as two line series.

    Args:
        pred: Predicted values; any array-like. It is flattened to 1-D, so
            a column vector of shape (n, 1) is accepted as well as (n,).
        y: Expected values; any array-like, flattened to 1-D.
        sort: When True, rows are ordered by expected value before plotting.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Flatten both inputs: DataFrame columns must be 1-D, and flattening via
    # np.asarray also accepts lists, not just ndarrays.
    t = pd.DataFrame({
        'pred': np.asarray(pred).flatten(),
        'y': np.asarray(y).flatten(),
    })
    if sort:
        t.sort_values(by=['y'], inplace=True)
    plt.plot(t['y'].tolist(), label='expected')
    plt.plot(t['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is at least sd standard deviations from its mean
def remove_outliers(df, name, sd):
    """Drop, in place, the rows of `df` whose `name` value is an outlier.

    A row is dropped when the absolute distance of its value from the column
    mean is greater than or equal to `sd` times the column's ``std()``.
    Nothing is returned.
    """
    deviation = np.abs(df[name] - df[name].mean())
    threshold = sd * df[name].std()
    outlier_labels = df.index[deviation >= threshold]
    df.drop(outlier_labels, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column `name` of `df` in place.

    Values are mapped so that `data_low` becomes `normalized_low` and
    `data_high` becomes `normalized_high`.

    Args:
        df: DataFrame that is modified in place.
        name: Column to rescale.
        normalized_low: Lower end of the output range.
        normalized_high: Upper end of the output range.
        data_low: Input value mapped to `normalized_low`; defaults to the
            column minimum.
        data_high: Input value mapped to `normalized_high`; defaults to the
            column maximum.

    Note:
        If `data_low` equals `data_high` (e.g. a constant column) the
        division below is by zero and the result is not finite.
    """
    # Resolve each bound independently so that supplying only one of them
    # neither leaves the other as None nor overwrites the supplied value.
    # Series.min()/max() skip missing values.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


## Data Extraction
- Code that loads the dataset's tabular (text) data into a DataFrame

In [None]:
#import statements
import pandas as pd 
import numpy as np 
import tensorflow as tf 
from pathlib import Path
import os
import sys
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt 


# Reads the text file data into a DataFrame (essentially a table).
# NOTE(review): this path spells the folder "Houses Dataset" while the image
# cells use "Houses dataset"; that only matters on a case-sensitive
# filesystem — confirm the actual folder name.
TXT_DATASET_LOCATION = "./Houses Dataset/HousesInfo.txt"

cols=["Bedrooms","Bathrooms","area","zipcode","price"]

# The file is read as space-separated with no header row, so the column names
# are supplied explicitly. The path must come from TXT_DATASET_LOCATION, the
# constant defined above.
df = pd.read_csv(TXT_DATASET_LOCATION, sep=" ", header=None, names=cols)


# Target column.
y = df["price"]

# retrieve 

#df.head()

#

# How do I read from a text file

(535, 5)

In [None]:
#retrieval of images into arrays. 

## Image Handling
- Next, we collect the four images of each house (bathroom, bedroom, frontal, kitchen) and combine them into a single image per house

In [28]:
## Bathroom.jpg
# Folder that holds the per-house image files named "<number>_<room>.jpg".
HOUSE_DATASET_LOCATION = "./Houses dataset/"

# Collect the bathroom image path for house numbers 1..535, in numeric order.
# Only paths that exist as regular files are kept.
new_images = []
for number in range(1, 536):
    for path in glob.glob(HOUSE_DATASET_LOCATION + str(number) + "_bathroom.jpg"):
        if os.path.isfile(path):
            new_images.append(path)

# creates variable to store all the images into appropriate columns
img = pd.DataFrame(new_images, columns=['bathroom_img'])

In [31]:
## bedroom, frontal and kitchen images
def _collect_room_images(room):
    """Return the existing "<number>_<room>.jpg" paths for houses 1..535.

    Paths are looked up under HOUSE_DATASET_LOCATION and returned in
    house-number order; entries that are not regular files are skipped.
    """
    paths = []
    for number in range(1, 536):
        pattern = HOUSE_DATASET_LOCATION + str(number) + "_" + room + ".jpg"
        paths.extend(path for path in glob.glob(pattern) if os.path.isfile(path))
    return paths


bedroom_images = _collect_room_images("bedroom")
frontal_images = _collect_room_images("frontal")
kitchen_images = _collect_room_images("kitchen")

# Each list is assigned positionally as a new column of `img`.
# NOTE(review): rows only line up per house if no image file is missing for
# any house and room — confirm against the dataset.
img['bedroom_img'] = bedroom_images
img['frontal_img'] = frontal_images
img['kitchen_img'] = kitchen_images
img.head()

Unnamed: 0,bathroom_img,bedroom_img,frontal_img,kitchen_img
0,./Houses dataset/1_bathroom.jpg,./Houses dataset/1_bedroom.jpg,./Houses dataset/1_frontal.jpg,./Houses dataset/1_kitchen.jpg
1,./Houses dataset/2_bathroom.jpg,./Houses dataset/2_bedroom.jpg,./Houses dataset/2_frontal.jpg,./Houses dataset/2_kitchen.jpg
2,./Houses dataset/3_bathroom.jpg,./Houses dataset/3_bedroom.jpg,./Houses dataset/3_frontal.jpg,./Houses dataset/3_kitchen.jpg
3,./Houses dataset/4_bathroom.jpg,./Houses dataset/4_bedroom.jpg,./Houses dataset/4_frontal.jpg,./Houses dataset/4_kitchen.jpg
4,./Houses dataset/5_bathroom.jpg,./Houses dataset/5_bedroom.jpg,./Houses dataset/5_frontal.jpg,./Houses dataset/5_kitchen.jpg


In [None]:
# Code which concatenates houses images into one image for each house
# For every row of `img`, the four room images are read, resized to 64x64 and
# tiled into one 128x128, 3-channel uint8 array (`outputImage`).
images_output=[]
for row_index,row in img.iterrows():
            inputImages=[]
            # Blank 128x128 canvas that receives the four 64x64 tiles.
            outputImage = np.zeros((128, 128, 3), dtype="uint8")
            # NOTE(review): cv2.imread presumably yields None for a missing or
            # unreadable file, which cv2.resize would then reject — confirm
            # and guard if needed.
            image_temp1 = cv2.imread(row.bathroom_img)
            image1 = cv2.resize(image_temp1, (64 , 64))
            
            image_temp2 = cv2.imread(row.bedroom_img)
            image2 = cv2.resize(image_temp2, (64 , 64))
            
            image_temp3 = cv2.imread(row.frontal_img)
            image3 = cv2.resize(image_temp3, (64 , 64))
            
            image_temp4 = cv2.imread(row.kitchen_img)
            image4 = cv2.resize(image_temp4, (64 , 64))
              
            # Order in inputImages: bathroom, bedroom, frontal, kitchen.
            inputImages.append(image1)
            inputImages.append(image2)
            inputImages.append(image3)
            inputImages.append(image4)
            
            # Tile placement (rows, columns):
            #   bathroom -> rows 0-63,   cols 0-63
            #   bedroom  -> rows 0-63,   cols 64-127
            #   frontal  -> rows 64-127, cols 64-127
            #   kitchen  -> rows 64-127, cols 0-63
            outputImage[0:64, 0:64] = inputImages[0]
            outputImage[0:64, 64:128] = inputImages[1]
            outputImage[64:128, 64:128] = inputImages[2]
            outputImage[64:128, 0:64] = inputImages[3]
            
            #uncomment this if you want to see a boatload of images that it is concatenating
            # NOTE(review): while the append below stays commented out, the
            # visible code never stores outputImage, so images_output remains
            # empty after the loop — confirm this is intended.
            #images_output.append(outputImage)      
            