In [62]:
"""Import libraries"""

import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

In [72]:
"""Initialize data"""

# Sample request payload used to exercise the preprocessing functions.
# Every value is written as a string, including booleans ("True"/"False"),
# numbers ("100") and absent values ("None").
input_data = {
    "area": "100",
    "building-state": "JUST RENOVATED",
    "equipped-kitchen": "True",
    "facades-number": "2",
    "full-address": "Sportstraat 166 9000 Ghent",
    "furnished": "False",
    "garden": "False",
    "garden-area": "None",
    "land-area": "None",
    "open-fire": "False",
    "property-type": "APARTMENT",
    "rooms-number": "",  # left empty in this sample
    "swimming-pool": "False",
    "terrace": "True",
    "terrace-area": "2",
    "zip-code": "9000"
}

# Module-level dictionary; it starts out empty.
output_data = {}

In [73]:
"""Function to load input dictionary into a dataframe"""

def load_input(input_data):
    """Turn a flat input dictionary into a one-row DataFrame.

    Parameters
    ----------
    input_data : dict
        Mapping of field name to scalar value.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``0`` whose columns are the dictionary keys
        arranged in sorted order.
    """
    single_row = pd.DataFrame(input_data, index=[0])
    ordered_columns = sorted(single_row.columns)
    return single_row.reindex(columns=ordered_columns)

In [79]:
"""Function to check if required information is not empty"""

def _is_blank(value):
    """Return True when ``value`` carries no information.

    None, NaN and empty or whitespace-only strings all count as blank.
    """
    if isinstance(value, str):
        return value.strip() == ""
    return bool(pd.isna(value))


def check_required_missing(df_input):
    """Report the first required field that is absent or empty.

    The fields are checked in the order 'area', 'property-type',
    'rooms-number', 'zip-code'; only the first problem is reported.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Input frame whose values are read from the row labelled ``0``.

    Returns
    -------
    dict
        An empty dict when every required field is filled in, otherwise
        ``{"error": <message>, "status code": 400}``.

    Notes
    -----
    A new dict is returned on every call. Writing the error into a shared
    module-level dict made one failed call leak its error into every later
    call, so valid inputs were rejected after the first failure.
    """
    required_fields = ("area", "property-type", "rooms-number", "zip-code")
    for field in required_fields:
        # A column that is not present at all is just as missing as an empty value.
        if field not in df_input.columns or _is_blank(df_input.loc[0, field]):
            return {
                "error": f"missing required information for '{field}'",
                "status code": 400,
            }
    return {}

In [80]:
# Quick manual check: load the sample payload and run the required-field check on it.
test = load_input(input_data)
a = check_required_missing(test)

# Last expression of the cell, so the returned dictionary is displayed.
a

{'error': "missing required information for 'rooms-number'",
 'status code': 400}

In [66]:
"""Function to check if required information is in correct format"""

def _is_integer_like(value):
    """Return True when ``value`` is an integer or text that parses as one."""
    if isinstance(value, bool):
        # bool is a subclass of int, but True/False is not a meaningful number here.
        return False
    if isinstance(value, float):
        # Accept 100.0 but reject 100.5, NaN and infinities.
        return value.is_integer()
    try:
        int(str(value).strip())
    except ValueError:
        return False
    return True


def check_required_string(df_input):
    """Report the first numeric required field that does not hold an integer.

    The fields 'area', 'rooms-number' and 'zip-code' are checked in that
    order; only the first problem is reported.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Input frame whose values are read from the row labelled ``0``.

    Returns
    -------
    dict
        An empty dict when all three fields are integers (or strings such
        as ``"100"`` that parse as integers), otherwise
        ``{"error": <message>, "status code": 400}``.

    Notes
    -----
    ``is_string_dtype`` is not used because it interprets a scalar argument
    as a dtype specification, not as a value, so ordinary bad input such as
    ``"abc"`` was never rejected. A new dict is returned on every call so an
    error from an earlier call cannot leak into a later one.
    """
    numeric_fields = ("area", "rooms-number", "zip-code")
    for field in numeric_fields:
        if not _is_integer_like(df_input.loc[0, field]):
            return {
                "error": f"'{field}' information must be an integer",
                "status code": 400,
            }
    return {}

In [67]:
"""Function to check if 'property-type' is in correct format"""

def check_property_type(df_input):
    """Check that 'property-type' is one of the two accepted values.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Input frame; 'property-type' is read from the row labelled ``0``.

    Returns
    -------
    dict
        An empty dict when the value is exactly ``"APARTMENT"`` or
        ``"HOUSE"``, otherwise ``{"error": <message>, "status code": 400}``.

    Notes
    -----
    A new dict is returned on every call. Returning a shared module-level
    dict meant that an error recorded by any earlier check was reported
    here even when the property type was valid.
    """
    accepted_types = ("APARTMENT", "HOUSE")
    if df_input.loc[0, "property-type"] in accepted_types:
        return {}
    return {
        "error": "'property-type' information can only be 'APARTMENT' or 'HOUSE'",
        "status code": 400,
    }

In [68]:
"""Function to rename input features to match the model's training features and drop features that won't be used for modeling.
It is meant to run once all required information is available and in the correct format."""

def input_features(df_input):
    """Keep only the model's features, renamed to the model's column names.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame that uses the request field names ('area', 'property-type',
        ...). It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns 'bedrooms', 'buildingCondition',
        'kitchenType', 'livingArea', 'numberOfFrontages', 'postalCode' and
        'type', in that (sorted) order. A feature whose source field is not
        present in ``df_input`` comes back as a column of NaN.

    Notes
    -----
    Columns are selected by name. Dropping by position (``columns[12:15]``,
    ``columns[4:10]``) only worked when the input held exactly the expected
    sixteen keys; with any other set of keys it silently dropped the wrong
    columns. Working on a new frame also makes the function safe to re-run
    on the same input.
    """
    model_names = {
        "area": "livingArea",
        "property-type": "type",
        "rooms-number": "bedrooms",
        "zip-code": "postalCode",
        "equipped-kitchen": "kitchenType",
        "facades-number": "numberOfFrontages",
        "building-state": "buildingCondition",
    }
    renamed = df_input.rename(columns=model_names)
    # reindex both selects the wanted columns and puts them in sorted order.
    return renamed.reindex(columns=sorted(model_names.values()))

In [69]:
"""Function to transform input values"""

def _kitchen_flag(value):
    """Map an 'equipped kitchen' value to 1 (equipped) or 0 (anything else)."""
    if isinstance(value, str):
        # Request values arrive as text, so "True" has to be recognised explicitly.
        return int(value.strip().lower() == "true")
    if pd.isna(value):
        return 0
    # Same outcome as the ``== True`` comparison: bool True and numeric 1 give 1.
    return int(value == 1)


def input_values(df_input_features):
    """Convert the renamed input columns to the numeric values used for modeling.

    Parameters
    ----------
    df_input_features : pandas.DataFrame
        Frame with the columns 'type', 'bedrooms', 'livingArea',
        'postalCode', 'kitchenType', 'buildingCondition' and
        'numberOfFrontages'. The columns are overwritten on this frame.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with
        - 'type': "APARTMENT" -> 0, "HOUSE" -> 1, cast to int;
        - 'bedrooms', 'livingArea': cast to int;
        - 'postalCode': cast to str;
        - 'kitchenType': 1 for True / "True", otherwise 0;
        - 'buildingCondition': "NEW" -> 1, "JUST RENOVATED" -> 2,
          "GOOD" -> 3, "TO RENOVATE" -> 5, "TO REBUILD" -> 6, otherwise 0;
        - 'numberOfFrontages': the integer value, or 1 when it is missing
          or not numeric.

    Raises
    ------
    ValueError
        If 'type', 'bedrooms' or 'livingArea' cannot be converted to int.

    Notes
    -----
    Two defects of the scalar-by-scalar version are fixed. The text "True"
    never compared equal to the boolean ``True``, so an equipped kitchen was
    always encoded as 0. ``is_string_dtype`` treats a scalar as a dtype
    specification, so a non-numeric number of frontages such as "None" was
    not replaced by the default and the integer cast failed.
    """
    type_codes = {"APARTMENT": 0, "HOUSE": 1}
    # NOTE(review): code 4 is unused in this mapping; confirm against the training data.
    condition_codes = {
        "NEW": 1,
        "JUST RENOVATED": 2,
        "GOOD": 3,
        "TO RENOVATE": 5,
        "TO REBUILD": 6,
    }

    # Unknown labels are passed through unchanged so the int cast still rejects them.
    df_input_features["type"] = (
        df_input_features["type"]
        .map(lambda label: type_codes.get(label, label))
        .astype(int)
    )
    df_input_features["bedrooms"] = df_input_features["bedrooms"].astype(int)
    df_input_features["livingArea"] = df_input_features["livingArea"].astype(int)
    df_input_features["postalCode"] = df_input_features["postalCode"].astype(str)

    df_input_features["kitchenType"] = (
        df_input_features["kitchenType"].map(_kitchen_flag).astype(int)
    )
    df_input_features["buildingCondition"] = (
        df_input_features["buildingCondition"]
        .map(lambda state: condition_codes.get(state, 0))
        .astype(int)
    )

    # Anything that does not parse as a number becomes NaN and then the default of 1.
    frontages = pd.to_numeric(df_input_features["numberOfFrontages"], errors="coerce")
    df_input_features["numberOfFrontages"] = frontages.fillna(1).astype(int)

    return df_input_features

In [70]:
"""This is the preprocessing function"""

def preprocess(input_data):
    """Validate a raw input dictionary and turn it into model-ready features.

    The input is loaded into a one-row frame and passed through
    ``check_required_missing``, ``check_required_string`` and
    ``check_property_type``, in that order. The first non-empty result stops
    the pipeline.

    Parameters
    ----------
    input_data : dict
        Raw request payload.

    Returns
    -------
    dict or pandas.DataFrame
        The error dictionary of the first failing check, or the frame
        produced by ``input_features`` followed by ``input_values`` when
        every check passes.

    Notes
    -----
    Any 'error' / 'status code' left in the module-level ``output_data`` by
    an earlier run is removed first; otherwise one rejected input would make
    every later call report that same error. The error is returned as a copy
    so that this clean-up cannot empty a dictionary handed out earlier.
    """
    output_data.pop("error", None)
    output_data.pop("status code", None)

    df_input = load_input(input_data)

    validators = (check_required_missing, check_required_string, check_property_type)
    for validate in validators:
        problem = validate(df_input)
        if len(problem) != 0:
            return dict(problem)

    return input_values(input_features(df_input))

In [71]:
# Run the full preprocessing pipeline on the sample payload.
for_prediction = preprocess(input_data)

sample_input = '../../../data/sample_input.csv'

# preprocess returns an error dictionary instead of a frame when validation
# fails; only a frame can be written out, so report the error otherwise.
if isinstance(for_prediction, pd.DataFrame):
    for_prediction.to_csv(sample_input, index=False, header=True)
else:
    print(f"Input rejected, nothing written to {sample_input}: {for_prediction}")