In [1]:
"""Import libraries"""

import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

In [2]:
"""Initialize data"""

# Relative path to the CSV holding the train-test data; it is resolved
# against the kernel's current working directory when the file is read.
train_test_data = '../../../data/properties_filtered.csv'

In [3]:
"""Function to load train-test data into a dataframe"""

def load_train_test(train_test_data):
    """Read the CSV at ``train_test_data`` and return its contents as a DataFrame.

    Parameters
    ----------
    train_test_data : path or file-like accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame with the parsed rows.
    """
    return pd.read_csv(train_test_data)

In [4]:
"""Function to drop model training features that won't be used for modeling then sort dataframe"""

def train_test_features(df_train_test):
    """Drop the columns that are not used for modeling, then sort the rest alphabetically.

    Columns are removed by position in three passes (13-19, then 5-7, then
    1-3). Because the passes go from the highest slice to the lowest, each
    slice still refers to the positions of the frame as it was passed in.

    The input frame is left untouched: every pass works on a new frame, so
    calling this function again on the same input yields the same result
    instead of dropping yet more columns.

    Parameters
    ----------
    df_train_test : pandas.DataFrame
        Frame whose columns are selected by position.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns in alphabetical order.
    """
    trimmed = df_train_test
    for positions in (slice(13, 20), slice(5, 8), slice(1, 4)):
        # drop() without inplace returns a new frame; the caller's frame is never modified
        trimmed = trimmed.drop(columns=trimmed.columns[positions])
    return trimmed.reindex(sorted(trimmed.columns), axis=1)   #sort features alphabetically

In [5]:
"""Function to transform train-test values"""

def train_test_values(df_train_test_features):
    """Return a copy of the frame with 'postalCode' and 'kitchenType' transformed.

    * 'postalCode' is cast to string.
    * 'kitchenType' is collapsed to a binary flag: 1 and 2 become 0, and
      3 through 8 become 1. Any other value is left as it is.

    The work is done on a copy. Writing the result back into the caller's
    frame would make a second call on that frame wrong: the 1s produced by
    the first call would be turned into 0 by the ``[1, 2] -> 0`` step.

    Parameters
    ----------
    df_train_test_features : pandas.DataFrame
        Must contain the columns 'postalCode' and 'kitchenType'.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    transformed = df_train_test_features.copy()
    transformed["postalCode"] = transformed["postalCode"].astype(str)   #convert 'postalCode' to string type
    # Order matters: map 1/2 to 0 first, then 3..8 to 1, so the freshly written 1s are not remapped.
    transformed["kitchenType"] = (
        transformed["kitchenType"]
        .replace([1, 2], 0)
        .replace([3, 4, 5, 6, 7, 8], 1)
    )
    return transformed

In [7]:
"""Function to define categorical features and run them through OneHotEncoder, define non-categorical features then merge them with encoded data"""

def one_hot_encoder(df_train_test_values):
    """Assemble the feature matrix X from the transformed frame.

    Column layout of the result, left to right:
      1. the raw 'type' column,
      2. the one-hot encoding of 'postalCode',
      3. the non-categorical columns, unencoded.

    Side effect: the fitted OneHotEncoder is written to 'encoder.joblib'.

    Returns
    -------
    numpy.ndarray holding the stacked columns.
    """
    categorical_cols = ["postalCode"]   #categorical features to encode
    passthrough_cols = ["bedrooms", "buildingCondition", "kitchenType", "livingArea", "numberOfFrontages"]   #non-categorical features

    type_values = df_train_test_values["type"].values   #'type' as a numpy array

    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(df_train_test_values[categorical_cols])
    encoded = encoded.toarray()   #encoder output converted to a numpy array
    left_block = np.column_stack((type_values, encoded))   #'type' followed by the encoded columns

    joblib.dump(encoder, 'encoder.joblib')   #persist the fitted encoder

    right_block = df_train_test_values[passthrough_cols].values   #non-categorical features as a numpy array
    return np.column_stack((left_block, right_block))   #full feature matrix X

In [8]:
"""Function to split X and y into train and test sets"""

def split_data(X, y):
    """Split X and y into train and test parts.

    A quarter of the rows go to the test set (``test_size=0.25``) and the
    shuffle is seeded with ``random_state=42``.

    Returns
    -------
    tuple: (X_train, X_test, y_train, y_test)
    """
    # train_test_split returns a list; convert so callers still receive a tuple
    return tuple(train_test_split(X, y, test_size=0.25, random_state=42))

In [9]:
"""Function to scale the datasets"""

def scaler(X_train, X_test):
    """Min-max scale both splits using statistics learned from the training split only.

    The scaler is fitted on ``X_train`` and that same fitted scaler is then
    applied to ``X_test``. Fitting again on the test split would scale the
    two splits with different min/max values, and the object saved to disk
    would describe the test data rather than the data the model was trained on.

    Side effect: the fitted scaler is written to 'scaler.joblib'.

    Parameters
    ----------
    X_train, X_test : array-like feature matrices with the same columns.

    Returns
    -------
    tuple: (scaled X_train, scaled X_test)
    """
    sc = MinMaxScaler()   #declare scaler
    X_train = sc.fit_transform(X_train)   #learn min/max from the training data and scale it
    X_test = sc.transform(X_test)   #reuse the training min/max; do NOT refit on test data
    joblib.dump(sc, 'scaler.joblib')   #save the train-fitted scaler to joblib
    return X_train, X_test

In [10]:
"""Function to declare model, fit and evaluate"""

def regression_model(X_train, X_test, y_train, y_test):
    """Fit a random forest regressor, save it, and report its score on both splits.

    The forest uses 100 trees and ``random_state=0``. After fitting on the
    training split it is scored with ``.score()`` on the training and test
    splits, then written to 'model.joblib'.

    Returns
    -------
    tuple: (train_score, test_score)
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    scores = (
        forest.score(X_train, y_train),   #score on the data the model was fitted on
        forest.score(X_test, y_test),   #score on the held-out data
    )
    joblib.dump(forest, 'model.joblib')   #persist the fitted model
    return scores

In [14]:
"""This is the modeling function"""

def model(train_test_data):
    """Run the whole pipeline: load, select features, transform, encode, split, scale, fit.

    Parameters
    ----------
    train_test_data : path to the CSV with the train-test data; it must
        contain a 'price' column, which is used as the target.

    Returns
    -------
    tuple: (train_score, test_score) as returned by ``regression_model``.
    """
    df_train_test = load_train_test(train_test_data)   #load dataset
    # Read the target before the feature-selection step removes columns.
    # A 1-D array (n_samples,) is used on purpose: a column vector (n_samples, 1)
    # makes scikit-learn emit a DataConversionWarning when the forest is fitted.
    y = df_train_test["price"].values   #declare y variable
    df_train_test_features = train_test_features(df_train_test)   #remove features that won't be used in the model
    df_train_test_values = train_test_values(df_train_test_features)   #transform train-test values
    X = one_hot_encoder(df_train_test_values)   #declare X variable
    X_train, X_test, y_train, y_test = split_data(X, y)   #split dataset
    X_train, X_test = scaler(X_train, X_test)   #scale dataset
    train_score, test_score = regression_model(X_train, X_test, y_train, y_test)   #run model
    return train_score, test_score