In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import os, time, pickle
from keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

Using TensorFlow backend.


In [2]:
# Load in data
data = pd.read_csv(os.path.join("..","..","resources","data","model_data.csv"))
data.head()

Unnamed: 0,RecordID,Acerage,List_Price,Address,Cities_Towns,County,Style,Full_Baths,Half_Baths,Bedrooms,...,Basmnt_Desc,Exterior,Cool_System,Heat_System,Water,Sewer,FHA_55_,Land_Assessment,Building_Assessment,Tax_Amount
0,2,0.55,219000,205 MAIN ST,Andover Boro,Sussex,Victorian,1,0,3,...,1,0,0,1,1,0,0,72700,106300,5219
1,3,0.38,389000,120 MAIN ST,Andover Boro,Sussex,Victorian,2,0,4,...,1,1,0,0,1,0,0,70300,162900,7100
2,4,0.32,144900,365 Mohegan CI,Andover Twp.,Sussex,Multi Floor Unit,1,1,2,...,0,0,1,1,1,1,0,50000,62200,4031
3,5,0.34,148888,8 LINDA LN,Andover Twp.,Sussex,Cape Cod,1,0,3,...,1,1,0,1,0,0,0,88400,89800,6402
4,6,2.15,169900,4 HILLTOP RD,Andover Twp.,Sussex,Cape Cod,1,0,4,...,1,0,0,1,0,0,0,100800,162100,9445


In [3]:
# Create filtered datasets for model creation
unionData = data[data['County'] == 'Union']
sussexData = data[data['County'] == 'Sussex']

In [4]:
# Define model creator
def classifierCreator(readyData, label, colsToDrop):
    # Store labels and features then split data for training and testing model
    y = readyData[label].values.reshape(-1,1)
    X = readyData.drop(colsToDrop, axis=1)
    X_dummy = pd.get_dummies(X)
    X_train, X_test, y_train, y_test = train_test_split(X_dummy, y, random_state=round(time.time()))
    
    # Dummy the categorical data
    dummy_train = pd.get_dummies(X_train)
    dummy_test = pd.get_dummies(X_test)
    
    # Create, train, and test random forest classifier
    randForest = RandomForestClassifier(n_estimators=200)
    randForest = randForest.fit(dummy_train, y_train)
    testScore = randForest.score(dummy_test, y_test)
    modelImportances = sorted(zip(randForest.feature_importances_, dummy_train.columns), reverse=True)
    
    return randForest, testScore, modelImportances

In [5]:
# Function to show readable results of model predictions
def displayResults(probs_list, modelName):
    width = 34
    
    print(modelName.center(width))
    print('')
    
    if modelName == 'countyModel':
        print('Probabilities of Counties'.center(width))
        print('County' + 'Probability'.rjust(width - len('County')))
        print('----------------------------------')

        for i in range(0,2):
            print('----------------------------------')
            print(probs_list[i][0] + f'{probs_list[i][1] * 100}%'.rjust(width - len(probs_list[i][0])))
    
    else:
        print('Probabilities 5 Most Likely Towns'.center(width))
        print('Town' + 'Probability'.rjust(width - len('County')))
        print('----------------------------------')

        for i in range(0,5):
            print('----------------------------------')
            print(probs_list[i][0] + f'{round(probs_list[i][1] * 100, 2)}%'.rjust(width - len(probs_list[i][0])))

In [6]:
# Model to predict pist price of houses. This was our original idea that did not pan out very well
y_lp = data['List_Price'].values.reshape(-1,1)
X_lp = data.drop(['RecordID','Address','List_Price','Basmnt_Desc','Exterior','Water','Sewer','FHA_55_'], axis=1)
X_lp_dummy = pd.get_dummies(X_lp)
X_lp_train, X_lp_test, y_lp_train, y_lp_test = train_test_split(X_lp_dummy, y_lp, random_state=round(time.time()))

# Dummy the categorical data
dummy_train_lp = pd.get_dummies(X_lp_train)
dummy_test_lp = pd.get_dummies(X_lp_test)

# Fit and score model
lpReg = LinearRegression().fit(dummy_train_lp, y_lp_train)
lpReg.score(dummy_test_lp, y_lp_test)

0.47220946841361566

In [7]:
# List Attributes of model that are most heavily weighted
coeffs = pd.concat([pd.DataFrame(dummy_train_lp.columns),pd.DataFrame(np.transpose(np.abs(lpReg.coef_)))], axis = 1)
coeffs.columns = ['Attribute','Coefficient']
coeffs = coeffs.sort_values(by='Coefficient', ascending=False).round(4)
coeffs.head(10)

Unnamed: 0,Attribute,Coefficient
58,Style_Carriage House,396943.534
75,Style_Log Home,286965.6504
46,Cities_Towns_Summit City,257025.5451
33,Cities_Towns_Mountainside Boro,202466.5743
22,Cities_Towns_Garwood Boro,192711.8495
34,Cities_Towns_New Providence Boro,166606.8916
12,Cities_Towns_Berkeley Heights Twp.,165882.3705
10,Cities_Towns_Andover Boro,158697.3723
19,Cities_Towns_Frankford Twp.,154908.4119
59,Style_Chalet,142888.2768


In [8]:
# Create model using all data to classify county
countyModel, cmScore, cmImportances = classifierCreator(data, 'County', ['Cities_Towns','County','RecordID','Address','Style'])

# Create model using all data to classify town
allTownModel, atmScore, atmImportances = classifierCreator(data, 'Cities_Towns', ['Cities_Towns','County','RecordID','Address','Style', 'Exterior'])

# Create model using Sussex County data to classify town
sussexTownModel, stmScore, stmImportances = classifierCreator(sussexData, 'Cities_Towns', ['Cities_Towns','County','RecordID','Address','Style', 'Exterior'])

# Create model using Union County data to classify town
unionTownModel, utmScore, utmImportances = classifierCreator(unionData, 'Cities_Towns', ['Cities_Towns','County','RecordID','Address','Style', 'Exterior'])

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


In [9]:
# Show model scores
print(f'''
countyModel Score:       {cmScore}
allTownModel Score:      {atmScore}
sussexTownModel Score:   {stmScore}
unionTownModel Score:    {utmScore}
''')


countyModel Score:       0.960820895522388
allTownModel Score:      0.6156716417910447
sussexTownModel Score:   0.6028368794326241
unionTownModel Score:    0.7254901960784313



Required input format for predictions:
countyModel:[[
            Acerage                                                  (float), 
            list price                                               (integer), 
            number of full baths                                     (integer), 
            number of half baths                                     (integer), 
            number of bedrooms                                       (integer), 
            number of total rooms                                    (integer), 
            presence of basement                                     (0=No, 1=Yes), 
            presence of exterior features                            (0=No, 1=Yes), 
            presence of cooling system                               (0=No, 1=Yes), 
            type of heating system                                   (0=Radiator, 1=Other), 
            ownership of water supply                                (0=Private, 1=Public), 
            ownership of sewer system                                (0=Private, 1=Public), 
            located within 55 year old and up living area            (0=No, 1=Yes), 
            land assessment value                                    (integer), 
            building assessment value                                (integer), 
            tax amount                                               (integer)
            ]]
            
allTownModel, sussexTownModel, unionTownModel:[[
            Acerage                                                  (float), 
            list price                                               (integer), 
            number of full baths                                     (integer), 
            number of half baths                                     (integer), 
            number of bedrooms                                       (integer), 
            number of total rooms                                    (integer), 
            presence of basement                                     (0=No, 1=Yes), 
            presence of cooling system                               (0=No, 1=Yes), 
            type of heating system                                   (0=Radiator, 1=Other), 
            ownership of water supply                                (0=Private, 1=Public), 
            ownership of sewer system                                (0=Private, 1=Public), 
            located within 55 year old and up living area            (0=No, 1=Yes), 
            land assessment value                                    (integer), 
            building assessment value                                (integer), 
            tax amount                                               (integer)
            ]]

In [10]:
# Sample prediction property: 811 Passaic Ave., Linden City, Union County
# countyModel: [[0.14,180000,2,1,4,7,1,0,1,1,1,0,0,51000,63300,7871]]
# allTownModel, sussexTownModel, unionTownModel: [[0.14,180000,2,1,4,7,1,1,1,1,0,0,51000,63300,7871]]
cmClassification = sorted(zip(countyModel.classes_, countyModel.predict_proba([[0.14,180000,2,1,4,7,1,0,1,1,1,0,0,51000,63300,7871]])[0]), key=lambda x: x[1], reverse=True)
displayResults(cmClassification, 'countyModel')

           countyModel            

    Probabilities of Counties     
County                 Probability
----------------------------------
----------------------------------
Sussex                       64.0%
----------------------------------
Union                        36.0%


In [11]:
atmClassification = sorted(zip(allTownModel.classes_, allTownModel.predict_proba([[0.14,180000,2,1,4,7,1,1,1,1,0,0,51000,63300,7871]])[0]), key=lambda x: x[1], reverse=True)
displayResults(atmClassification, 'allTownModel')

           allTownModel           

Probabilities 5 Most Likely Towns 
Town                 Probability
----------------------------------
----------------------------------
Plainfield City              10.5%
----------------------------------
Vernon Twp.                  10.0%
----------------------------------
Byram Twp.                    7.0%
----------------------------------
Montague Twp.                 6.5%
----------------------------------
Hardyston Twp.                6.0%


In [12]:
stmClassification = sorted(zip(sussexTownModel.classes_, sussexTownModel.predict_proba([[0.14,180000,2,1,4,7,1,1,1,1,0,0,51000,63300,7871]])[0]), key=lambda x: x[1], reverse=True)
displayResults(stmClassification, 'sussexTownModel')

         sussexTownModel          

Probabilities 5 Most Likely Towns 
Town                 Probability
----------------------------------
----------------------------------
Vernon Twp.                  12.5%
----------------------------------
Wantage Twp.                 12.0%
----------------------------------
Newton Town                  11.5%
----------------------------------
Hamburg Boro                 10.5%
----------------------------------
Hopatcong Boro               10.0%


In [13]:
utmClassification = sorted(zip(unionTownModel.classes_, unionTownModel.predict_proba([[0.14,180000,2,1,4,7,1,1,1,1,0,0,51000,63300,7871]])[0]), key=lambda x: x[1], reverse=True)
displayResults(utmClassification, 'unionTownModel')

          unionTownModel          

Probabilities 5 Most Likely Towns 
Town                 Probability
----------------------------------
----------------------------------
Plainfield City              27.0%
----------------------------------
Berkeley Heights Twp.        19.0%
----------------------------------
Roselle Boro                 19.0%
----------------------------------
Clark Twp.                   13.0%
----------------------------------
Elizabeth City                7.0%


In [14]:
# Sample prediction property: 44 River Rd., Montague Twp., Sussex County
# countyModel: [[1.10,165000,1,0,3,6,1,1,0,0,0,0,0,59600,73700,3615]]
# allTownModel, sussexTownModel, unionTownModel: [[1.10,165000,1,0,3,6,1,0,0,0,0,0,59600,73700,3615]]
cmClassification = sorted(zip(countyModel.classes_, countyModel.predict_proba([[1.10,165000,1,0,3,6,1,1,0,0,0,0,0,59600,73700,3615]])[0]), key=lambda x: x[1], reverse=True)
displayResults(cmClassification, 'countyModel')

           countyModel            

    Probabilities of Counties     
County                 Probability
----------------------------------
----------------------------------
Sussex                      100.0%
----------------------------------
Union                         0.0%


In [15]:
atmClassification = sorted(zip(allTownModel.classes_, allTownModel.predict_proba([[1.10,165000,1,0,3,6,1,0,0,0,0,0,59600,73700,3615]])[0]), key=lambda x: x[1], reverse=True)
displayResults(atmClassification, 'allTownModel')

           allTownModel           

Probabilities 5 Most Likely Towns 
Town                 Probability
----------------------------------
----------------------------------
Vernon Twp.                  27.0%
----------------------------------
Stillwater Twp.              25.5%
----------------------------------
Hampton Twp.                 13.0%
----------------------------------
Montague Twp.                11.0%
----------------------------------
Hardyston Twp.                7.0%


In [16]:
stmClassification = sorted(zip(sussexTownModel.classes_, sussexTownModel.predict_proba([[1.10,165000,1,0,3,6,1,0,0,0,0,0,59600,73700,3615]])[0]), key=lambda x: x[1], reverse=True)
displayResults(stmClassification, 'sussexTownModel')

         sussexTownModel          

Probabilities 5 Most Likely Towns 
Town                 Probability
----------------------------------
----------------------------------
Vernon Twp.                  35.5%
----------------------------------
Stillwater Twp.              24.5%
----------------------------------
Hampton Twp.                 10.5%
----------------------------------
Montague Twp.                 7.0%
----------------------------------
Andover Twp.                  4.5%


In [17]:
utmClassification = sorted(zip(unionTownModel.classes_, unionTownModel.predict_proba([[1.10,165000,1,0,3,6,1,0,0,0,0,0,59600,73700,3615]])[0]), key=lambda x: x[1], reverse=True)
displayResults(utmClassification, 'unionTownModel')

          unionTownModel          

Probabilities 5 Most Likely Towns 
Town                 Probability
----------------------------------
----------------------------------
Berkeley Heights Twp.        33.0%
----------------------------------
Clark Twp.                   25.5%
----------------------------------
Plainfield City              14.0%
----------------------------------
Roselle Park Boro             7.5%
----------------------------------
Roselle Boro                  5.0%


The below cells are used to save models. They are currently commented out to prevent overwriting current models.

In [18]:
with open(os.path.join('countyModel'), 'wb') as f:
    pickle.dump(countyModel, f)

In [19]:
with open(os.path.join('allTownModel'), 'wb') as f:
    pickle.dump(allTownModel, f)

In [20]:
with open(os.path.join('sussexTownModel'), 'wb') as f:
    pickle.dump(sussexTownModel, f)

In [21]:
with open(os.path.join('unionTownModel'), 'wb') as f:
    pickle.dump(unionTownModel, f)

In [22]:
with open(os.path.join('listPriceModel'), 'wb') as f:
    pickle.dump(lpReg, f)