In [81]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
import numpy as np

In [82]:
# bring in our Beverly Grove file

Beverly_Grove_df = pd.read_csv('Beverly_Grove_Cleaned.csv', index_col='PARCEL')
Beverly_Grove_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0
5510005006,1,0,6727 Drexel Ave,1,0,0,85697,74539,11158,0,...,0,0,0,0,0,0,0,0,0,0
5089024002,0,0,942 S Cochran Ave,0,0,0,62487,42013,20474,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
Beverly_Grove_df = Beverly_Grove_df.dropna()

In [84]:
Beverly_Grove_df.dtypes

Owned by Trust?                int64
Owned by Business?             int64
SITEADDRESS                   object
MAIL DIFFERENT FROM SITE?      int64
MAIL OUTSIDE CA?               int64
TITLECO1                       int64
ASSDTOTAL                      int64
ASSDLAND                       int64
ASSDSTCT                       int64
ASSDOTHR                       int64
EXEMPTCD                       int64
EXMPTAMT                       int64
PCNTIMPD                       int64
TAXAMT                       float64
DOCDATE_YEAR                   int64
MULTIPARCEL                    int64
PRICE                          int64
LENDER1                        int64
LOANAMOUT1                     int64
LOANTYPE1                      int64
YEARBLT                        int64
EFFYRBLT                     float64
LOTSQFT                        int64
TOTALSF                        int64
GRGTYPE                        int64
BLDGMAT                        int64
TOTUNITS                       int64
Q

In [85]:
Beverly_Grove_df['TAXAMT'] = Beverly_Grove_df['TAXAMT'].astype(int)
Beverly_Grove_df['EFFYRBLT'] = Beverly_Grove_df['EFFYRBLT'].astype(int)

In [86]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Beverly_Grove_df = Beverly_Grove_df[~Beverly_Grove_df['SITEADDRESS'].str.contains('#')]
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
# Now drop the SiteAddress column
Beverly_Grove_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [88]:
# check to see if it dropped.
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,0,0,1,1157385,925908,231477,0,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1,0,0,57047,40994,16053,0,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,0,0,1,651485,521193,130292,0,1,...,0,0,0,0,0,0,0,0,0,0


In [89]:
len(Beverly_Grove_df)

4705

In [90]:
# Step 1: Create our X and y

y = Beverly_Grove_df['Did it sell?']
X = Beverly_Grove_df.drop(['Did it sell?'], 1)

In [91]:
APN = Beverly_Grove_df.index.values
APN

array([5085010022, 5085033008, 5085032022, ..., 5510018012, 5510017023,
       5514001029])

In [92]:
# Split our data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

In [93]:
X_train.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5088017017,0,0,0,0,1,340000,136000,204000,0,1,...,0,0,0,0,0,0,0,0,0,0
5089020010,0,0,0,0,0,218091,106556,111535,0,1,...,0,0,0,0,0,0,0,0,0,0
5088016028,1,0,0,0,1,195815,49267,146548,0,1,...,0,0,0,0,0,0,0,0,0,0


In [94]:
# Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [95]:
X_train_scaled_APN = []

In [96]:
for index, parcel in enumerate(X_train.index.values):
     X_train_scaled_APN.append(np.insert(X_train_scaled[index], 0, [parcel]))

In [97]:
# Create the learning model

log_classifier = LogisticRegression(solver='lbfgs',
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

# Train the model
log_classifier.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test_scaled)

print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.592


In [99]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
1026,0,0,5510013025
690,1,0,5085004005
894,0,0,5088009006
1000,1,0,5085007005
468,1,1,5526021012
735,1,0,5510019025
267,1,1,5510017029
266,0,0,5085011009
975,0,0,5510018007
230,1,0,5511006002


In [100]:
# Define the basic neural network model
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=150, activation="relu", input_dim=55))
nn_model.add(tf.keras.layers.Dense(units=150, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=150, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=150, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=150, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=150, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=150, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=200)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
37/37 - 0s - loss: 0.7969 - accuracy: 0.8895
Loss: 0.7968716025352478, Accuracy: 0.889549732208252


In [47]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[666, 430],
       [ 50,  31]])

In [48]:
model_accuracy

0.9226847887039185

In [49]:
from sklearn.metrics import balanced_accuracy_score

In [50]:
balanced_accuracy_score(y_test, y_pred)

0.4951901414796792

In [52]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred)

0.1143911439114391

In [53]:
# print out the classification report

from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

ModuleNotFoundError: No module named 'imblearn'

In [66]:
#y_pred = nn_model(X_test_scaled)
#predictions = fit_model.eval(feed_dict = {x:X_test_scaled})

AttributeError: 'History' object has no attribute 'eval'

In [55]:
# Convert our new, fitted model into a dataframe

new_fit_model_df = pd.DataFrame(X_train_scaled)
new_fit_model_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,-0.618859,-0.185205,-0.522673,-0.147378,0.740761,-0.644949,-0.865631,-0.071031,-0.01959,0.923548,...,-0.016838,-0.190861,-0.016838,0.0,-0.428364,-0.029173,0.0,-0.28672,-0.029173,-0.033691
1,-0.618859,-0.185205,-0.522673,-0.147378,-1.349963,-0.848467,-0.934399,-0.489487,-0.01959,0.923548,...,-0.016838,-0.190861,-0.016838,0.0,-0.428364,-0.029173,0.0,-0.28672,-0.029173,-0.033691
2,1.615876,-0.185205,-0.522673,-0.147378,0.740761,-0.885655,-1.068201,-0.331034,-0.01959,0.923548,...,-0.016838,-0.190861,-0.016838,0.0,-0.428364,-0.029173,0.0,-0.28672,-0.029173,-0.033691
3,-0.618859,-0.185205,-0.522673,-0.147378,0.740761,-0.064153,-0.298649,0.404786,-0.01959,-1.082781,...,-0.016838,-0.190861,-0.016838,0.0,-0.428364,-0.029173,0.0,-0.28672,-0.029173,-0.033691
4,1.615876,-0.185205,-0.522673,-0.147378,-1.349963,-0.584741,-0.773096,-0.08712,-0.01959,0.923548,...,-0.016838,-0.190861,-0.016838,0.0,-0.428364,-0.029173,0.0,-0.28672,-0.029173,-0.033691


## Now try a Neural Net with resampled data