In [1]:
# import our dependencies

%matplotlib inline
from collections import Counter
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import numpy as np

In [2]:
# import our file

Beverly_Grove_df = pd.read_csv('Beverly_Grove_Cleaned.csv', index_col='PARCEL')
Beverly_Grove_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0
5510005006,1,0,6727 Drexel Ave,1,0,0,85697,74539,11158,0,...,0,0,0,0,0,0,0,0,0,0
5089024002,0,0,942 S Cochran Ave,0,0,0,62487,42013,20474,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
Beverly_Grove_df = Beverly_Grove_df.dropna()

In [4]:
Beverly_Grove_df.dtypes

Owned by Trust?                int64
Owned by Business?             int64
SITEADDRESS                   object
MAIL DIFFERENT FROM SITE?      int64
MAIL OUTSIDE CA?               int64
TITLECO1                       int64
ASSDTOTAL                      int64
ASSDLAND                       int64
ASSDSTCT                       int64
ASSDOTHR                       int64
EXEMPTCD                       int64
EXMPTAMT                       int64
PCNTIMPD                       int64
TAXAMT                       float64
DOCDATE_YEAR                   int64
MULTIPARCEL                    int64
PRICE                          int64
LENDER1                        int64
LOANAMOUT1                     int64
LOANTYPE1                      int64
YEARBLT                        int64
EFFYRBLT                     float64
LOTSQFT                        int64
TOTALSF                        int64
GRGTYPE                        int64
BLDGMAT                        int64
TOTUNITS                       int64
Q

In [5]:
Beverly_Grove_df['TAXAMT'] = Beverly_Grove_df['TAXAMT'].astype(int)
Beverly_Grove_df['EFFYRBLT'] = Beverly_Grove_df['EFFYRBLT'].astype(int)

In [6]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Beverly_Grove_df = Beverly_Grove_df[~Beverly_Grove_df['SITEADDRESS'].str.contains('#')]
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Now drop the SiteAddress column
Beverly_Grove_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [8]:
# check to see if it dropped.
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,0,0,1,1157385,925908,231477,0,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1,0,0,57047,40994,16053,0,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,0,0,1,651485,521193,130292,0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
len(Beverly_Grove_df)

4705

# Attempt 1: Run a standard Logistic Regression

In [10]:
# Step 1: Create our X and y

y = Beverly_Grove_df['Did it sell?']
X = Beverly_Grove_df.drop(['Did it sell?'], 1)

In [11]:
APN = Beverly_Grove_df.index.values
APN

array([5085010022, 5085033008, 5085032022, ..., 5510018012, 5510017023,
       5514001029])

In [12]:
# Split our data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

In [13]:
X_train.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5088017017,0,0,0,0,1,340000,136000,204000,0,1,...,0,0,0,0,0,0,0,0,0,0
5089020010,0,0,0,0,0,218091,106556,111535,0,1,...,0,0,0,0,0,0,0,0,0,0
5088016028,1,0,0,0,1,195815,49267,146548,0,1,...,0,0,0,0,0,0,0,0,0,0


In [14]:
X_train.columns

Index(['Owned by Trust?', 'Owned by Business?', 'MAIL DIFFERENT FROM SITE?',
       'MAIL OUTSIDE CA?', 'TITLECO1', 'ASSDTOTAL', 'ASSDLAND', 'ASSDSTCT',
       'ASSDOTHR', 'EXEMPTCD', 'EXMPTAMT', 'PCNTIMPD', 'TAXAMT',
       'DOCDATE_YEAR', 'MULTIPARCEL', 'PRICE', 'LENDER1', 'LOANAMOUT1',
       'LOANTYPE1', 'YEARBLT', 'EFFYRBLT', 'LOTSQFT', 'TOTALSF', 'GRGTYPE',
       'BLDGMAT', 'TOTUNITS', 'QUALCLAS', 'BLDGCOND', 'NOSTORY', 'ROOFMAT',
       'FOUNDATN', 'BEDROOMS', 'BATHROOMS', 'FAMILYRM', 'DININGRM', 'POOL',
       'PATIO', 'FIREPLCE', 'AIRMTHOD', 'HEATMTHD', 'VIEW', 'GRANT DEED',
       'IN LIEU OF FORC', 'INTERSPOUSAL', 'JOINT TENANT', 'NAME CHANGE',
       'Other', 'PARTNERSHIP', 'PERSONAL REP', 'QUIT CLAIM', 'RE-RECORD',
       'TAX DEED', 'TRUST TRANSFER', 'TRUSTEES', 'WARRANTY'],
      dtype='object')

In [15]:
# Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
X_train_scaled_APN = []

In [17]:
for index, parcel in enumerate(X_train.index.values):
     X_train_scaled_APN.append(np.insert(X_train_scaled[index], 0, [parcel]))

In [18]:
# Create the learning model

classifier = LogisticRegression(solver='lbfgs',
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

In [19]:
# fit our learning model to our scaled training data

classifier.fit(X_train_scaled, y_train)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [20]:
y_pred = classifier.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
854,1,0,5526025015
375,1,0,5514010038
506,0,0,5089017037
867,0,0,5510018008
373,0,0,5514012013
1068,1,0,5087002005
955,0,0,5089021001
563,1,0,5526023023
836,1,0,5087009017
352,0,0,5525021017


In [21]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.5921835174171622


In [22]:
from sklearn.metrics import balanced_accuracy_score
print(balanced_accuracy_score(y_test, y_pred))

0.4951901414796792


In [23]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.6923828010441901

In [24]:
# See our results in a confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[666, 430],
       [ 50,  31]])

In [25]:
# evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.61      0.74      1096
           1       0.07      0.38      0.11        81

    accuracy                           0.59      1177
   macro avg       0.50      0.50      0.42      1177
weighted avg       0.87      0.59      0.69      1177



In [26]:
Beverly_Grove_df = Beverly_Grove_df.merge(results, how="left", right_on="PARCEL", left_on="PARCEL")
Beverly_Grove_df.head()

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY,Prediction,Actual
0,5085010022,0,0,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,,
1,5085033008,0,0,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0.0,0.0
2,5085032022,0,0,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,,
3,5510005006,1,0,1,0,0,85697,74539,11158,0,...,0,0,0,0,0,0,0,0,0.0,0.0
4,5089024002,0,0,0,0,0,62487,42013,20474,0,...,0,0,0,0,0,0,0,0,,


# Attempt 2: Resample our data using SMOTEEN

In [27]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({0: 2095, 1: 3245})

In [28]:
model_2 = LogisticRegression(solver='lbfgs', random_state=78, max_iter=300, class_weight="balanced")
model_2.fit(X_resampled, y_resampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [29]:
# Calculated the balanced accuracy score
y_pred = model_2.predict(X_test_scaled)

In [30]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5283635216725241

In [31]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5794392523364486

In [32]:
f1_score(y_test, y_pred, average="weighted")

0.6818244603198075

In [33]:
confusion_matrix(y_test, y_pred)

array([[644, 452],
       [ 43,  38]])

In [34]:
# evaluate
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.59      0.72      1096
           1       0.08      0.47      0.13        81

    accuracy                           0.58      1177
   macro avg       0.51      0.53      0.43      1177
weighted avg       0.88      0.58      0.68      1177



# Attempt 3: Resample our Data using SMOTE

In [35]:
from imblearn.over_sampling import SMOTE
X_resampled_2, y_resampled_2 = SMOTE(random_state=1, 
                               sampling_strategy='auto').fit_resample(
                               X_train_scaled, y_train)

In [36]:
Counter(y_resampled_2)

Counter({0: 3287, 1: 3287})

In [37]:
# perform our logistic regression

model_3 = LogisticRegression(solver='lbfgs', random_state=78, max_iter=300, class_weight="balanced")
model_3.fit(X_resampled_2, y_resampled_2)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [38]:
# Calculated the balanced accuracy score
y_pred = model_3.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.5146492295214923

In [39]:
accuracy_score(y_test, y_pred)

0.5751911639762107

In [40]:
f1_score(y_test, y_pred, average="weighted")

0.6785693826192267

# Attempt 4: Use a Support Vector Machine

In [41]:
# Instantiate a linear SVM model
from sklearn.svm import SVC
model_4 = SVC(kernel='linear')

In [42]:
#model_4.class_weight_

In [43]:
#model_4.coef_

In [44]:
#pd.Series(abs(model_4.coef_[0]), index=X_train.columns).nlargest(20).plot(kind='barh')

In [45]:
Beverly_Grove_df.columns

Index(['PARCEL', 'Owned by Trust?', 'Owned by Business?',
       'MAIL DIFFERENT FROM SITE?', 'MAIL OUTSIDE CA?', 'TITLECO1',
       'ASSDTOTAL', 'ASSDLAND', 'ASSDSTCT', 'ASSDOTHR', 'EXEMPTCD', 'EXMPTAMT',
       'PCNTIMPD', 'TAXAMT', 'DOCDATE_YEAR', 'MULTIPARCEL', 'PRICE', 'LENDER1',
       'LOANAMOUT1', 'LOANTYPE1', 'YEARBLT', 'EFFYRBLT', 'LOTSQFT', 'TOTALSF',
       'GRGTYPE', 'BLDGMAT', 'TOTUNITS', 'QUALCLAS', 'BLDGCOND', 'NOSTORY',
       'ROOFMAT', 'FOUNDATN', 'BEDROOMS', 'BATHROOMS', 'FAMILYRM', 'DININGRM',
       'POOL', 'PATIO', 'FIREPLCE', 'AIRMTHOD', 'HEATMTHD', 'VIEW',
       'Did it sell?', 'GRANT DEED', 'IN LIEU OF FORC', 'INTERSPOUSAL',
       'JOINT TENANT', 'NAME CHANGE', 'Other', 'PARTNERSHIP', 'PERSONAL REP',
       'QUIT CLAIM', 'RE-RECORD', 'TAX DEED', 'TRUST TRANSFER', 'TRUSTEES',
       'WARRANTY', 'Prediction', 'Actual'],
      dtype='object')

In [46]:
#model_4.support_vectors_

In [47]:
# Fit/train our model using the scaled training data
model_4.fit(X_train_scaled, y_train)

SVC(kernel='linear')

In [48]:
# Make predictions using the test data
y_pred = model_4.predict(X_test_scaled)
results_SVM = pd.DataFrame({
    "Prediction": y_pred,
    "Actual": y_test
}).reset_index(drop=True)

In [49]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9311809685641461

In [50]:
balanced_accuracy_score(y_test, y_pred)

0.5

In [51]:
f1_score(y_test, y_pred, average="weighted")

0.8979976608414467

In [52]:
# create our confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[1096,    0],
       [  81,    0]])

In [53]:
# evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1096
           1       0.00      0.00      0.00        81

    accuracy                           0.93      1177
   macro avg       0.47      0.50      0.48      1177
weighted avg       0.87      0.93      0.90      1177



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Attempt 5: Reduce the number of columns to just the important ones and run an SVC

In [54]:
# REimport our file

# Beverly_Grove_df = pd.read_csv('Beverly_Grove_Cleaned.csv', index_col='PARCEL')
# Beverly_Grove_df.head()

In [55]:
Beverly_Grove_Reduced_df = Beverly_Grove_df[['MAIL DIFFERENT FROM SITE?',
                                             'ASSDTOTAL',
                                             'ASSDLAND',
                                             'ASSDSTCT',
                                             'BLDGCOND',
                                             'TAXAMT',
                                             'PRICE',
                                             'YEARBLT',
                                             'EFFYRBLT',
                                             'LOANAMOUT1',
                                             'Did it sell?']]

In [56]:
#Beverly_Grove_Reduced_df.set_index(['PARCEL'], inplace=True)

In [57]:
Beverly_Grove_Reduced_df.head(3)

Unnamed: 0,MAIL DIFFERENT FROM SITE?,ASSDTOTAL,ASSDLAND,ASSDSTCT,BLDGCOND,TAXAMT,PRICE,YEARBLT,EFFYRBLT,LOANAMOUT1,Did it sell?
0,0,1157385,925908,231477,1,14144,1140000,1905,1938,775200,0
1,1,57047,40994,16053,0,858,0,1906,1916,0,0
2,0,651485,521193,130292,0,8044,0,1909,1937,0,0


In [58]:
len(Beverly_Grove_Reduced_df)

4705

In [59]:
# create new X and y values based on the Beverly_Grove_Reduced dataframe

y_2 = Beverly_Grove_Reduced_df['Did it sell?']
X_2 = Beverly_Grove_Reduced_df[['MAIL DIFFERENT FROM SITE?',
                              'ASSDTOTAL',
                              'ASSDLAND',
                              'ASSDSTCT',
                              'BLDGCOND',
                              'TAXAMT',
                              'PRICE',
                              'YEARBLT',
                              'EFFYRBLT',
                              'LOANAMOUT1']]

In [60]:
# create new training and testing data

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, random_state=78, stratify=y)

In [61]:
# Scale our new training and test data

# Create the StandardScaler instnce
scaler_2 = StandardScaler()

In [62]:
# Fit our scaler, named 'scaler_2' to our data, which produced a new 
# Standard Scaler object, which we call X_scaler_2
X_scaler_2 = scaler_2.fit(X_train_2)

In [63]:
# Scale the data
X_train_2_scaled = X_scaler_2.transform(X_train_2)
X_test_2_scaled = X_scaler_2.transform(X_test_2)

In [64]:
# Instantiate a linear SVM model
from sklearn.svm import SVC
model_5 = SVC(kernel='linear')

In [65]:
# Fit/train our model using the scaled training data
model_5.fit(X_train_2_scaled, y_train_2)

SVC(kernel='linear')

In [66]:
# Make predictions using the test data
y_pred_2 = model_5.predict(X_test_2_scaled)
results_SVM = pd.DataFrame({
    "Prediction": y_pred_2,
    "Actual": y_test_2
}).reset_index(drop=True)

In [67]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_2, y_pred_2)

0.9311809685641461

In [68]:
balanced_accuracy_score(y_test_2, y_pred_2)

0.5

In [69]:
f1_score(y_test_2, y_pred_2, average="weighted")

0.8979976608414467

In [70]:
cm = confusion_matrix(y_test_2, y_pred_2)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

In [71]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print("Classification Report")
print(classification_report(y_test_2, y_pred_2))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1096,0
Actual 1,81,0


Classification Report
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1096
           1       0.00      0.00      0.00        81

    accuracy                           0.93      1177
   macro avg       0.47      0.50      0.48      1177
weighted avg       0.87      0.93      0.90      1177



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Attempt 6: Decision Tree

In [72]:
from sklearn import tree

# Instantiate our learning model
model_6 = tree.DecisionTreeClassifier()

In [73]:
# Fit our model to our trained, scaled data
model_6 = model_6.fit(X_train_scaled, y_train)

In [74]:
# Make predictions using the testing data
predictions = model_6.predict(X_test_scaled)

# check your predictions
predictions[:20]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [75]:
# Evaluate our model
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8572642310960068

In [76]:
# balanced accuracy score
balanced_accuracy_score(y_test, predictions)

0.4888933946111562

In [77]:
# generate the F1 score
f1_score(y_test, predictions, average="weighted")

0.8631545579770001

In [78]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1004,92
Actual 1,76,5


Accuracy Score : 0.8572642310960068
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      1096
           1       0.05      0.06      0.06        81

    accuracy                           0.86      1177
   macro avg       0.49      0.49      0.49      1177
weighted avg       0.87      0.86      0.86      1177



## Attempt 7: Random Forest

In [79]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)

# Remember, the accuracy_score function requires 2 arguments, the test and the pred
print(f" Random forest predictive accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Random forest predictive accuracy: 0.931


In [80]:
acc_score = accuracy_score(y_test, y_pred)

In [81]:
balanced_accuracy_score(y_test, y_pred)

0.5

In [82]:
# generate the F1 score
f1_score(y_test, predictions, average="weighted")

0.8631545579770001

In [83]:
cm_RandomForest = confusion_matrix(y_test, y_pred)
cm_RandomForest_df = pd.DataFrame(
    cm_RandomForest, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

In [84]:
# Displaying results
print("Confusion Matrix")
display(cm_RandomForest_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1096,0
Actual 1,81,0


Accuracy Score : 0.9311809685641461
Classification Report
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1096
           1       0.00      0.00      0.00        81

    accuracy                           0.93      1177
   macro avg       0.47      0.50      0.48      1177
weighted avg       0.87      0.93      0.90      1177



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Attempt 8: Gradient Boosting

In [85]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error

In [86]:
# Choose the best learning rate

learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.932
Accuracy score (validation): 0.931

Learning rate:  0.1
Accuracy score (training): 0.932
Accuracy score (validation): 0.931

Learning rate:  0.25
Accuracy score (training): 0.934
Accuracy score (validation): 0.929

Learning rate:  0.5
Accuracy score (training): 0.934
Accuracy score (validation): 0.927

Learning rate:  0.75
Accuracy score (training): 0.935
Accuracy score (validation): 0.919

Learning rate:  1
Accuracy score (training): 0.938
Accuracy score (validation): 0.919



In [87]:
# Looks like a learning rate of 0.25 has the best results. 

classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.5,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).sample(20)

Unnamed: 0_level_0,Prediction,Actual
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1
5510014010,0,0
5085033013,0,1
5510003010,0,0
5511041013,0,0
5526019013,0,0
5510007013,0,0
5526028001,0,0
5525027015,0,0
5514003045,0,0
5511042016,0,0


In [88]:
# Evaluate the model

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.9269328802039083


In [89]:
balanced_accuracy_score(y_test, predictions)

0.4977189781021898

In [90]:
# generate the F1 score
f1_score(y_test, predictions, average="weighted")

0.8958716373046591

In [91]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1091,5
Actual 1,81,0


# Attempt 9: Adaptive Boosting

In [92]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [93]:
# lets use the training data from the reduced dataframe (the one with fewer columns)

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2), n_estimators=200,
    algorithm='SAMME.R', learning_rate=0.5)

# fit our model to our scaled data
ada_clf.fit(X_train_2_scaled, y_train_2)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.5, n_estimators=200)

In [94]:
# Evaluate the model
y_pred_3 = ada_clf.predict(X_test_2_scaled)

In [95]:
# Remember, the accuracy_score function requires 2 arguments, the test and the pred
print(f" Adaptive Boosting predictive accuracy: {accuracy_score(y_test_2, y_pred_3):.3f}")

 Adaptive Boosting predictive accuracy: 0.912


In [96]:
balanced_accuracy_score(y_test_2, y_pred_3)

0.5013967739028566

In [97]:
f1_score(y_test_2, y_pred_3, average="weighted")

0.8910693347762086

In [98]:
cm_AdaBoost = confusion_matrix(y_test_2, y_pred_3)
cm_AdaBoost_df = pd.DataFrame(
    cm_AdaBoost, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

In [99]:
# Displaying results
print("Confusion Matrix")
display(cm_AdaBoost_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test_2, y_pred_3))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1072,24
Actual 1,79,2


Accuracy Score : 0.9269328802039083
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1096
           1       0.08      0.02      0.04        81

    accuracy                           0.91      1177
   macro avg       0.50      0.50      0.50      1177
weighted avg       0.87      0.91      0.89      1177



# Attempt 10: Use K-Folds & Then Logistic Regression

In [100]:
# partition our data into training and holdout sets
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import max_error

#X_train_KFolds, X_holdout_KFolds, y_train_KFolds, y_holdout_KFolds = train_test_split(X, y, stratify=y, random_state=78, shuffle=True)

In [101]:
# from sklearn.model_selection import KFold

# for train_index, test_index in KFold(n_splits=5).split(X_train_KFolds):
#     X_data = X_train_KFolds.iloc[train_index]
#     y_data = y_train_KFolds.iloc[train_index]

In [103]:
max_err = 0

for train_index, test_index in KFold(n_splits = 5).split(X):

    model = RandomForestClassifier(
        # our hyperparameters, except n_jobs
        # n_jobs = -1,
        n_estimators = 300,
        max_depth = 5,
        max_features = 'sqrt'
    )
    
    model.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred_KFolds = model.predict(X.iloc[test_index])
    max_err += max_error(y_pred_KFolds, y.iloc[test_index])
    
max_err / 5

1.0

In [106]:
model.score(X, y)

0.9315621679064825

In [107]:
len(test_index)

941

In [109]:
accuracy_score(y.iloc[test_index], y_pred_KFolds)

0.9298618490967057

In [110]:
from sklearn.metrics import classification_report
print(classification_report(y.iloc[test_index], y_pred_KFolds))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       875
           1       0.00      0.00      0.00        66

    accuracy                           0.93       941
   macro avg       0.46      0.50      0.48       941
weighted avg       0.86      0.93      0.90       941



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [105]:
#print(f" KFolds + Random Forest accuracy: {accuracy_score(y_holdout_KFolds, y_pred_KFolds):.3f}")

# Attempt 11: Use a Logistic Regression, but use unscaled data

In [106]:
# Split our data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

In [107]:
# Create the learning model

classifier = LogisticRegression(solver='lbfgs',
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

In [108]:
# fit our learning model to our scaled training data

classifier.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [109]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
17,0,0,5085011011
484,1,0,5510003018
1103,1,0,5089017009
1116,0,0,5510007015
514,1,0,5085007013
456,0,1,5087024008
1106,0,0,5514005032
660,1,0,5089025011
432,0,0,5085031038
185,1,1,5508005013


In [110]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.5845369583687341


In [111]:
balanced_accuracy_score(y_test, y_pred)

0.5082342074434532

In [112]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.686189422125401

In [113]:
# See our results in a confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[654, 442],
       [ 47,  34]])

In [114]:
# evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.60      0.73      1096
           1       0.07      0.42      0.12        81

    accuracy                           0.58      1177
   macro avg       0.50      0.51      0.42      1177
weighted avg       0.87      0.58      0.69      1177



# Attempt 12: PCA + Logistic Regression

In [115]:
# bring in our dataframe

Beverly_Grove_df = pd.read_csv('Beverly_Grove_Cleaned.csv', index_col='PARCEL')
Beverly_Grove_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0
5510005006,1,0,6727 Drexel Ave,1,0,0,85697,74539,11158,0,...,0,0,0,0,0,0,0,0,0,0
5089024002,0,0,942 S Cochran Ave,0,0,0,62487,42013,20474,0,...,0,0,0,0,0,0,0,0,0,0


In [116]:
# drop any nulls

Beverly_Grove_df = Beverly_Grove_df.dropna()

In [117]:
Beverly_Grove_df['TAXAMT'] = Beverly_Grove_df['TAXAMT'].astype(int)
Beverly_Grove_df['EFFYRBLT'] = Beverly_Grove_df['EFFYRBLT'].astype(int)

In [118]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Beverly_Grove_df = Beverly_Grove_df[~Beverly_Grove_df['SITEADDRESS'].str.contains('#')]
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0


In [119]:
# Now drop the SiteAddress column
Beverly_Grove_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [120]:
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,0,0,1,1157385,925908,231477,0,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1,0,0,57047,40994,16053,0,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,0,0,1,651485,521193,130292,0,1,...,0,0,0,0,0,0,0,0,0,0


In [121]:
# Step 1: Create our X and y

y = Beverly_Grove_df['Did it sell?']
X = Beverly_Grove_df.drop(['Did it sell?'], 1)

In [122]:
APN = Beverly_Grove_df.index.values
APN

array([5085010022, 5085033008, 5085032022, ..., 5510018012, 5510017023,
       5514001029])

In [123]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [124]:
X_pca

array([[ 1037442.40088322,   222949.0619781 ,    99008.44211246],
       [ -932464.7141857 ,   251848.62995472,   -90439.60157539],
       [ -393042.03210882,  -290764.40544306,   -27460.470998  ],
       ...,
       [ -314557.49024403,  -369458.72763661,   -27020.9155848 ],
       [ 2038277.22062523,   419857.55993186, -1288530.60059949],
       [ -962055.78924987,   281605.81570128,   -93574.2647082 ]])

In [125]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [126]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [127]:
X_train_scaled_APN = []

In [128]:
# for index, parcel in enumerate(X_train.index.values):
#      X_train_scaled_APN.append(np.insert(X_train_scaled[index], 0, [parcel]))

In [129]:
# Step 5:Create the learning model

log_classifier = LogisticRegression(solver='lbfgs',
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

# Train the model
log_classifier.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test_scaled)

print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.613


In [130]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
527,0,0,5085008021
105,0,0,5512020025
509,0,0,5085025017
29,0,0,5086017013
823,0,0,5514003014
824,1,0,5526037020
130,0,0,5087019007
667,1,0,4335002005
865,1,0,5526017005
698,1,0,5512012007


In [131]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[693, 403],
       [ 53,  28]])

In [132]:
balanced_accuracy_score(y_test, y_pred)

0.48898914120933584

In [133]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.7081876800594455

In [134]:
# evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.63      0.75      1096
           1       0.06      0.35      0.11        81

    accuracy                           0.61      1177
   macro avg       0.50      0.49      0.43      1177
weighted avg       0.87      0.61      0.71      1177



# Attempt 13: PCA + Oversampling (SMOTE) + LogisticRegression

In [135]:
# bring in our dataframe

Beverly_Grove_df = pd.read_csv('Beverly_Grove_Cleaned.csv', index_col='PARCEL')
Beverly_Grove_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0
5510005006,1,0,6727 Drexel Ave,1,0,0,85697,74539,11158,0,...,0,0,0,0,0,0,0,0,0,0
5089024002,0,0,942 S Cochran Ave,0,0,0,62487,42013,20474,0,...,0,0,0,0,0,0,0,0,0,0


In [136]:
# drop any nulls

Beverly_Grove_df = Beverly_Grove_df.dropna()

In [137]:
# Change the datatype of these two columns into ints

Beverly_Grove_df['TAXAMT'] = Beverly_Grove_df['TAXAMT'].astype(int)
Beverly_Grove_df['EFFYRBLT'] = Beverly_Grove_df['EFFYRBLT'].astype(int)

In [138]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Beverly_Grove_df = Beverly_Grove_df[~Beverly_Grove_df['SITEADDRESS'].str.contains('#')]
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0


In [139]:
# Now drop the SiteAddress column
Beverly_Grove_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [140]:
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,0,0,1,1157385,925908,231477,0,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1,0,0,57047,40994,16053,0,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,0,0,1,651485,521193,130292,0,1,...,0,0,0,0,0,0,0,0,0,0


In [141]:
# Step 1: Create our X and y

y = Beverly_Grove_df['Did it sell?']
X = Beverly_Grove_df.drop(['Did it sell?'], 1)

In [142]:
APN = Beverly_Grove_df.index.values
APN

array([5085010022, 5085033008, 5085032022, ..., 5510018012, 5510017023,
       5514001029])

In [143]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [144]:
X_pca

array([[ 1037442.40088322,   222949.06197809,    99008.44211246],
       [ -932464.7141857 ,   251848.62995472,   -90439.60157538],
       [ -393042.03210882,  -290764.40544306,   -27460.47099801],
       ...,
       [ -314557.49024403,  -369458.72763661,   -27020.9155848 ],
       [ 2038277.22062523,   419857.55993186, -1288530.60059949],
       [ -962055.78924987,   281605.81570128,   -93574.2647082 ]])

In [145]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [146]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [147]:
X_train_scaled_APN = []

In [148]:
# Step 5: Resample our data using the SMOTE method

from imblearn.over_sampling import SMOTE
X_resampled_SMOTE, y_resampled_SMOTE = SMOTE(random_state=1, 
                               sampling_strategy='auto').fit_resample(
                               X_train_scaled, y_train)

In [149]:
Counter(y_resampled_SMOTE)

Counter({0: 3287, 1: 3287})

In [150]:
# Step 6: Instantiate our model and fit it to our data

model_7 = LogisticRegression(solver='lbfgs', random_state=78, max_iter=300, class_weight="balanced")
model_7.fit(X_resampled_SMOTE, y_resampled_SMOTE)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [151]:
# Step 7: Calculated the balanced accuracy score

y_pred = model_7.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.4914841849148418

In [152]:
print(f" PCA + Smote + Logistic Regression Was: {accuracy_score(y_test, y_pred):.3f}")

 PCA + Smote + Logistic Regression Was: 0.628


In [153]:
f1_score(y_test, y_pred, average="weighted")

0.719691707488016

In [154]:
confusion_matrix(y_test, y_pred)

array([[712, 384],
       [ 54,  27]])

In [155]:
# evaluate
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.65      0.76      1096
           1       0.07      0.33      0.11        81

    accuracy                           0.63      1177
   macro avg       0.50      0.49      0.44      1177
weighted avg       0.87      0.63      0.72      1177



# Attempt 14: PCA + Resampling (SMOTEEN) + LogisticRegression

In [156]:
# bring in our dataframe

Beverly_Grove_df = pd.read_csv('Beverly_Grove_Cleaned.csv', index_col='PARCEL')
Beverly_Grove_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0
5510005006,1,0,6727 Drexel Ave,1,0,0,85697,74539,11158,0,...,0,0,0,0,0,0,0,0,0,0
5089024002,0,0,942 S Cochran Ave,0,0,0,62487,42013,20474,0,...,0,0,0,0,0,0,0,0,0,0


In [157]:
# drop any nulls

Beverly_Grove_df = Beverly_Grove_df.dropna()

In [158]:
# Change the datatype of these two columns into ints

Beverly_Grove_df['TAXAMT'] = Beverly_Grove_df['TAXAMT'].astype(int)
Beverly_Grove_df['EFFYRBLT'] = Beverly_Grove_df['EFFYRBLT'].astype(int)

In [159]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Beverly_Grove_df = Beverly_Grove_df[~Beverly_Grove_df['SITEADDRESS'].str.contains('#')]
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0


In [160]:
# Now drop the SiteAddress column
Beverly_Grove_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [161]:
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,0,0,1,1157385,925908,231477,0,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1,0,0,57047,40994,16053,0,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,0,0,1,651485,521193,130292,0,1,...,0,0,0,0,0,0,0,0,0,0


In [162]:
# Step 1: Create our X and y

y = Beverly_Grove_df['Did it sell?']
X = Beverly_Grove_df.drop(['Did it sell?'], 1)

In [163]:
APN = Beverly_Grove_df.index.values
APN

array([5085010022, 5085033008, 5085032022, ..., 5510018012, 5510017023,
       5514001029])

In [164]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [165]:
X_pca

array([[ 1037442.40088321,   222949.06197813,    99008.44211243],
       [ -932464.7141857 ,   251848.62995472,   -90439.60157539],
       [ -393042.03210882,  -290764.40544306,   -27460.470998  ],
       ...,
       [ -314557.49024403,  -369458.72763661,   -27020.9155848 ],
       [ 2038277.22062523,   419857.55993186, -1288530.60059949],
       [ -962055.78924987,   281605.81570128,   -93574.2647082 ]])

In [166]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [167]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [168]:
X_train_scaled_APN = []

In [169]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({0: 1542, 1: 1893})

In [170]:
model_8 = LogisticRegression(solver='lbfgs', random_state=78, max_iter=300, class_weight="balanced")
model_8.fit(X_resampled, y_resampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [171]:
# Calculated the balanced accuracy score
y_pred = model_8.predict(X_test_scaled)

In [172]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.4869221411192214

In [173]:
print(f" PCA + SMOTEEN + Logistic Regression Accuracy was: {accuracy_score(y_test, y_pred):.3f}")

 PCA + SMOTEEN + Logistic Regression Accuracy was: 0.619


In [174]:
f1_score(y_test, y_pred, average="weighted")

0.7133305329304763

In [175]:
# evaluate
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.64      0.76      1096
           1       0.06      0.33      0.11        81

    accuracy                           0.62      1177
   macro avg       0.50      0.49      0.43      1177
weighted avg       0.87      0.62      0.71      1177



# Attempt 15: PCA + Undersampling (Centroid Clustering) + Logistic Regression

In [176]:
# bring in our dataframe

Beverly_Grove_df = pd.read_csv('Beverly_Grove_Cleaned.csv', index_col='PARCEL')
Beverly_Grove_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0
5510005006,1,0,6727 Drexel Ave,1,0,0,85697,74539,11158,0,...,0,0,0,0,0,0,0,0,0,0
5089024002,0,0,942 S Cochran Ave,0,0,0,62487,42013,20474,0,...,0,0,0,0,0,0,0,0,0,0


In [177]:
# drop any nulls

Beverly_Grove_df = Beverly_Grove_df.dropna()

In [178]:
# Change the datatype of these two columns into ints

Beverly_Grove_df['TAXAMT'] = Beverly_Grove_df['TAXAMT'].astype(int)
Beverly_Grove_df['EFFYRBLT'] = Beverly_Grove_df['EFFYRBLT'].astype(int)

In [179]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Beverly_Grove_df = Beverly_Grove_df[~Beverly_Grove_df['SITEADDRESS'].str.contains('#')]
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0


In [180]:
# Now drop the SiteAddress column
Beverly_Grove_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [181]:
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,0,0,1,1157385,925908,231477,0,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1,0,0,57047,40994,16053,0,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,0,0,1,651485,521193,130292,0,1,...,0,0,0,0,0,0,0,0,0,0


In [182]:
# Step 1: Create our X and y

y = Beverly_Grove_df['Did it sell?']
X = Beverly_Grove_df.drop(['Did it sell?'], 1)

In [183]:
APN = Beverly_Grove_df.index.values
APN

array([5085010022, 5085033008, 5085032022, ..., 5510018012, 5510017023,
       5514001029])

In [184]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [185]:
X_pca

array([[ 1037442.40088322,   222949.06197811,    99008.44211245],
       [ -932464.7141857 ,   251848.62995472,   -90439.60157539],
       [ -393042.03210882,  -290764.40544305,   -27460.470998  ],
       ...,
       [ -314557.49024403,  -369458.72763661,   -27020.9155848 ],
       [ 2038277.22062523,   419857.55993186, -1288530.60059949],
       [ -962055.78924987,   281605.81570128,   -93574.26470819]])

In [186]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [187]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [188]:
X_train_scaled_APN = []

In [189]:
# Fit the data using `ClusterCentroids` and check the count of each class
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_CC, y_CC = cc.fit_resample(X_train_scaled, y_train)
Counter(y_CC)

Counter({0: 241, 1: 241})

In [190]:
# Logistic regression using cluster centroid undersampled data
from sklearn.linear_model import LogisticRegression
model_9 = LogisticRegression(solver='lbfgs', random_state=78)
model_9.fit(X_CC, y_CC)

LogisticRegression(random_state=78)

In [191]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model_9.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

array([[229, 867],
       [ 18,  63]])

In [192]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.49335969180859696

In [193]:
print(f" PCA + Centroid Clustering + Logistic Regression was: {accuracy_score(y_test, y_pred):.3f}")

 PCA + Centroid Clustering + Logistic Regression was: 0.248


In [194]:
f1_score(y_test, y_pred, average="weighted")

0.3261352171412433

In [195]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.21      0.78      0.34      0.40      0.15      1096
          1       0.07      0.78      0.21      0.12      0.40      0.17        81

avg / total       0.87      0.25      0.74      0.33      0.40      0.15      1177



In [196]:
# Note how using the Centroid Clusters creates a model that predicts a much higher number of sales. 
# When we decrease the number of non-sale properties, the model curiously predicts more sale properties. 
# What does this imply? Is it that the sale properties are indistinct from the non-sales, and evening out the classes, 
# creates an increased likelihood predicting sale? 

## Attempt 16: PCA + Oversampling (Random Oversampling) + Logistic Regression

In [159]:
# bring in our dataframe

Beverly_Grove_df = pd.read_csv('Beverly_Grove_Cleaned.csv', index_col='PARCEL')
Beverly_Grove_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0
5510005006,1,0,6727 Drexel Ave,1,0,0,85697,74539,11158,0,...,0,0,0,0,0,0,0,0,0,0
5089024002,0,0,942 S Cochran Ave,0,0,0,62487,42013,20474,0,...,0,0,0,0,0,0,0,0,0,0


In [160]:
# drop any nulls

Beverly_Grove_df = Beverly_Grove_df.dropna()

In [161]:
# Change the datatype of these two columns into ints

Beverly_Grove_df['TAXAMT'] = Beverly_Grove_df['TAXAMT'].astype(int)
Beverly_Grove_df['EFFYRBLT'] = Beverly_Grove_df['EFFYRBLT'].astype(int)

In [162]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Beverly_Grove_df = Beverly_Grove_df[~Beverly_Grove_df['SITEADDRESS'].str.contains('#')]
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
# Now drop the SiteAddress column
Beverly_Grove_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [164]:
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,0,0,1,1157385,925908,231477,0,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1,0,0,57047,40994,16053,0,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,0,0,1,651485,521193,130292,0,1,...,0,0,0,0,0,0,0,0,0,0


In [165]:
# Step 1: Create our X and y

y = Beverly_Grove_df['Did it sell?']
X = Beverly_Grove_df.drop(['Did it sell?'], 1)

In [166]:
APN = Beverly_Grove_df.index.values
APN

array([5085010022, 5085033008, 5085032022, ..., 5510018012, 5510017023,
       5514001029])

In [167]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [168]:
X_pca

array([[ 1037442.40088322,   222949.06197811,    99008.44211245],
       [ -932464.7141857 ,   251848.62995472,   -90439.60157538],
       [ -393042.03210882,  -290764.40544306,   -27460.47099801],
       ...,
       [ -314557.49024403,  -369458.72763661,   -27020.9155848 ],
       [ 2038277.22062523,   419857.55993186, -1288530.60059949],
       [ -962055.78924987,   281605.81570128,   -93574.2647082 ]])

In [169]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [170]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [171]:
X_train_scaled_APN = []

In [172]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({0: 3287, 1: 3287})

In [173]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model_10 = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model_10.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [174]:
# Evaluate the model
y_pred = model_10.predict(X_test_scaled)

In [175]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.621


In [176]:
# just for shits n' gigs, lets find the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.49355118500495626

In [177]:
f1_score(y_test, y_pred, average="weighted")

0.7146103073370367

In [178]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.64      0.35      0.76      0.47      0.23      1096
          1       0.07      0.35      0.64      0.11      0.47      0.22        81

avg / total       0.87      0.62      0.37      0.71      0.47      0.23      1177



In [179]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
1038,1,0,4335003015
603,0,0,5527025002
149,0,0,5088017010
218,0,1,5085011014
77,0,0,5085022022
537,0,0,5528004080
832,1,0,5085019022
491,0,0,5510014025
878,0,0,5089020019
154,0,0,5086011018


In [180]:
Beverly_Grove_Results_SFRs_df = Beverly_Grove_df.merge(results, how="left", right_on="PARCEL", left_on="PARCEL")
Beverly_Grove_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,0,0,1,1157385,925908,231477,0,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1,0,0,57047,40994,16053,0,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,0,0,1,651485,521193,130292,0,1,...,0,0,0,0,0,0,0,0,0,0
5510005006,1,0,1,0,0,85697,74539,11158,0,0,...,0,0,0,0,0,0,0,0,0,0
5089024002,0,0,0,0,0,62487,42013,20474,0,1,...,0,0,0,0,0,0,0,0,0,0


In [181]:
Beverly_Grove_Results_SFRs_df

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY,Prediction,Actual
0,5085010022,0,0,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,,
1,5085033008,0,0,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0.0,0.0
2,5085032022,0,0,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,,
3,5510005006,1,0,1,0,0,85697,74539,11158,0,...,0,0,0,0,0,0,0,0,0.0,0.0
4,5089024002,0,0,0,0,0,62487,42013,20474,0,...,0,0,0,0,0,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4700,4335003015,0,1,1,0,0,1177690,1177690,0,0,...,0,0,0,0,0,0,0,0,1.0,0.0
4701,4335010018,0,1,1,0,1,1346195,1346195,0,0,...,0,0,0,0,0,0,0,0,,
4702,5510018012,1,0,0,0,0,762455,509154,253301,0,...,0,0,0,0,0,0,0,0,,
4703,5510017023,0,1,1,1,1,1553301,1087311,465990,0,...,0,0,0,0,0,0,0,0,,


In [183]:
Beverly_Grove_Results_SFRs_df.to_csv('Beverly_Grove_Results_SFRs.csv')

In [184]:
# Now do the Beverly Grove Condos

In [234]:
# bring in our dataframe

Beverly_Grove_df = pd.read_csv('Beverly_Grove_Cleaned.csv', index_col='PARCEL')
Beverly_Grove_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5085010022,0,0,1030 S Dunsmuir Ave,0,0,1,1157385,925908,231477,0,...,0,0,0,0,0,0,0,0,0,0
5085033008,0,0,1364 S Ridgeley Dr,1,0,0,57047,40994,16053,0,...,0,0,0,0,0,0,0,0,0,0
5085032022,0,0,1322 Hauser Blvd,0,0,1,651485,521193,130292,0,...,0,0,0,0,0,0,0,0,0,0
5510005006,1,0,6727 Drexel Ave,1,0,0,85697,74539,11158,0,...,0,0,0,0,0,0,0,0,0,0
5089024002,0,0,942 S Cochran Ave,0,0,0,62487,42013,20474,0,...,0,0,0,0,0,0,0,0,0,0


In [235]:
# drop any nulls

Beverly_Grove_df = Beverly_Grove_df.dropna()

In [236]:
# Change the datatype of these two columns into ints

Beverly_Grove_df['TAXAMT'] = Beverly_Grove_df['TAXAMT'].astype(int)
Beverly_Grove_df['EFFYRBLT'] = Beverly_Grove_df['EFFYRBLT'].astype(int)

In [237]:
# Make sure our dataframe is only Condos so exclude every property that doesn't have a '#' in its site address

Beverly_Grove_df = Beverly_Grove_df[Beverly_Grove_df['SITEADDRESS'].str.contains('#')]
Beverly_Grove_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PARTNERSHIP,PERSONAL REP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5087012023,0,0,1040 S La Jolla Ave #VE,0,0,0,289000,116000,173000,0,...,0,0,0,0,0,0,0,1,0,0
4332020041,0,0,1045 S Bedford St #9,1,0,0,151323,41266,110057,0,...,0,0,0,0,0,0,0,0,0,0
4332020051,0,0,1045 S Bedford St #1,0,0,0,376863,223789,153074,0,...,0,0,0,0,0,0,0,0,0,0


In [238]:
# Now drop the SiteAddress column
Beverly_Grove_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [239]:
# Step 1: Create our X and y

y = Beverly_Grove_df['Did it sell?']
X = Beverly_Grove_df.drop(['Did it sell?'], 1)

In [240]:
APN = Beverly_Grove_df.index.values
APN

array([5087012023, 4332020041, 4332020051, ..., 5089003038, 5089003039,
       5089003040])

In [241]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [242]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [243]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [244]:
X_train_scaled_APN = []

In [245]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({0: 1380, 1: 1380})

In [246]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [247]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)

In [248]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.566


In [249]:
# just for shits n' gigs, lets find the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.48128019323671495

In [250]:
f1_score(y_test, y_pred, average="weighted")

0.6593455279133104

In [251]:
# See our results in a confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[269, 191],
       [ 28,  17]])

In [252]:
# evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.58      0.71       460
           1       0.08      0.38      0.13        45

    accuracy                           0.57       505
   macro avg       0.49      0.48      0.42       505
weighted avg       0.83      0.57      0.66       505

