In [1]:
# import our dependencies

%matplotlib inline
from collections import Counter
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import numpy as np

## PCA + Oversampling (Random Oversampling) + Logistic Regression (SFRs)

In [2]:
# bring in our dataframe

Hollywood_Hills_df = pd.read_csv('Hollywood_Hills_Cleaned.csv', index_col='PARCEL')
Hollywood_Hills_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,INTERSPOUSAL,JOINT TENANT,NAME CHANGE,Other,PERSONAL REP,QUIT CLAIM,RE-RECORD,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5567023035,0,0,1821 Oakden Dr,0,#REF!,1,692637,577201,115436,0,...,0,0,0,1,0,0,0,0,0,0
5569007070,0,0,2235 Hercules Dr,1,0,1,3694769,2126298,1568471,0,...,0,0,0,1,0,0,0,0,0,0
5563024001,0,0,8760 Lookout Mountain Ave,1,#REF!,1,110955,110955,0,0,...,0,0,0,1,0,0,0,0,0,0
5564008023,0,0,8324 Skyline Dr,0,#REF!,1,635221,206629,428592,0,...,0,0,0,1,0,0,0,0,0,0
5560010011,0,0,9191 Thrasher Ave,1,1,1,9028955,6526957,2501998,0,...,0,0,0,1,0,0,0,0,0,0


In [4]:
# drop any nulls

Hollywood_Hills_df = Hollywood_Hills_df.dropna()

In [5]:
len(Hollywood_Hills_df)

6066

In [6]:
# Change the datatype of these two columns into ints

Hollywood_Hills_df['TAXAMT'] = Hollywood_Hills_df['TAXAMT'].astype(int)
Hollywood_Hills_df['EFFYRBLT'] = Hollywood_Hills_df['EFFYRBLT'].astype(int)

In [7]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Hollywood_Hills_df = Hollywood_Hills_df[~Hollywood_Hills_df['SITEADDRESS'].str.contains('#')]
Hollywood_Hills_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,INTERSPOUSAL,JOINT TENANT,NAME CHANGE,Other,PERSONAL REP,QUIT CLAIM,RE-RECORD,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5567023035,0,0,1821 Oakden Dr,0,#REF!,1,692637,577201,115436,0,...,0,0,0,1,0,0,0,0,0,0
5569007070,0,0,2235 Hercules Dr,1,0,1,3694769,2126298,1568471,0,...,0,0,0,1,0,0,0,0,0,0
5563024001,0,0,8760 Lookout Mountain Ave,1,#REF!,1,110955,110955,0,0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
len(Hollywood_Hills_df)

6061

In [11]:
# Now drop the SiteAddress column
Hollywood_Hills_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [12]:
Hollywood_Hills_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,INTERSPOUSAL,JOINT TENANT,NAME CHANGE,Other,PERSONAL REP,QUIT CLAIM,RE-RECORD,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5567023035,0,0,0,#REF!,1,692637,577201,115436,0,7000,...,0,0,0,1,0,0,0,0,0,0
5569007070,0,0,1,0,1,3694769,2126298,1568471,0,0,...,0,0,0,1,0,0,0,0,0,0
5563024001,0,0,1,#REF!,1,110955,110955,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [14]:
APN = Hollywood_Hills_df.index.values
APN

array([5567023035, 5569007070, 5563024001, ..., 2380003028, 5575017033,
       5549025005])

In [19]:
Hollywood_Hills_df['MAIL OUTSIDE CA?'] = Hollywood_Hills_df['MAIL OUTSIDE CA?'].transform(lambda x: x != '0')

In [20]:
Hollywood_Hills_df['MAIL OUTSIDE CA?'].value_counts()

False    5108
True      953
Name: MAIL OUTSIDE CA?, dtype: int64

In [21]:
Hollywood_Hills_df['MAIL OUTSIDE CA?'] = Hollywood_Hills_df['MAIL OUTSIDE CA?'].astype(int)

In [23]:
Hollywood_Hills_df.dtypes

Owned by Trust?                int64
Owned by Business?             int64
MAIL DIFFERENT FROM SITE?      int64
MAIL OUTSIDE CA?               int64
TITLECO1                       int64
ASSDTOTAL                      int64
ASSDLAND                       int64
ASSDSTCT                       int64
ASSDOTHR                       int64
EXEMPTCD                       int64
EXMPTAMT                       int64
PCNTIMPD                       int64
TAXAMT                         int64
DOCDATE_YEAR                   int64
MULTIPARCEL                    int64
PRICE                          int64
LENDER1                        int64
LOANAMOUT1                     int64
LOANTYPE1                      int64
YEARBLT                        int64
EFFYRBLT                       int64
LOTSQFT                        int64
TOTALSF                        int64
GRGTYPE                        int64
BLDGMAT                        int64
TOTUNITS                       int64
QUALCLAS                       int64
B

In [26]:
Hollywood_Hills_df['NOSTORY'] = Hollywood_Hills_df['NOSTORY'].astype(int)

In [28]:
# Step 1: Create our X and y

y = Hollywood_Hills_df['Did it sell?']
X = Hollywood_Hills_df.drop(['Did it sell?'], 1)

In [29]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [30]:
X_pca

array([[-1116577.034765  ,    64670.11930474,   269005.16280441],
       [ 2965732.6196906 ,   667886.88110096,   729731.15449236],
       [-1038865.0201991 , -1074437.95891631,  -705481.80270765],
       ...,
       [ -260677.34597469,  -770960.83909605,   611868.33367603],
       [-1695034.38402514,  -230573.96160672,  -140829.58384411],
       [-1470840.29416024,  -328492.88108064,  -125709.87384689]])

In [31]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [32]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [33]:
X_train_scaled_APN = []

In [34]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({1: 3785, 0: 3785})

In [35]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [36]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)

In [37]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.663


In [38]:
# just for shits n' gigs, lets find the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5475467012740682

In [39]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.6953758926063851

In [40]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.85      0.72      0.37      0.78      0.52      0.28      1262
          1       0.21      0.37      0.72      0.27      0.52      0.26       254

avg / total       0.74      0.66      0.43      0.70      0.52      0.28      1516



In [41]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
923,0,1,5560012004
30,0,0,5565002001
1369,0,0,4392017020
1094,0,0,5564022005
273,0,0,5567016042
614,1,0,5556029005
1242,0,0,5563002025
626,0,0,2425012020
955,0,0,5556031010
508,0,0,5570008007


In [42]:
Hollywood_Hills_Results_SFRs_df = Hollywood_Hills_df.merge(results, how="left", right_on="PARCEL", left_on="PARCEL")
Hollywood_Hills_Results_SFRs_df.head()

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PERSONAL REP,QUIT CLAIM,RE-RECORD,TRUST TRANSFER,TRUSTEES,WARRANTY,Prediction,Actual
0,5567023035,0,0,0,1,1,692637,577201,115436,0,...,0,1,0,0,0,0,0,0,,
1,5569007070,0,0,1,0,1,3694769,2126298,1568471,0,...,0,1,0,0,0,0,0,0,1.0,0.0
2,5563024001,0,0,1,1,1,110955,110955,0,0,...,0,1,0,0,0,0,0,0,,
3,5564008023,0,0,0,1,1,635221,206629,428592,0,...,0,1,0,0,0,0,0,0,,
4,5560010011,0,0,1,1,1,9028955,6526957,2501998,0,...,0,1,0,0,0,0,0,0,,


In [43]:
Hollywood_Hills_Results_SFRs_df

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,NAME CHANGE,Other,PERSONAL REP,QUIT CLAIM,RE-RECORD,TRUST TRANSFER,TRUSTEES,WARRANTY,Prediction,Actual
0,5567023035,0,0,0,1,1,692637,577201,115436,0,...,0,1,0,0,0,0,0,0,,
1,5569007070,0,0,1,0,1,3694769,2126298,1568471,0,...,0,1,0,0,0,0,0,0,1.0,0.0
2,5563024001,0,0,1,1,1,110955,110955,0,0,...,0,1,0,0,0,0,0,0,,
3,5564008023,0,0,0,1,1,635221,206629,428592,0,...,0,1,0,0,0,0,0,0,,
4,5560010011,0,0,1,1,1,9028955,6526957,2501998,0,...,0,1,0,0,0,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6056,5565026020,0,0,0,0,1,449245,374376,74869,0,...,0,1,0,0,0,0,0,0,0.0,0.0
6057,2380003027,0,1,0,0,1,647208,517767,129441,0,...,0,1,0,0,0,0,0,0,,
6058,2380003028,0,1,0,0,1,811382,649106,162276,0,...,0,1,0,0,0,0,0,0,1.0,0.0
6059,5575017033,0,0,0,0,1,201165,116062,85103,0,...,0,0,0,1,0,0,0,0,,


In [44]:
Hollywood_Hills_Results_SFRs_df.to_csv('Hollywood_Hills_Results_SFRs.csv')

# PCA + Oversampling (Random Oversampling) + Logistic Regression (Condos)

In [45]:
# bring in our dataframe

Hollywood_Hills_df = pd.read_csv('Hollywood_Hills_Cleaned.csv', index_col='PARCEL')
Hollywood_Hills_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,INTERSPOUSAL,JOINT TENANT,NAME CHANGE,Other,PERSONAL REP,QUIT CLAIM,RE-RECORD,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5567023035,0,0,1821 Oakden Dr,0,#REF!,1,692637,577201,115436,0,...,0,0,0,1,0,0,0,0,0,0
5569007070,0,0,2235 Hercules Dr,1,0,1,3694769,2126298,1568471,0,...,0,0,0,1,0,0,0,0,0,0
5563024001,0,0,8760 Lookout Mountain Ave,1,#REF!,1,110955,110955,0,0,...,0,0,0,1,0,0,0,0,0,0
5564008023,0,0,8324 Skyline Dr,0,#REF!,1,635221,206629,428592,0,...,0,0,0,1,0,0,0,0,0,0
5560010011,0,0,9191 Thrasher Ave,1,1,1,9028955,6526957,2501998,0,...,0,0,0,1,0,0,0,0,0,0


In [46]:
# drop any nulls

Hollywood_Hills_df = Hollywood_Hills_df.dropna()

In [47]:
len(Hollywood_Hills_df)

6066

In [32]:
# Change the datatype of these two columns into ints

Hollywood_Hills_df['TAXAMT'] = Hollywood_Hills_df['TAXAMT'].astype(int)
Hollywood_Hills_df['EFFYRBLT'] = Hollywood_Hills_df['EFFYRBLT'].astype(int)

In [48]:
# Make sure our dataframe is only Condos

Hollywood_Hills_df = Hollywood_Hills_df[Hollywood_Hills_df['SITEADDRESS'].str.contains('#')]
Hollywood_Hills_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,INTERSPOUSAL,JOINT TENANT,NAME CHANGE,Other,PERSONAL REP,QUIT CLAIM,RE-RECORD,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5572031087,0,0,7218 Hillside Ave #207,1,0,1,517883,197793,320090,0,...,0,0,0,1,0,0,0,0,0,0
5569027024,1,0,2435 Achilles Dr #50,1,0,0,196347,58852,137495,0,...,0,0,0,1,0,0,0,0,0,0
5564018006,0,0,2355 Sunset Heights Dr #19THFLR,1,0,1,672896,538322,134574,0,...,0,0,0,1,0,0,0,0,0,0


In [49]:
len(Hollywood_Hills_df)

5

In [50]:
# Now drop the SiteAddress column
Hollywood_Hills_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [51]:
Hollywood_Hills_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,INTERSPOUSAL,JOINT TENANT,NAME CHANGE,Other,PERSONAL REP,QUIT CLAIM,RE-RECORD,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5572031087,0,0,1,0,1,517883,197793,320090,0,0,...,0,0,0,1,0,0,0,0,0,0
5569027024,1,0,1,0,0,196347,58852,137495,0,0,...,0,0,0,1,0,0,0,0,0,0
5564018006,0,0,1,0,1,672896,538322,134574,0,0,...,0,0,0,1,0,0,0,0,0,0


In [52]:
Hollywood_Hills_df['NOSTORY'] = Hollywood_Hills_df['NOSTORY'].astype(int)

In [53]:
Hollywood_Hills_df['MAIL OUTSIDE CA?'] = Hollywood_Hills_df['MAIL OUTSIDE CA?'].transform(lambda x: x != '0')

In [54]:
# Step 1: Create our X and y

y = Hollywood_Hills_df['Did it sell?']
X = Hollywood_Hills_df.drop(['Did it sell?'], 1)

In [56]:
APN = Hollywood_Hills_df.index.values
APN

array([5572031087, 5569027024, 5564018006, 5556031026, 5556005006])

In [57]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [58]:
X_pca

array([[-322583.49097111,  182460.73204971,  184574.5661939 ],
       [-751912.32789526, -287388.1430832 ,    7442.02570629],
       [-172387.22725217,  488863.2779614 , -111359.71472115],
       [1662371.450207  , -111883.86113427,    5980.89862915],
       [-415488.40408846, -272052.00579364,  -86637.7758082 ]])

In [61]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.