In [1]:
# import our dependencies

%matplotlib inline
from collections import Counter
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import numpy as np
import os

## PCA + Oversampling (Random Oversampling) + Logistic Regression (SFRs)

In [2]:
file = 'Hollywood_Hills_East_Cleaned.csv'

location = '/Users/Admin/Desktop/GW_Bootcamp/Analysis_Projects/Final_Project_Team_1/'

In [3]:
path = os.path.join(location, file)

In [4]:
# bring in our dataframe

Hollywood_Hills_East_df = pd.read_csv(path, index_col='PARCEL')
Hollywood_Hills_East_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,INTERSPOUSAL,JOINT TENANT,Other,PARTNERSHIP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5586017003,1,0,2251 Hollyridge Dr,0,0,1,1214294,871974,342320,0,...,0,0,0,0,0,0,0,0,0,0
5579035002,0,0,3209 Tareco Dr,0,0,1,641575,485483,156092,0,...,0,0,0,0,0,0,0,1,0,0
5585001002,1,0,2780 Creston Dr,0,0,1,2867782,1837599,1030183,0,...,0,0,0,0,0,0,0,0,0,0
5577038025,0,0,6352 Innsdale Dr,0,0,0,1204724,890187,314537,0,...,0,0,0,0,0,0,0,0,0,0
5577036009,0,0,6324 Tahoe Dr,0,0,1,1125742,909097,216645,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# drop any nulls

Hollywood_Hills_East_df = Hollywood_Hills_East_df.dropna()

In [5]:
len(Hollywood_Hills_East_df)

4017

In [6]:
# Change the datatype of these two columns into ints

Hollywood_Hills_East_df['TAXAMT'] = Hollywood_Hills_East_df['TAXAMT'].astype(int)
Hollywood_Hills_East_df['EFFYRBLT'] = Hollywood_Hills_East_df['EFFYRBLT'].astype(int)

In [7]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Hollywood_Hills_East_df = Hollywood_Hills_East_df[~Hollywood_Hills_East_df['SITEADDRESS'].str.contains('#')]
Hollywood_Hills_East_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,INTERSPOUSAL,JOINT TENANT,Other,PARTNERSHIP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5586017003,1,0,2251 Hollyridge Dr,0,0,1,1214294,871974,342320,0,...,0,0,0,0,0,0,0,0,0,0
5579035002,0,0,3209 Tareco Dr,0,0,1,641575,485483,156092,0,...,0,0,0,0,0,0,0,1,0,0
5585001002,1,0,2780 Creston Dr,0,0,1,2867782,1837599,1030183,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
len(Hollywood_Hills_East_df)

3500

In [9]:
# Now drop the SiteAddress column
Hollywood_Hills_East_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [10]:
Hollywood_Hills_East_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,INTERSPOUSAL,JOINT TENANT,Other,PARTNERSHIP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5586017003,1,0,0,0,1,1214294,871974,342320,0,1,...,0,0,0,0,0,0,0,0,0,0
5579035002,0,0,0,0,1,641575,485483,156092,0,1,...,0,0,0,0,0,0,0,1,0,0
5585001002,1,0,0,0,1,2867782,1837599,1030183,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Step 1: Create our X and y

y = Hollywood_Hills_East_df['Did it sell?']
#X = Hollywood_Hills_East_df(['Did it sell?'], 1)
X = Hollywood_Hills_East_df[['Owned by Trust?', 
                            'Owned by Business?', 
                            'MAIL DIFFERENT FROM SITE?',
                            'MAIL OUTSIDE CA?', 
                            'TITLECO1', 
                            'ASSDTOTAL', 
                            'ASSDLAND', 
                            'ASSDSTCT',
                            'ASSDOTHR', 
                            'EXEMPTCD', 
                            'EXMPTAMT', 
                            'PCNTIMPD', 
                            'TAXAMT',
                            'DOCDATE_YEAR', 
                            'MULTIPARCEL', 
                            'PRICE', 
                            'LENDER1', 
                            'LOANAMOUT1',
                            'LOANTYPE1', 
                            'YEARBLT', 
                            'EFFYRBLT', 
                            'LOTSQFT', 
                            'TOTALSF', 
                            'GRGTYPE',
                            'BLDGMAT', 
                            'TOTUNITS', 
                            'QUALCLAS', 
                            'BLDGCOND', 
                            'NOSTORY', 
                            'ROOFMAT',
                            'FOUNDATN', 
                            'BEDROOMS', 
                            'BATHROOMS', 
                            'FAMILYRM', 
                            'DININGRM', 
                            'POOL',
                            'PATIO', 
                            'FIREPLCE', 
                            'AIRMTHOD', 
                            'HEATMTHD', 
                            'VIEW',
                            'GRANT DEED', 
                            'INTERSPOUSAL', 
                            'JOINT TENANT', 
                            'Other', 
                            'PARTNERSHIP',
                            'QUIT CLAIM', 
                            'RE-RECORD', 
                            'TAX DEED', 
                            'TRUST TRANSFER', 
                            'TRUSTEES',
                            'WARRANTY']]

In [25]:
APN = Hollywood_Hills_East_df.index.values
APN

array([5586017003, 5579035002, 5585001002, ..., 2428006041, 2427008043,
       5585017033])

In [26]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [27]:
X_pca

array([[  -22213.12194681,  -727047.17177394,    60768.16195907],
       [ -549842.57195944,  -243774.2651712 ,    71050.5433429 ],
       [ 1802811.29776476, -1712399.79249876,  -946820.32537219],
       ...,
       [ -499566.82568289,  -286604.0866326 ,    71826.58812759],
       [  141761.63488336,  -872511.49716167,    60146.76081435],
       [ -280920.08346389,  -484502.95902273,    68847.94810218]])

In [28]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [29]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [30]:
X_train_scaled_APN = []

In [31]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({0: 2322, 1: 2322})

In [32]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [33]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)

In [34]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.632


In [35]:
# just for shits n' gigs, lets find the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5466331516872618

In [36]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.6967851474335639

In [37]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.66      0.44      0.76      0.54      0.29       774
          1       0.14      0.44      0.66      0.21      0.54      0.28       101

avg / total       0.81      0.63      0.46      0.70      0.54      0.29       875



In [38]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
92,0,0,5586014077
17,0,1,5585023018
542,0,0,2427001005
609,0,1,5582011005
205,0,0,5586021006
456,1,0,5585016030
498,0,0,5583002012
361,1,0,5585008022
76,1,0,5572036015
172,1,0,5583017030


In [39]:
Hollywood_Hills_East_Results_SFRs_df = Hollywood_Hills_East_df.merge(results, how="left", right_on="PARCEL", left_on="PARCEL")
Hollywood_Hills_East_Results_SFRs_df.head()

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,Other,PARTNERSHIP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY,Prediction,Actual
0,5586017003,1,0,0,0,1,1214294,871974,342320,0,...,0,0,0,0,0,0,0,0,0.0,0.0
1,5579035002,0,0,0,0,1,641575,485483,156092,0,...,0,0,0,0,0,1,0,0,0.0,0.0
2,5585001002,1,0,0,0,1,2867782,1837599,1030183,0,...,0,0,0,0,0,0,0,0,,
3,5577038025,0,0,0,0,0,1204724,890187,314537,0,...,0,0,0,0,0,0,0,0,1.0,0.0
4,5577036009,0,0,0,0,1,1125742,909097,216645,0,...,0,0,0,0,0,0,0,0,,


In [40]:
Hollywood_Hills_East_Results_SFRs_df

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,Other,PARTNERSHIP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY,Prediction,Actual
0,5586017003,1,0,0,0,1,1214294,871974,342320,0,...,0,0,0,0,0,0,0,0,0.0,0.0
1,5579035002,0,0,0,0,1,641575,485483,156092,0,...,0,0,0,0,0,1,0,0,0.0,0.0
2,5585001002,1,0,0,0,1,2867782,1837599,1030183,0,...,0,0,0,0,0,0,0,0,,
3,5577038025,0,0,0,0,0,1204724,890187,314537,0,...,0,0,0,0,0,0,0,0,1.0,0.0
4,5577036009,0,0,0,0,1,1125742,909097,216645,0,...,0,0,0,0,0,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,5586018040,0,0,0,0,1,522028,378896,143132,0,...,0,0,0,0,0,0,0,0,,
3496,5585017031,0,0,0,0,0,564590,311276,253314,0,...,0,0,0,0,0,0,0,0,,
3497,2428006041,1,0,1,1,0,686625,551951,134674,0,...,0,0,0,0,0,0,0,0,,
3498,2427008043,0,0,1,1,0,1378322,1035555,342767,0,...,0,0,0,0,0,0,0,0,,


In [41]:
Hollywood_Hills_East_Results_SFRs_df.to_csv('Hollywood_Hills_East_Results_SFRs.csv')

# PCA + Oversampling (Random Oversampling) + Logistic Regression (Condos)

In [42]:
# bring in our dataframe

Hollywood_Hills_East_df = pd.read_csv(path, index_col='PARCEL')
Hollywood_Hills_East_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,INTERSPOUSAL,JOINT TENANT,Other,PARTNERSHIP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5586017003,1,0,2251 Hollyridge Dr,0,0,1,1214294,871974,342320,0,...,0,0,0,0,0,0,0,0,0,0
5579035002,0,0,3209 Tareco Dr,0,0,1,641575,485483,156092,0,...,0,0,0,0,0,0,0,1,0,0
5585001002,1,0,2780 Creston Dr,0,0,1,2867782,1837599,1030183,0,...,0,0,0,0,0,0,0,0,0,0
5577038025,0,0,6352 Innsdale Dr,0,0,0,1204724,890187,314537,0,...,0,0,0,0,0,0,0,0,0,0
5577036009,0,0,6324 Tahoe Dr,0,0,1,1125742,909097,216645,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# drop any nulls

Hollywood_Hills_East_df = Hollywood_Hills_East_df.dropna()

In [44]:
len(Hollywood_Hills_East_df)

4017

In [45]:
# Change the datatype of these two columns into ints

Hollywood_Hills_East_df['TAXAMT'] = Hollywood_Hills_East_df['TAXAMT'].astype(int)
Hollywood_Hills_East_df['EFFYRBLT'] = Hollywood_Hills_East_df['EFFYRBLT'].astype(int)

In [46]:
# Make sure our dataframe is only Condos

Hollywood_Hills_East_df = Hollywood_Hills_East_df[Hollywood_Hills_East_df['SITEADDRESS'].str.contains('#')]
Hollywood_Hills_East_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,INTERSPOUSAL,JOINT TENANT,Other,PARTNERSHIP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5576001050,0,0,2260 N Cahuenga Blvd #308,0,0,1,233350,125784,107566,0,...,0,0,0,0,0,0,0,0,0,0
5579039019,0,0,3033 Hollycrest Dr #3,1,0,0,131234,52489,78745,0,...,0,0,1,0,0,0,0,0,0,0
5579039020,0,0,3033 Hollycrest Dr #4,1,0,0,131234,52489,78745,0,...,0,0,1,0,0,0,0,0,0,0


In [47]:
len(Hollywood_Hills_East_df)

517

In [48]:
# Now drop the SiteAddress column
Hollywood_Hills_East_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [49]:
Hollywood_Hills_East_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,INTERSPOUSAL,JOINT TENANT,Other,PARTNERSHIP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5576001050,0,0,0,0,1,233350,125784,107566,0,1,...,0,0,0,0,0,0,0,0,0,0
5579039019,0,0,1,0,0,131234,52489,78745,0,0,...,0,0,1,0,0,0,0,0,0,0
5579039020,0,0,1,0,0,131234,52489,78745,0,0,...,0,0,1,0,0,0,0,0,0,0


In [50]:
# Step 1: Create our X and y

y = Hollywood_Hills_East_df['Did it sell?']
X = Hollywood_Hills_East_df.drop(['Did it sell?'], 1)

In [51]:
APN = Hollywood_Hills_East_df.index.values
APN

array([5576001050, 5579039019, 5579039020, 5586011046, 5577010059,
       5577010018, 5577010104, 5577010031, 5582012017, 5577007150,
       5586014105, 5576001036, 5577007117, 5586009024, 5586014120,
       5576001063, 5586014075, 5576001057, 5586014053, 5586009021,
       5579039017, 5586008026, 5577010174, 5577010171, 5577010085,
       5586014052, 5577010030, 5577010161, 5577010007, 5577010143,
       5577010117, 5577010126, 5577010089, 5577010041, 5577010166,
       5577010162, 5577010111, 5577010082, 5586014033, 5585021027,
       5586008024, 5586011038, 5577010076, 5577010078, 5577010098,
       5577010006, 5577010013, 5577010052, 5577010081, 5577010107,
       5577010044, 5577010067, 5577010145, 5577010110, 5577010172,
       5577010004, 5577010025, 5577010034, 5577010086, 5577010105,
       5577010170, 5577010113, 5577010026, 5577010043, 5577010128,
       5577010002, 5577010119, 5577010093, 5577010056, 5577010066,
       5577010051, 5577010071, 5577010075, 5577010010, 5577010

In [52]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [53]:
X_pca

array([[-105126.80828907, -278840.11421518,  -55420.55483799],
       [-218662.05874191, -342901.96919614, -116744.18211355],
       [-218662.02653493, -342901.96716651, -116744.12760842],
       ...,
       [-283005.20107816, -366616.17004845, -170905.27991358],
       [-330165.30657467, -354191.87818628, -157078.74786146],
       [-224026.42799315, -307183.5119849 , -130474.4012219 ]])

In [54]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [55]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [56]:
X_train_scaled_APN = []

In [57]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({0: 315, 1: 315})

In [58]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [59]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)

In [60]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.646


In [61]:
# just for shits n' gigs, lets find the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.6057389937106918

In [62]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.6825422804146206

In [63]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.87      0.67      0.54      0.76      0.60      0.37       106
          1       0.27      0.54      0.67      0.36      0.60      0.36        24

avg / total       0.76      0.65      0.57      0.68      0.60      0.37       130



In [64]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
119,0,0,5586011058
88,0,0,5577010109
129,0,0,5577007136
5,0,0,5577010130
23,0,0,5577010150
10,1,0,5577007161
85,0,0,5577007138
57,0,0,5586009019
68,1,0,5577010098
29,1,0,5577007145


In [65]:
Hollywood_Hills_East_Results_Condos_df = Hollywood_Hills_East_df.merge(results, how="left", right_on="PARCEL", left_on="PARCEL")
Hollywood_Hills_East_Results_Condos_df.head()

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,Other,PARTNERSHIP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY,Prediction,Actual
0,5576001050,0,0,0,0,1,233350,125784,107566,0,...,0,0,0,0,0,0,0,0,,
1,5579039019,0,0,1,0,0,131234,52489,78745,0,...,1,0,0,0,0,0,0,0,,
2,5579039020,0,0,1,0,0,131234,52489,78745,0,...,1,0,0,0,0,0,0,0,,
3,5586011046,0,0,1,0,1,576707,461578,115129,0,...,0,0,0,0,0,0,0,0,,
4,5577010059,0,0,1,0,1,355257,242680,112577,0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [66]:
Hollywood_Hills_East_Results_Condos_df

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,Other,PARTNERSHIP,QUIT CLAIM,RE-RECORD,TAX DEED,TRUST TRANSFER,TRUSTEES,WARRANTY,Prediction,Actual
0,5576001050,0,0,0,0,1,233350,125784,107566,0,...,0,0,0,0,0,0,0,0,,
1,5579039019,0,0,1,0,0,131234,52489,78745,0,...,1,0,0,0,0,0,0,0,,
2,5579039020,0,0,1,0,0,131234,52489,78745,0,...,1,0,0,0,0,0,0,0,,
3,5586011046,0,0,1,0,1,576707,461578,115129,0,...,0,0,0,0,0,0,0,0,,
4,5577010059,0,0,1,0,1,355257,242680,112577,0,...,0,0,0,0,0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,5580002025,0,0,1,1,0,49631,10909,38722,0,...,0,0,0,0,0,0,0,0,,
513,5586014088,0,0,1,0,0,63132,12621,50511,0,...,0,0,0,0,0,0,0,0,,
514,5586014128,0,0,0,0,1,61890,10996,50894,0,...,0,0,0,0,0,0,0,0,,
515,5576017088,1,0,0,0,0,47799,15272,32527,0,...,0,0,1,0,0,0,0,0,,


In [67]:
Hollywood_Hills_East_Results_Condos_df.to_csv('Hollywood_Hills_East_Results_Condos.csv')