In [1]:
# import our dependencies

%matplotlib inline
from collections import Counter
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

## PCA + Oversampling (Random Oversampling) + Logistic Regression (SFRs)

In [2]:
# bring in our dataframe

Los_Feliz_df = pd.read_csv('Los_Feliz_Cleaned.csv', index_col='PARCEL')
Los_Feliz_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5590007001,0,0,4540 Ambrose Ave,0,0,1,764570,611657,152913,0,...,1,0,0,0,0,0,0,0,0,0
5591005003,0,0,2268 Ben Lomond Dr,0,0,1,963046,770437,192609,0,...,2,0,0,0,0,0,0,0,0,1
5592009038,1,0,3620 Amesbury Rd,0,0,1,1371383,906772,464611,0,...,3,0,0,0,0,0,0,0,0,0
5580029001,0,0,2530 Park Oak Ct,1,0,1,3939169,2954377,984792,0,...,5,0,0,0,0,0,0,0,0,0
5588011006,0,0,2814 Glendower Ave,0,0,1,1945117,1421350,523767,0,...,5,0,0,0,0,0,0,0,0,0


In [3]:
# drop any nulls

Los_Feliz_df = Los_Feliz_df.dropna()

In [4]:
len(Los_Feliz_df)

4411

In [5]:
# Change the datatype of these two columns into ints

Los_Feliz_df['TAXAMT'] = Los_Feliz_df['TAXAMT'].astype(int)
Los_Feliz_df['EFFYRBLT'] = Los_Feliz_df['EFFYRBLT'].astype(int)

In [6]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Los_Feliz_df = Los_Feliz_df[~Los_Feliz_df['SITEADDRESS'].str.contains('#')]
Los_Feliz_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5590007001,0,0,4540 Ambrose Ave,0,0,1,764570,611657,152913,0,...,1,0,0,0,0,0,0,0,0,0
5591005003,0,0,2268 Ben Lomond Dr,0,0,1,963046,770437,192609,0,...,2,0,0,0,0,0,0,0,0,1
5592009038,1,0,3620 Amesbury Rd,0,0,1,1371383,906772,464611,0,...,3,0,0,0,0,0,0,0,0,0


In [7]:
len(Los_Feliz_df)

3891

In [8]:
# Now drop the SiteAddress column
Los_Feliz_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [9]:
Los_Feliz_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5590007001,0,0,0,0,1,764570,611657,152913,0,0,...,1,0,0,0,0,0,0,0,0,0
5591005003,0,0,0,0,1,963046,770437,192609,0,0,...,2,0,0,0,0,0,0,0,0,1
5592009038,1,0,0,0,1,1371383,906772,464611,0,0,...,3,0,0,0,0,0,0,0,0,0


In [11]:
Los_Feliz_df.columns

Index(['Owned by Trust?', 'Owned by Business?', 'MAIL DIFFERENT FROM SITE?',
       'MAIL OUTSIDE CA?', 'TITLECO1', 'ASSDTOTAL', 'ASSDLAND', 'ASSDSTCT',
       'ASSDOTHR', 'EXEMPTCD', 'EXMPTAMT', 'PCNTIMPD', 'TAXAMT',
       'DOCDATE_YEAR', 'MULTIPARCEL', 'PRICE', 'LENDER1', 'LOANAMOUT1',
       'LOANTYPE1', 'YEARBLT', 'EFFYRBLT', 'LOTSQFT', 'TOTALSF', 'GRGTYPE',
       'BLDGMAT', 'TOTUNITS', 'QUALCLAS', 'BLDGCOND', 'NOSTORY', 'ROOFMAT',
       'FOUNDATN', 'BEDROOMS', 'BATHROOMS', 'FAMILYRM', 'DININGRM', 'POOL',
       'PATIO', 'FIREPLCE', 'AIRMTHOD', 'HEATMTHD', 'VIEW', 'Did it sell?'],
      dtype='object')

In [12]:
# Step 1: Create our X and y

y = Los_Feliz_df['Did it sell?']
X = Los_Feliz_df[['Owned by Trust?', 'Owned by Business?', 'MAIL DIFFERENT FROM SITE?',
       'MAIL OUTSIDE CA?', 'TITLECO1', 'ASSDTOTAL', 'ASSDLAND', 'ASSDSTCT',
       'ASSDOTHR', 'EXEMPTCD', 'EXMPTAMT', 'PCNTIMPD', 'TAXAMT',
       'DOCDATE_YEAR', 'MULTIPARCEL', 'PRICE', 'LENDER1', 'LOANAMOUT1',
       'LOANTYPE1', 'YEARBLT', 'EFFYRBLT', 'LOTSQFT', 'TOTALSF', 'GRGTYPE',
       'BLDGMAT', 'TOTUNITS', 'QUALCLAS', 'BLDGCOND', 'NOSTORY', 'ROOFMAT',
       'FOUNDATN', 'BEDROOMS', 'BATHROOMS', 'FAMILYRM', 'DININGRM', 'POOL',
       'PATIO', 'FIREPLCE', 'AIRMTHOD', 'HEATMTHD', 'VIEW']]

In [13]:
APN = Los_Feliz_df.index.values
APN

array([5590007001, 5591005003, 5592009038, ..., 5592018029, 5592021014,
       5592025013])

In [14]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [15]:
X_pca

array([[-364499.65419731,  230689.67518251, -349730.29782416],
       [ -81636.5547922 ,  110351.27682609, -444256.93675614],
       [ 452973.24544078,  -94041.53282624, -629493.1300123 ],
       ...,
       [-264378.97868709,  -29501.81913366,   87600.91671107],
       [-200272.55114188,  -46138.49595822,  123789.72552605],
       [-798576.66069953,  299350.4169688 ,   69495.46013528]])

In [16]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [17]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
X_train_scaled_APN = []

In [19]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({0: 2518, 1: 2518})

In [20]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [21]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)

In [22]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.671


In [23]:
# just for shits n' gigs, lets find the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.6038533834586466

In [24]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.71866801860525

In [25]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.70      0.51      0.79      0.60      0.36       840
          1       0.21      0.51      0.70      0.30      0.60      0.35       133

avg / total       0.81      0.67      0.54      0.72      0.60      0.36       973



In [26]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
58,0,0,5587028019
0,0,0,5591012012
333,1,0,5588006018
247,1,0,5589019018
949,1,1,5591004024
220,1,0,5587001019
138,0,0,5430008009
2,1,0,5588010009
203,0,0,5587016009
919,0,0,5588029011


In [27]:
Los_Feliz_Results_SFRs_df = Los_Feliz_df.merge(results, how="left", right_on="PARCEL", left_on="PARCEL")
Los_Feliz_Results_SFRs_df.head()

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?,Prediction,Actual
0,5590007001,0,0,0,0,1,764570,611657,152913,0,...,0,0,0,0,0,0,0,0,0.0,0.0
1,5591005003,0,0,0,0,1,963046,770437,192609,0,...,0,0,0,0,0,0,0,1,,
2,5592009038,1,0,0,0,1,1371383,906772,464611,0,...,0,0,0,0,0,0,0,0,1.0,0.0
3,5580029001,0,0,1,0,1,3939169,2954377,984792,0,...,0,0,0,0,0,0,0,0,,
4,5588011006,0,0,0,0,1,1945117,1421350,523767,0,...,0,0,0,0,0,0,0,0,1.0,0.0


In [28]:
Los_Feliz_Results_SFRs_df

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?,Prediction,Actual
0,5590007001,0,0,0,0,1,764570,611657,152913,0,...,0,0,0,0,0,0,0,0,0.0,0.0
1,5591005003,0,0,0,0,1,963046,770437,192609,0,...,0,0,0,0,0,0,0,1,,
2,5592009038,1,0,0,0,1,1371383,906772,464611,0,...,0,0,0,0,0,0,0,0,1.0,0.0
3,5580029001,0,0,1,0,1,3939169,2954377,984792,0,...,0,0,0,0,0,0,0,0,,
4,5588011006,0,0,0,0,1,1945117,1421350,523767,0,...,0,0,0,0,0,0,0,0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3886,5592016027,0,0,1,0,1,1185261,602562,582699,0,...,0,0,0,0,0,0,0,0,,
3887,5592018027,0,0,1,0,1,2005946,1226940,779006,0,...,0,0,0,0,0,0,0,1,,
3888,5592018029,0,0,0,0,1,863391,674747,188644,0,...,0,0,0,0,0,0,0,0,,
3889,5592021014,0,0,0,0,1,949994,487345,462649,0,...,0,0,0,0,0,0,0,0,,


In [29]:
Los_Feliz_Results_SFRs_df.to_csv('Los_Feliz_Results_SFRs.csv')

# PCA + Oversampling (Random Oversampling) + Logistic Regression (Condos)

In [30]:
# bring in our dataframe

Los_Feliz_df = pd.read_csv('Los_Feliz_Cleaned.csv', index_col='PARCEL')
Los_Feliz_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5590007001,0,0,4540 Ambrose Ave,0,0,1,764570,611657,152913,0,...,1,0,0,0,0,0,0,0,0,0
5591005003,0,0,2268 Ben Lomond Dr,0,0,1,963046,770437,192609,0,...,2,0,0,0,0,0,0,0,0,1
5592009038,1,0,3620 Amesbury Rd,0,0,1,1371383,906772,464611,0,...,3,0,0,0,0,0,0,0,0,0
5580029001,0,0,2530 Park Oak Ct,1,0,1,3939169,2954377,984792,0,...,5,0,0,0,0,0,0,0,0,0
5588011006,0,0,2814 Glendower Ave,0,0,1,1945117,1421350,523767,0,...,5,0,0,0,0,0,0,0,0,0


In [31]:
# drop any nulls

Los_Feliz_df = Los_Feliz_df.dropna()

In [32]:
len(Los_Feliz_df)

4411

In [33]:
# Change the datatype of these two columns into ints

Los_Feliz_df['TAXAMT'] = Los_Feliz_df['TAXAMT'].astype(int)
Los_Feliz_df['EFFYRBLT'] = Los_Feliz_df['EFFYRBLT'].astype(int)

In [34]:
# Make sure our dataframe is only Condos

Los_Feliz_df = Los_Feliz_df[Los_Feliz_df['SITEADDRESS'].str.contains('#')]
Los_Feliz_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5588026045,0,0,4455 Los Feliz Blvd #404,1,0,1,205126,75168,129958,0,...,1,0,0,0,0,0,0,0,0,0
5588026048,1,0,4455 Los Feliz Blvd #407,1,0,1,364723,217939,146784,0,...,1,0,0,0,0,0,0,0,0,0
5588026066,0,0,4455 Los Feliz Blvd #701,0,0,1,324573,95459,229114,0,...,2,0,0,0,0,0,0,0,0,0


In [35]:
len(Los_Feliz_df)

520

In [36]:
# Now drop the SiteAddress column
Los_Feliz_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [37]:
Los_Feliz_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5588026045,0,0,1,0,1,205126,75168,129958,0,0,...,1,0,0,0,0,0,0,0,0,0
5588026048,1,0,1,0,1,364723,217939,146784,0,0,...,1,0,0,0,0,0,0,0,0,0
5588026066,0,0,0,0,1,324573,95459,229114,0,0,...,2,0,0,0,0,0,0,0,0,0


In [38]:
# Step 1: Create our X and y

y = Los_Feliz_df['Did it sell?']
X = Los_Feliz_df.drop(['Did it sell?'], 1)

In [39]:
APN = Los_Feliz_df.index.values
APN

array([5588026045, 5588026048, 5588026066, 5588026121, 5588026137,
       5590005059, 5590006028, 5590007083, 5590007087, 5590007092,
       5591009060, 5430023032, 5434002018, 5434002029, 5588026065,
       5588026069, 5588026093, 5588026099, 5588026108, 5588026140,
       5590005063, 5590012032, 5590013033, 5590013040, 5590013060,
       5590014066, 5592002063, 5592002076, 5430023034, 5433001031,
       5434007019, 5588026042, 5588026100, 5588026104, 5588026158,
       5588026172, 5588026175, 5588026193, 5590004027, 5590005042,
       5590005045, 5590005053, 5590005056, 5590013034, 5590013055,
       5590014052, 5592002062, 5592002075, 5592027033, 5430009026,
       5433001030, 5434002024, 5434002025, 5434002027, 5434002037,
       5588026067, 5588026086, 5588026134, 5588026199, 5590007091,
       5590012034, 5434002011, 5588026096, 5588026114, 5588026120,
       5588026185, 5588026202, 5588026204, 5588026205, 5588026211,
       5590007086, 5590010029, 5590014048, 5590014080, 5590014

In [40]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [41]:
X_pca

array([[-305830.93585641,   43845.96645458,  -10012.89906698],
       [-172323.33173953,  -73663.635231  , -134309.25728518],
       [-135673.76892604,   64485.02367885,  -11837.66032527],
       ...,
       [-404864.56662528,  -26713.48251125,   83872.48244118],
       [-404336.20464334,  -60777.18580287,  -22193.02486734],
       [-398244.41670452,   37033.26496127,   -1307.15631006]])

In [42]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [43]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [44]:
X_train_scaled_APN = []

In [45]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({0: 332, 1: 332})

In [46]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [47]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)

In [48]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.577


In [49]:
# just for shits n' gigs, lets find the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.555950687529635

In [50]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.6389743589743591

In [51]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.59      0.53      0.70      0.56      0.31       111
          1       0.18      0.53      0.59      0.27      0.56      0.31        19

avg / total       0.78      0.58      0.53      0.64      0.56      0.31       130



In [52]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
77,0,0,5590014079
95,0,0,5588026100
38,0,0,5590017032
120,0,0,5590007069
123,1,0,5589021030
82,1,0,5589021036
56,0,0,5433001034
22,1,0,5588026032
44,0,0,5588026120
35,0,0,5591009050


In [53]:
Los_Feliz_Condos_df = Los_Feliz_df.merge(results, how="left", right_on="PARCEL", left_on="PARCEL")
Los_Feliz_Condos_df.head()

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?,Prediction,Actual
0,5588026045,0,0,1,0,1,205126,75168,129958,0,...,0,0,0,0,0,0,0,0,,
1,5588026048,1,0,1,0,1,364723,217939,146784,0,...,0,0,0,0,0,0,0,0,,
2,5588026066,0,0,0,0,1,324573,95459,229114,0,...,0,0,0,0,0,0,0,0,,
3,5588026121,1,0,1,0,1,168774,33751,135023,0,...,0,0,0,0,0,0,0,0,0.0,0.0
4,5588026137,0,0,1,0,1,202377,65166,137211,0,...,0,0,0,0,0,0,0,1,,


In [54]:
Los_Feliz_Condos_df

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?,Prediction,Actual
0,5588026045,0,0,1,0,1,205126,75168,129958,0,...,0,0,0,0,0,0,0,0,,
1,5588026048,1,0,1,0,1,364723,217939,146784,0,...,0,0,0,0,0,0,0,0,,
2,5588026066,0,0,0,0,1,324573,95459,229114,0,...,0,0,0,0,0,0,0,0,,
3,5588026121,1,0,1,0,1,168774,33751,135023,0,...,0,0,0,0,0,0,0,0,0.0,0.0
4,5588026137,0,0,1,0,1,202377,65166,137211,0,...,0,0,0,0,0,0,0,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,5591009056,0,0,0,0,1,475609,346319,129290,0,...,0,0,0,0,0,0,0,0,,
516,5592002051,0,0,0,0,1,115535,44784,70751,0,...,0,0,0,0,0,0,0,0,,
517,5592002058,0,0,0,0,0,99454,71851,27603,0,...,0,0,0,0,0,0,0,1,,
518,5592002061,0,0,0,0,0,194738,57119,137619,0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [55]:
Los_Feliz_Condos_df.to_csv('Los_Feliz_Results_Condos.csv')