In [229]:
# import our dependencies

%matplotlib inline
from collections import Counter
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import numpy as np
import os

## PCA + Oversampling (Random Oversampling) + Logistic Regression (SFRs)

In [230]:
file = 'Hancock_Park_Cleaned.csv'

location = '/Users/Admin/Desktop/GW_Bootcamp/Analysis_Projects/Final_Project_Team_1/'

In [231]:
path = os.path.join(location, file)

In [232]:
Hancock_Park_df = pd.read_csv(path, index_col='PARCEL')
Hancock_Park_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,354 N Plymouth Blvd,1,0,0,2054958,1583885,471073,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,143 N Plymouth Blvd,0,0,0,915420,592174,323246,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,253 S Plymouth Blvd,0,0,0,1304706,731909,572797,0,...,3,0,0,0,0,0,0,0,0,1
5513010003,1,0,164 N Las Palmas Ave,0,0,0,2518796,1609956,908840,0,...,6,0,0,0,0,0,0,0,0,0
5515012030,1,0,108 S Rossmore Ave,1,0,0,1428176,999727,428449,0,...,7,0,0,0,0,0,0,0,0,0


In [233]:
Hancock_Park_df['Owned by Trust?'].dtype

dtype('int64')

In [234]:
# drop any nulls

Hancock_Park_df = Hancock_Park_df.dropna()

In [235]:
len(Hancock_Park_df)

5530

In [236]:
# Change the datatype of these two columns into ints

Hancock_Park_df['TAXAMT'] = Hancock_Park_df['TAXAMT'].astype(int)
Hancock_Park_df['EFFYRBLT'] = Hancock_Park_df['EFFYRBLT'].astype(int)

In [237]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Hancock_Park_df = Hancock_Park_df[~Hancock_Park_df['SITEADDRESS'].str.contains('#')]
Hancock_Park_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,354 N Plymouth Blvd,1,0,0,2054958,1583885,471073,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,143 N Plymouth Blvd,0,0,0,915420,592174,323246,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,253 S Plymouth Blvd,0,0,0,1304706,731909,572797,0,...,3,0,0,0,0,0,0,0,0,1


In [238]:
len(Hancock_Park_df)

5525

In [239]:
# Now drop the SiteAddress column
Hancock_Park_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [240]:
Hancock_Park_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,1,0,0,2054958,1583885,471073,0,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,0,0,0,915420,592174,323246,0,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,0,0,0,1304706,731909,572797,0,0,...,3,0,0,0,0,0,0,0,0,1


In [241]:
# Hancock_Park_df['PARCEL_1'] = Hancock_Park_df.index.values
# Hancock_Park_df.head()

In [242]:
# Hancock_Park_df = pd.DataFrame(Hancock_Park_df, columns = ['PARCEL_1',
#                                                            'ASSDLAND',
#                                                            'LOTSQFT',
#                                                            'TOTALSF',
#                                                            'ASSDSTCT',
#                                                            'PRICE',
#                                                            'DOCDATE_YEAR',
#                                                            'LOANAMOUT1',
#                                                            'ASSDTOTAL',
#                                                            'TAXAMT',
#                                                            'EFFYRBLT',
#                                                            'YEARBLT',
#                                                            'BEDROOMS',
#                                                            'BATHROOMS',
#                                                            'Owned by Trust?',
#                                                            'MAIL DIFFERENT FROM SITE?',
#                                                            'Did it sell?'])

In [243]:
Hancock_Park_df

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,1,0,0,2054958,1583885,471073,0,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,0,0,0,915420,592174,323246,0,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,0,0,0,1304706,731909,572797,0,0,...,3,0,0,0,0,0,0,0,0,1
5513010003,1,0,0,0,0,2518796,1609956,908840,0,0,...,6,0,0,0,0,0,0,0,0,0
5515012030,1,0,1,0,0,1428176,999727,428449,0,0,...,7,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5513014003,0,0,0,0,0,688635,550912,137723,0,0,...,2,0,0,0,0,0,0,0,0,0
5084006021,0,0,0,0,0,589308,280085,309223,0,0,...,3,0,0,0,0,0,0,0,0,0
5507014003,0,0,1,0,0,609094,369478,239616,0,0,...,2,0,0,0,0,0,0,0,0,1
5507019023,0,0,0,0,0,638147,459267,178880,0,0,...,2,0,0,0,0,0,0,0,0,0


In [244]:
# Step 1: Create our X and y

y = Hancock_Park_df['Did it sell?']
X = Hancock_Park_df.drop(['Did it sell?'], 1)

In [245]:
APN = Hancock_Park_df.index.values
APN

array([5505026001, 5515028023, 5515022015, ..., 5507014003, 5507019023,
       5524038012])

In [246]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [247]:
X_pca

array([[ 1.05678823e+06, -5.40749780e+05, -7.99150429e+05],
       [-4.37316125e+05, -8.79156495e+04,  8.48396223e+04],
       [ 1.99411886e+05, -1.17643013e+05,  1.46161978e+05],
       ...,
       [-8.90418298e+05,  3.19670308e+04,  2.45564591e+04],
       [-8.40390671e+05,  3.38917753e+01,  2.46831758e+04],
       [-6.13625000e+05, -5.46815384e+03,  5.63046539e+04]])

In [248]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [249]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [250]:
X_train_scaled_APN = []

In [251]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({0: 3720, 1: 3720})

In [252]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [253]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)

In [254]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.687


In [255]:
# just for shits n' gigs, lets find the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5587749527091512

In [257]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[894, 347],
       [ 85,  56]])

In [260]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.7442384369557266

In [143]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.72      0.40      0.81      0.53      0.30      1241
          1       0.14      0.40      0.72      0.21      0.53      0.28       141

avg / total       0.83      0.69      0.43      0.74      0.53      0.29      1382



In [144]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results.sample(20)

Unnamed: 0,Prediction,Actual,PARCEL
395,1,1,5515019007
15,0,0,5523035006
1015,0,0,5081027013
817,0,0,5523021020
778,0,0,5524033020
96,1,0,5507030012
131,1,0,5516005017
423,1,0,5505019006
342,0,0,5515020022
1343,1,0,5507014014


In [145]:
Hancock_Park_Results_SFRs_df = Hancock_Park_df.merge(results, how="left", right_on="PARCEL", left_on="PARCEL")
Hancock_Park_Results_SFRs_df.head()

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?,Prediction,Actual
0,5505026001,1,0,1,0,0,2054958,1583885,471073,0,...,0,0,0,0,0,0,0,0,,
1,5515028023,1,0,0,0,0,915420,592174,323246,0,...,0,0,0,0,0,0,0,0,0.0,0.0
2,5515022015,1,0,0,0,0,1304706,731909,572797,0,...,0,0,0,0,0,0,0,1,,
3,5513010003,1,0,0,0,0,2518796,1609956,908840,0,...,0,0,0,0,0,0,0,0,,
4,5515012030,1,0,1,0,0,1428176,999727,428449,0,...,0,0,0,0,0,0,0,0,,


In [147]:
Hancock_Park_Results_SFRs_df = pd.DataFrame(Hancock_Park_Results_SFRs_df, columns = ['PARCEL', 'Did it sell?', 'Prediction', 'Actual'])
Hancock_Park_Results_SFRs_df.head()

Unnamed: 0,PARCEL,Did it sell?,Prediction,Actual
0,5505026001,0,,
1,5515028023,0,0.0,0.0
2,5515022015,1,,
3,5513010003,0,,
4,5515012030,0,,


In [148]:
Hancock_Park_Results_SFRs_df.to_csv('Hancock_Park_Results_SFRs.csv')

# Same as above but import the 3 additional columns

In [290]:
file = 'Hancock_Park_Cleaned.csv'

location = '/Users/Admin/Desktop/GW_Bootcamp/Analysis_Projects/Final_Project_Team_1/'

In [291]:
path = os.path.join(location, file)

In [292]:
Hancock_Park_df = pd.read_csv(path, index_col='PARCEL')
Hancock_Park_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,354 N Plymouth Blvd,1,0,0,2054958,1583885,471073,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,143 N Plymouth Blvd,0,0,0,915420,592174,323246,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,253 S Plymouth Blvd,0,0,0,1304706,731909,572797,0,...,3,0,0,0,0,0,0,0,0,1
5513010003,1,0,164 N Las Palmas Ave,0,0,0,2518796,1609956,908840,0,...,6,0,0,0,0,0,0,0,0,0
5515012030,1,0,108 S Rossmore Ave,1,0,0,1428176,999727,428449,0,...,7,0,0,0,0,0,0,0,0,0


In [293]:
# drop any nulls

Hancock_Park_df = Hancock_Park_df.dropna()

In [294]:
# Change the datatype of these two columns into ints

Hancock_Park_df['TAXAMT'] = Hancock_Park_df['TAXAMT'].astype(int)
Hancock_Park_df['EFFYRBLT'] = Hancock_Park_df['EFFYRBLT'].astype(int)

In [295]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Hancock_Park_df = Hancock_Park_df[~Hancock_Park_df['SITEADDRESS'].str.contains('#')]
Hancock_Park_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,354 N Plymouth Blvd,1,0,0,2054958,1583885,471073,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,143 N Plymouth Blvd,0,0,0,915420,592174,323246,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,253 S Plymouth Blvd,0,0,0,1304706,731909,572797,0,...,3,0,0,0,0,0,0,0,0,1


In [296]:
Merging_df = pd.read_csv('Hancock_Park_For_Merging.csv')
Merging_df

Unnamed: 0,PARCEL,Site Number,SITUS ZIP CODE,TRACT
0,5516021016,140,90004,1
1,5517003008,106,90004,1
2,5516021011,112,90004,1
3,5516018005,123,90004,1
4,5517003014,138,90004,1
...,...,...,...,...
6322,5504026021,504,90020,
6323,5504021003,451,90020,
6324,5504023015,324,90020,
6325,5504016047,531,90020,


In [297]:
Hancock_Park_df = Hancock_Park_df.merge(Merging_df, how="left", right_on="PARCEL", left_on="PARCEL")
Hancock_Park_df.head()

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,...,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?,Site Number,SITUS ZIP CODE,TRACT
0,5505026001,1,0,354 N Plymouth Blvd,1,0,0,2054958,1583885,471073,...,0,0,0,0,0,0,0,354,90004,1390
1,5515028023,1,0,143 N Plymouth Blvd,0,0,0,915420,592174,323246,...,0,0,0,0,0,0,0,143,90004,3743
2,5515022015,1,0,253 S Plymouth Blvd,0,0,0,1304706,731909,572797,...,0,0,0,0,0,0,1,253,90004,3743
3,5513010003,1,0,164 N Las Palmas Ave,0,0,0,2518796,1609956,908840,...,0,0,0,0,0,0,0,164,90004,8320
4,5515012030,1,0,108 S Rossmore Ave,1,0,0,1428176,999727,428449,...,0,0,0,0,0,0,0,108,90004,4179


In [301]:
Hancock_Park_df.drop(['SITEADDRESS'], axis = 1, inplace=True)
Hancock_Park_df

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?,Site Number,SITUS ZIP CODE,TRACT
0,5505026001,1,0,1,0,0,2054958,1583885,471073,0,...,0,0,0,0,0,0,0,354,90004,1390
1,5515028023,1,0,0,0,0,915420,592174,323246,0,...,0,0,0,0,0,0,0,143,90004,3743
2,5515022015,1,0,0,0,0,1304706,731909,572797,0,...,0,0,0,0,0,0,1,253,90004,3743
3,5513010003,1,0,0,0,0,2518796,1609956,908840,0,...,0,0,0,0,0,0,0,164,90004,8320
4,5515012030,1,0,1,0,0,1428176,999727,428449,0,...,0,0,0,0,0,0,0,108,90004,4179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5520,5513014003,0,0,0,0,0,688635,550912,137723,0,...,0,0,0,0,0,0,0,110,90036,8498
5521,5084006021,0,0,0,0,0,589308,280085,309223,0,...,0,0,0,0,0,0,0,903,90036,5180
5522,5507014003,0,0,1,0,0,609094,369478,239616,0,...,0,0,0,0,0,0,1,414,90036,6388
5523,5507019023,0,0,0,0,0,638147,459267,178880,0,...,0,0,0,0,0,0,0,624,90036,5049


In [327]:
sum(Hancock_Park_df['TRACT'] == '46O4')

3

In [328]:
Hancock_Park_df['TRACT'] = Hancock_Park_df['TRACT'].replace({"46O4" : 4604})

In [329]:
Hancock_Park_df['TRACT'].isna().sum()

0

In [330]:
Hancock_Park_df['TRACT'] = Hancock_Park_df['TRACT'].fillna(0)

In [332]:
Hancock_Park_df['TRACT'] = Hancock_Park_df['TRACT'].astype(int)

In [333]:
# Step 1: Create our X and y's

y = Hancock_Park_df['Did it sell?']

X = Hancock_Park_df.drop(['Did it sell?'], 1)

In [334]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [335]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [336]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [337]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({0: 3720, 1: 3720})

In [338]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [339]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)

In [340]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.428


In [341]:
# just for shits n' gigs, lets find the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.6247278275927101

In [342]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.5108871034072386

In [343]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[468, 773],
       [ 18, 123]])

# Same as above but with feature reduction

In [176]:
file = 'Hancock_Park_Cleaned.csv'

location = '/Users/Admin/Desktop/GW_Bootcamp/Analysis_Projects/Final_Project_Team_1/'

In [177]:
path = os.path.join(location, file)

In [178]:
Hancock_Park_df = pd.read_csv(path, index_col='PARCEL')
Hancock_Park_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,354 N Plymouth Blvd,1,0,0,2054958,1583885,471073,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,143 N Plymouth Blvd,0,0,0,915420,592174,323246,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,253 S Plymouth Blvd,0,0,0,1304706,731909,572797,0,...,3,0,0,0,0,0,0,0,0,1
5513010003,1,0,164 N Las Palmas Ave,0,0,0,2518796,1609956,908840,0,...,6,0,0,0,0,0,0,0,0,0
5515012030,1,0,108 S Rossmore Ave,1,0,0,1428176,999727,428449,0,...,7,0,0,0,0,0,0,0,0,0


In [179]:
# drop any nulls

Hancock_Park_df = Hancock_Park_df.dropna()

In [180]:
# Change the datatype of these two columns into ints

Hancock_Park_df['TAXAMT'] = Hancock_Park_df['TAXAMT'].astype(int)
Hancock_Park_df['EFFYRBLT'] = Hancock_Park_df['EFFYRBLT'].astype(int)

In [181]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Hancock_Park_df = Hancock_Park_df[~Hancock_Park_df['SITEADDRESS'].str.contains('#')]
Hancock_Park_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,354 N Plymouth Blvd,1,0,0,2054958,1583885,471073,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,143 N Plymouth Blvd,0,0,0,915420,592174,323246,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,253 S Plymouth Blvd,0,0,0,1304706,731909,572797,0,...,3,0,0,0,0,0,0,0,0,1


In [182]:
# Now drop the SiteAddress column
Hancock_Park_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [183]:
Hancock_Park_df['PARCEL_1'] = Hancock_Park_df.index.values
Hancock_Park_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?,PARCEL_1
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,1,0,0,2054958,1583885,471073,0,0,...,0,0,0,0,0,0,0,0,0,5505026001
5515028023,1,0,0,0,0,915420,592174,323246,0,0,...,0,0,0,0,0,0,0,0,0,5515028023
5515022015,1,0,0,0,0,1304706,731909,572797,0,0,...,0,0,0,0,0,0,0,0,1,5515022015
5513010003,1,0,0,0,0,2518796,1609956,908840,0,0,...,0,0,0,0,0,0,0,0,0,5513010003
5515012030,1,0,1,0,0,1428176,999727,428449,0,0,...,0,0,0,0,0,0,0,0,0,5515012030


In [187]:
# Create our X and y
y = Hancock_Park_df['Did it sell?']

X = pd.DataFrame(Hancock_Park_df, columns = ['PARCEL_1', 
                               'ASSDLAND', 
                               'LOTSQFT',
                               'TOTALSF',
                               'ASSDSTCT',
                               'PRICE',
                               'ASSDTOTAL',
                               'DOCDATE_YEAR',
                               'LOANAMOUT1',
                               'TAXAMT',
                               'EFFYRBLT',
                               'YEARBLT',
                               'BEDROOMS',
                               'BATHROOMS',
                               'Owned by Trust?', 
                               'MAIL DIFFERENT FROM SITE?',
                               ])

In [189]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [190]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [191]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [192]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({0: 3720, 1: 3720})

In [193]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [194]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)

In [195]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.428


In [196]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.6247278275927101

In [197]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.5108871034072386

# PCA + Oversampling (Random Oversampling) + Logistic Regression (Condos)

In [149]:
# bring in our dataframe

Hancock_Park_df = pd.read_csv(path, index_col='PARCEL')
Hancock_Park_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,354 N Plymouth Blvd,1,0,0,2054958,1583885,471073,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,143 N Plymouth Blvd,0,0,0,915420,592174,323246,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,253 S Plymouth Blvd,0,0,0,1304706,731909,572797,0,...,3,0,0,0,0,0,0,0,0,1
5513010003,1,0,164 N Las Palmas Ave,0,0,0,2518796,1609956,908840,0,...,6,0,0,0,0,0,0,0,0,0
5515012030,1,0,108 S Rossmore Ave,1,0,0,1428176,999727,428449,0,...,7,0,0,0,0,0,0,0,0,0


In [150]:
# drop any nulls

Hancock_Park_df = Hancock_Park_df.dropna()

In [151]:
len(Hancock_Park_df)

5530

In [152]:
# Change the datatype of these two columns into ints

Hancock_Park_df['TAXAMT'] = Hancock_Park_df['TAXAMT'].astype(int)
Hancock_Park_df['EFFYRBLT'] = Hancock_Park_df['EFFYRBLT'].astype(int)

In [153]:
# Make sure our dataframe is only Condos

Hancock_Park_df = Hancock_Park_df[Hancock_Park_df['SITEADDRESS'].str.contains('#')]
Hancock_Park_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505008010,1,0,546 S Rimpau Blvd #2200,0,0,0,3491436,2450134,1041302,0,...,6,0,0,0,0,0,0,0,0,0
5515029015,1,0,169 S Plymouth Blvd #100,0,0,0,3122716,2402093,720623,0,...,4,0,0,0,0,0,0,0,0,0
5522008053,0,0,5114 Melrose Ave #8,0,0,0,790127,548843,241284,0,...,3,0,0,0,0,0,0,0,0,1


In [154]:
len(Hancock_Park_df)

5

In [155]:
# Now drop the SiteAddress column
Hancock_Park_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [156]:
Hancock_Park_df.head(3)

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505008010,1,0,0,0,0,3491436,2450134,1041302,0,0,...,6,0,0,0,0,0,0,0,0,0
5515029015,1,0,0,0,0,3122716,2402093,720623,0,0,...,4,0,0,0,0,0,0,0,0,0
5522008053,0,0,0,0,0,790127,548843,241284,0,0,...,3,0,0,0,0,0,0,0,0,1


In [157]:
# Step 1: Create our X and y

y = Hancock_Park_df['Did it sell?']
X = Hancock_Park_df.drop(['Did it sell?'], 1)

In [158]:
APN = Hancock_Park_df.index.values
APN

array([5505008010, 5515029015, 5522008053, 5522008048, 5516014005])

In [159]:
# Step 2: Use PCA to reduce dimension to three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

In [160]:
X_pca

array([[ 2685213.96624914,   487711.4316359 ,   -18072.9730726 ],
       [ 2436634.88899754,  -457406.79477098,    50496.47587626],
       [-1276644.47457764,  -114043.18418837,   -65881.3034359 ],
       [-1315693.5260042 ,  -106153.72398376,   -60953.82359048],
       [-2529510.85466484,   189892.27130722,    94411.62422273]])

In [161]:
# Step 3: Train, test, split

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=78, stratify=y)

In [162]:
# Step 4: Scale our data

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit our scaler, named 'scaler' to our data, which produces a new StandardScaler object
# which we call 'X_scaler'
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [163]:
X_train_scaled_APN = []

In [164]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_random_oversampled, y_random_oversampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_random_oversampled)

Counter({1: 2, 0: 2})

In [165]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', 
                                max_iter=300,
                                random_state=78,
                                class_weight="balanced")

model.fit(X_random_oversampled, y_random_oversampled)

LogisticRegression(class_weight='balanced', max_iter=300, random_state=78)

In [166]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)

In [167]:
print(f" Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}")

 Logistic regression model accuracy: 0.500


In [168]:
# just for shits n' gigs, lets find the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5

In [169]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="weighted")

0.3333333333333333

In [170]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00         1
          1       0.50      1.00      0.00      0.67      0.00      0.00         1

avg / total       0.25      0.50      0.50      0.33      0.00      0.00         2



  _warn_prf(average, modifier, msg_start, len(result))


In [171]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test, "PARCEL": y_test.index.values}).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual,PARCEL
0,1,1,5522008048
1,1,0,5515029015


In [172]:
Hancock_Park_Results_Condos_df = Hancock_Park_df.merge(results, how="left", right_on="PARCEL", left_on="PARCEL")
Hancock_Park_Results_Condos_df.head()

Unnamed: 0,PARCEL,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?,Prediction,Actual
0,5505008010,1,0,0,0,0,3491436,2450134,1041302,0,...,0,0,0,0,0,0,0,0,,
1,5515029015,1,0,0,0,0,3122716,2402093,720623,0,...,0,0,0,0,0,0,0,0,1.0,0.0
2,5522008053,0,0,0,0,0,790127,548843,241284,0,...,0,0,0,0,0,0,0,1,,
3,5522008048,0,0,0,0,0,767344,533310,234034,0,...,0,0,0,0,0,0,0,1,1.0,1.0
4,5516014005,1,0,0,0,0,83813,43702,40111,0,...,0,0,0,0,0,0,0,0,,


In [174]:
Hancock_Park_Results_Condos_df = pd.DataFrame(Hancock_Park_Results_Condos_df, columns = ['PARCEL', 'Did it sell?', 'Prediction', 'Actual'])
Hancock_Park_Results_Condos_df.head()

Unnamed: 0,PARCEL,Did it sell?,Prediction,Actual
0,5505008010,0,,
1,5515029015,0,1.0,0.0
2,5522008053,1,,
3,5522008048,1,1.0,1.0
4,5516014005,0,,


In [175]:
Hancock_Park_Results_Condos_df.to_csv('Hancock_Park_Results_Condos.csv')

In [261]:
# Now find the discrepencies

In [262]:
file = 'Hancock_Park_Cleaned.csv'

location = '/Users/Admin/Desktop/GW_Bootcamp/Analysis_Projects/Final_Project_Team_1/'

In [263]:
path = os.path.join(location, file)

In [264]:
Hancock_Park_df = pd.read_csv(path, index_col='PARCEL')
Hancock_Park_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,354 N Plymouth Blvd,1,0,0,2054958,1583885,471073,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,143 N Plymouth Blvd,0,0,0,915420,592174,323246,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,253 S Plymouth Blvd,0,0,0,1304706,731909,572797,0,...,3,0,0,0,0,0,0,0,0,1
5513010003,1,0,164 N Las Palmas Ave,0,0,0,2518796,1609956,908840,0,...,6,0,0,0,0,0,0,0,0,0
5515012030,1,0,108 S Rossmore Ave,1,0,0,1428176,999727,428449,0,...,7,0,0,0,0,0,0,0,0,0


In [265]:
# drop any nulls

Hancock_Park_df = Hancock_Park_df.dropna()

In [266]:
# Change the datatype of these two columns into ints

Hancock_Park_df['TAXAMT'] = Hancock_Park_df['TAXAMT'].astype(int)
Hancock_Park_df['EFFYRBLT'] = Hancock_Park_df['EFFYRBLT'].astype(int)

In [267]:
# Make sure our dataframe is only SFR's so exclude every property that has an '#' in its site address

Hancock_Park_df = Hancock_Park_df[~Hancock_Park_df['SITEADDRESS'].str.contains('#')]

In [268]:
# Now drop the SiteAddress column
Hancock_Park_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [269]:
# Now split the sellers from the non-sellers:

Hancock_Park_Sellers_df = Hancock_Park_df[Hancock_Park_df['Did it sell?'] == 1]
Hancock_Park_Sellers_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5515022015,1,0,0,0,0,1304706,731909,572797,0,0,...,3,0,0,0,0,0,0,0,0,1
5516023003,1,0,0,0,0,582687,466154,116533,0,0,...,3,0,0,0,0,0,0,0,0,1
5515023004,0,0,0,0,0,732290,518993,213297,0,0,...,2,0,0,0,0,0,0,0,0,1
5523001002,0,0,0,0,0,950936,615024,335912,0,0,...,3,0,0,0,0,0,0,0,0,1
5515012004,0,0,0,0,0,718038,545610,172428,0,0,...,2,0,0,0,0,0,0,0,0,1


In [270]:
Hancock_Park_NonSellers_df = Hancock_Park_df[Hancock_Park_df['Did it sell?'] == 0]
Hancock_Park_NonSellers_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,1,0,0,2054958,1583885,471073,0,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,0,0,0,915420,592174,323246,0,0,...,2,0,0,0,0,0,0,0,0,0
5513010003,1,0,0,0,0,2518796,1609956,908840,0,0,...,6,0,0,0,0,0,0,0,0,0
5515012030,1,0,1,0,0,1428176,999727,428449,0,0,...,7,0,0,0,0,0,0,0,0,0
5516002020,1,0,0,0,0,1167489,689700,477789,0,0,...,3,0,0,0,0,0,0,0,0,0


In [271]:
# Mail different from Site

sum(Hancock_Park_Sellers_df['MAIL DIFFERENT FROM SITE?'] == 1) / len(Hancock_Park_Sellers_df['MAIL DIFFERENT FROM SITE?']) * 100

29.078014184397162

In [272]:
sum(Hancock_Park_NonSellers_df['MAIL DIFFERENT FROM SITE?'] == 1) / len(Hancock_Park_NonSellers_df['MAIL DIFFERENT FROM SITE?']) * 100

19.693610159242088

In [273]:
# Mail outside California

sum(Hancock_Park_Sellers_df['MAIL OUTSIDE CA?'] == 1) / len(Hancock_Park_Sellers_df['MAIL OUTSIDE CA?']) * 100

2.8368794326241136

In [274]:
sum(Hancock_Park_NonSellers_df['MAIL OUTSIDE CA?'] == 1) / len(Hancock_Park_NonSellers_df['MAIL OUTSIDE CA?']) * 100

1.3706913928643418

In [275]:
# House size

Hancock_Park_Sellers_df['TOTALSF'].mean()

2943.301418439716

In [276]:
Hancock_Park_NonSellers_df['TOTALSF'].mean()

2787.9592824027413

In [277]:
# Assessed Value

Hancock_Park_Sellers_df['ASSDTOTAL'].mean()

1406128.285460993

In [278]:
Hancock_Park_NonSellers_df['ASSDTOTAL'].mean()

1070741.8482160855

In [280]:
# Avg Year Built

Hancock_Park_Sellers_df['YEARBLT'].mean()

1924.4840425531916

In [281]:
Hancock_Park_NonSellers_df['YEARBLT'].mean()

1925.330981656924

In [282]:
# Avg Last Sale Price

Hancock_Park_Sellers_df['PRICE'][Hancock_Park_Sellers_df['PRICE'] != 0].mean()

1460061.2300380229

In [283]:
Hancock_Park_NonSellers_df['PRICE'][Hancock_Park_NonSellers_df['PRICE'] != 0].mean()

1035947.3683637946

In [284]:
# Avg Purchase Year

Hancock_Park_Sellers_df['DOCDATE_YEAR'][Hancock_Park_Sellers_df['DOCDATE_YEAR'] != 0].mean()

2003.813829787234

In [285]:
Hancock_Park_NonSellers_df['DOCDATE_YEAR'][Hancock_Park_NonSellers_df['DOCDATE_YEAR'] != 0].mean()

2000.6500705502922

In [286]:
# Avg Tax Amount

Hancock_Park_Sellers_df['TAXAMT'][Hancock_Park_Sellers_df['TAXAMT'] != 0].mean()

17151.421985815603

In [287]:
Hancock_Park_NonSellers_df['TAXAMT'][Hancock_Park_NonSellers_df['TAXAMT'] != 0].mean()

13084.848014513203

In [288]:
# Avg Loan Amount

Hancock_Park_Sellers_df['LOANAMOUT1'][Hancock_Park_Sellers_df['LOANAMOUT1'] != 0].mean()

1020562.1130653266

In [289]:
Hancock_Park_NonSellers_df['LOANAMOUT1'][Hancock_Park_NonSellers_df['LOANAMOUT1'] != 0].mean()

732337.374854482

# Now do the same for Condos

In [362]:
file = 'Hancock_Park_Cleaned.csv'

location = '/Users/Admin/Desktop/GW_Bootcamp/Analysis_Projects/Final_Project_Team_1/'

In [363]:
path = os.path.join(location, file)

In [364]:
Hancock_Park_df = pd.read_csv(path, index_col='PARCEL')
Hancock_Park_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,SITEADDRESS,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505026001,1,0,354 N Plymouth Blvd,1,0,0,2054958,1583885,471073,0,...,5,0,0,0,0,0,0,0,0,0
5515028023,1,0,143 N Plymouth Blvd,0,0,0,915420,592174,323246,0,...,2,0,0,0,0,0,0,0,0,0
5515022015,1,0,253 S Plymouth Blvd,0,0,0,1304706,731909,572797,0,...,3,0,0,0,0,0,0,0,0,1
5513010003,1,0,164 N Las Palmas Ave,0,0,0,2518796,1609956,908840,0,...,6,0,0,0,0,0,0,0,0,0
5515012030,1,0,108 S Rossmore Ave,1,0,0,1428176,999727,428449,0,...,7,0,0,0,0,0,0,0,0,0


In [365]:
# drop any nulls

Hancock_Park_df = Hancock_Park_df.dropna()

In [366]:
# Change the datatype of these two columns into ints

Hancock_Park_df['TAXAMT'] = Hancock_Park_df['TAXAMT'].astype(int)
Hancock_Park_df['EFFYRBLT'] = Hancock_Park_df['EFFYRBLT'].astype(int)

In [367]:
# Make sure our dataframe is only condos

Hancock_Park_df = Hancock_Park_df[Hancock_Park_df['SITEADDRESS'].str.contains('#')]

In [368]:
# Now drop the SiteAddress column
Hancock_Park_df.drop(['SITEADDRESS'], axis=1, inplace=True)

In [369]:
# Now split the sellers from the non-sellers:

Hancock_Park_Sellers_df = Hancock_Park_df[Hancock_Park_df['Did it sell?'] == 1]
Hancock_Park_Sellers_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5522008053,0,0,0,0,0,790127,548843,241284,0,0,...,3,0,0,0,0,0,0,0,0,1
5522008048,0,0,0,0,0,767344,533310,234034,0,0,...,3,0,0,0,0,0,0,0,0,1


In [370]:
Hancock_Park_NonSellers_df = Hancock_Park_df[Hancock_Park_df['Did it sell?'] == 0]
Hancock_Park_NonSellers_df.head()

Unnamed: 0_level_0,Owned by Trust?,Owned by Business?,MAIL DIFFERENT FROM SITE?,MAIL OUTSIDE CA?,TITLECO1,ASSDTOTAL,ASSDLAND,ASSDSTCT,ASSDOTHR,EXEMPTCD,...,BATHROOMS,FAMILYRM,DININGRM,POOL,PATIO,FIREPLCE,AIRMTHOD,HEATMTHD,VIEW,Did it sell?
PARCEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5505008010,1,0,0,0,0,3491436,2450134,1041302,0,0,...,6,0,0,0,0,0,0,0,0,0
5515029015,1,0,0,0,0,3122716,2402093,720623,0,0,...,4,0,0,0,0,0,0,0,0,0
5516014005,1,0,0,0,0,83813,43702,40111,0,0,...,5,0,0,0,0,0,0,0,0,0


In [371]:
# Mail different from Site

sum(Hancock_Park_Sellers_df['MAIL DIFFERENT FROM SITE?'] == 1) / len(Hancock_Park_Sellers_df['MAIL DIFFERENT FROM SITE?']) * 100

0.0

In [372]:
sum(Hancock_Park_NonSellers_df['MAIL DIFFERENT FROM SITE?'] == 1) / len(Hancock_Park_NonSellers_df['MAIL DIFFERENT FROM SITE?']) * 100

0.0

In [374]:
# Mail outside California

sum(Hancock_Park_Sellers_df['MAIL OUTSIDE CA?'] == 1) / len(Hancock_Park_Sellers_df['MAIL OUTSIDE CA?']) * 100

0.0

In [375]:
sum(Hancock_Park_NonSellers_df['MAIL OUTSIDE CA?'] == 1) / len(Hancock_Park_NonSellers_df['MAIL OUTSIDE CA?']) * 100

0.0

In [376]:
# House size

Hancock_Park_Sellers_df['TOTALSF'].mean()

1520.0