# Preparation MVP

In [1]:
import acquire as ac
import prepare as prep

import pandas as pd
import numpy as np

In [2]:
# Pull the Zillow data through the project's acquire module.
zil = ac.zillow_data()

In [3]:
# Map the raw Zillow column names onto shorter, friendlier ones.
rename = dict(
    bedroomcnt='bedrooms',
    bathroomcnt='bathrooms',
    calculatedfinishedsquarefeet='sq_ft',
    taxvaluedollarcnt='price',
)
# Apply the mapping; columns not listed in the dictionary keep their names.
zil = zil.rename(columns=rename)

In [4]:
# Frequency of each raw bedroom count, to decide how to bin the long tail.
zil['bedrooms'].value_counts()

3.0     25015
4.0     16582
2.0      8739
5.0      3974
6.0       661
1.0       587
0.0       133
7.0       111
8.0        33
9.0         8
25.0        1
10.0        1
11.0        1
Name: bedrooms, dtype: int64

In [5]:
# Bucket bedroom counts: one category per count from 0 to 5, everything above pooled into '6+'.
# Edges sit on the half-integers so each whole count falls strictly inside its own bin.
bedroom_edges = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 1000]
bedroom_labels = ['0', '1', '2', '3', '4', '5', '6+']
bed_bins = pd.cut(zil['bedrooms'], bins=bedroom_edges, labels=bedroom_labels)

In [6]:
# Replace the numeric bedroom counts with their binned labels.
zil['bedrooms'] = bed_bins

In [7]:
# Confirm the binning: counts per label, with the tail collapsed into '6+'.
zil['bedrooms'].value_counts()

3     25015
4     16582
2      8739
5      3974
6+      816
1       587
0       133
Name: bedrooms, dtype: int64

In [8]:
# Frequency of each raw bathroom count (half-baths included), to pick a cut-off for the tail.
zil['bathrooms'].value_counts()

2.0     24532
3.0     11152
1.0     10626
2.5      3723
4.0      2147
1.5       793
5.0       780
3.5       736
4.5       506
6.0       303
5.5       165
0.0       140
7.0       116
8.0        37
6.5        30
9.0        26
7.5        11
10.0       11
8.5         5
11.0        3
13.0        1
9.5         1
20.0        1
19.5        1
Name: bathrooms, dtype: int64

In [9]:
# Preview only: the result is displayed, not assigned. Collapses bathroom counts above 6
# into '7+' and stringifies everything else.
# NOTE: the cell that actually rewrites the column uses a different cut-off (> 5 -> '6+').
np.where(zil.bathrooms > 6, '7+',zil.bathrooms.astype(str))

array(['2.0', '4.0', '2.0', ..., '3.0', '3.0', '2.0'], dtype=object)

In [10]:
# Keep a copy taken before the bathroom column is rewritten, so the raw counts stay inspectable.
trash = zil.copy()
# Pool every bathroom count above 5 into '6+'; all other values become their string form.
# NOTE: not re-runnable as-is. After this cell the column holds strings, so `> 5` fails on a second run.
bathrooms_raw = zil['bathrooms']
zil['bathrooms'] = np.where(bathrooms_raw > 5, '6+', bathrooms_raw.astype(str))

In [11]:
# `trash` was copied before bathrooms were binned, so this still shows the original counts.
trash['bathrooms'].value_counts()

2.0     24532
3.0     11152
1.0     10626
2.5      3723
4.0      2147
1.5       793
5.0       780
3.5       736
4.5       506
6.0       303
5.5       165
0.0       140
7.0       116
8.0        37
6.5        30
9.0        26
7.5        11
10.0       11
8.5         5
11.0        3
13.0        1
9.5         1
20.0        1
19.5        1
Name: bathrooms, dtype: int64

In [12]:
# Inspect the first rows after binning bedrooms and bathrooms.
zil.head()

Unnamed: 0,bedrooms,bathrooms,sq_ft,price
0,4,2.0,3633.0,296425.0
1,3,4.0,1620.0,847770.0
2,3,2.0,2077.0,646760.0
3,0,0.0,1200.0,5328.0
4,0,0.0,171.0,6920.0


In [13]:
# The 20 most expensive homes, to see how heavy the upper tail of price is.
zil['price'].sort_values(ascending=False).head(20)

3289     30166843.0
10554    22843125.0
24592    19129816.0
13920    18782125.0
35403    17170000.0
35324    14721125.0
7821     14563303.0
13918    13243244.0
46198    12698505.0
25452    12228491.0
20168    11962714.0
24685    11904874.0
31982    11689668.0
29174    11517922.0
21385    11378863.0
13919    10904781.0
30828    10358901.0
9777     10000000.0
44373     9841049.0
3294      9496841.0
Name: price, dtype: float64

In [14]:
# Fraction of rows priced under $2.5M, i.e. how much data would survive that cap.
under_cap = zil.loc[zil['price'] < 2_500_000]
len(under_cap) / len(zil)

0.9888264154997672

In [15]:
# Start over from a fresh pull and re-apply the friendly column names.
rename = {
    'bedroomcnt': 'bedrooms',
    'bathroomcnt': 'bathrooms',
    'calculatedfinishedsquarefeet': 'sq_ft',
    'taxvaluedollarcnt': 'price',
}
zil = ac.zillow_data().rename(columns=rename)

In [16]:
# Keep only homes priced under $2.5M (drops the extreme upper tail seen above).
zil = zil.loc[zil['price'] < 2_500_000]

In [17]:
# Fresh pull run straight through the project's prep function, replacing the manual steps above.
zil = prep.prep_zillow(ac.zillow_data())

In [18]:
# Peek at the first rows returned by prep.prep_zillow.
zil.head()

Unnamed: 0,bedrooms,bathrooms,sq_ft,price
0,4,2.0,3633.0,296425.0
1,3,4.0,1620.0,847770.0
2,3,2.0,2077.0,646760.0
3,0,0.0,1200.0,5328.0
4,0,0.0,171.0,6920.0


In [19]:
# Split the prepared frame into three parts with the project's helper.
# NOTE: the first returned split is bound back to `zil`, so from here on `zil` is no longer
# the full prepared frame.
zil, val, test = prep.train_val_test(zil)

In [20]:
# (rows, columns) of each split, in the order they were returned.
tuple(split.shape for split in (zil, val, test))

((39013, 4), (8360, 4), (8360, 4))

# SECOND TRY

In [21]:
# Reload the data from scratch for the second preparation attempt.
zil = ac.zillow_data()

In [22]:
# Run the project's prep step with mvp=False. The outputs below show bedrooms/bathrooms
# kept numeric; see prepare.prep_zillow for exactly what the flag changes.
zil = prep.prep_zillow(zil, mvp=False)

In [23]:
# Check the column dtypes produced by prep_zillow(mvp=False).
zil.dtypes

bedrooms       int64
bathrooms    float64
sq_ft        float64
price        float64
dtype: object

In [24]:
# First rows of the frame prepared with mvp=False.
zil.head()

Unnamed: 0,bedrooms,bathrooms,sq_ft,price
0,4,2.0,3633.0,296425.0
1,3,4.0,1620.0,847770.0
2,3,2.0,2077.0,646760.0
3,0,0.0,1200.0,5328.0
4,0,0.0,171.0,6920.0


In [25]:
# Distribution of bedroom counts after the mvp=False prep.
zil['bedrooms'].value_counts()

3    25013
4    16572
2     8739
5     3946
6      626
1      587
0      131
7      119
Name: bedrooms, dtype: int64

In [26]:
# Distribution of bathroom counts after the mvp=False prep.
zil['bathrooms'].value_counts()

2.0    24532
3.0    11151
1.0    10626
2.5     3723
4.0     2144
1.5      792
5.0      775
3.5      736
4.5      505
6.0      293
5.5      164
7.0      154
0.0      138
Name: bathrooms, dtype: int64

In [27]:
# prep.scale returns three frames. Only the feature columns are listed in scaled_cols, so
# `price` is not passed for scaling.
# NOTE(review): presumably prep.scale also performs the train/val/test split itself; confirm in prepare.py.
train, val, test = prep.scale(zil, scaled_cols=['bedrooms', 'bathrooms', 'sq_ft'])

In [28]:
# Inspect the first rows of the train frame returned by prep.scale.
train.head()

Unnamed: 0,bedrooms,bathrooms,sq_ft,price
13867,0.285714,0.142857,0.174067,404049.0
20306,0.428571,0.285714,0.167236,538982.0
48331,0.285714,0.142857,0.146237,360751.0
13416,0.285714,0.142857,0.091335,176663.0
2951,0.571429,0.428571,0.355218,3113999.0


# Advanced Prepare

In [29]:
# Reload with a positional True flag; the outputs below show a wider set of columns
# (pool, garage, fireplace, fips, year built, stories, lot size).
# NOTE(review): the flag's name/meaning is not visible here; confirm in acquire.zillow_data
# and prefer passing it by keyword.
zil = ac.zillow_data(True)

In [32]:
# Count missing values per column to decide which columns need imputing.
zil.isna().sum()

bedroomcnt                          0
bathroomcnt                         0
calculatedfinishedsquarefeet      231
taxvaluedollarcnt                   8
poolcnt                         45128
garagecarcnt                    37832
fireplacecnt                    48448
fips                                0
yearbuilt                         249
numberofstories                 40586
lotsizesquarefeet                 350
dtype: int64

In [30]:
# First rows of the wider, still-unrenamed frame.
zil.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,poolcnt,garagecarcnt,fireplacecnt,fips,yearbuilt,numberofstories,lotsizesquarefeet
0,0.0,0.0,,27516.0,,,,6037.0,,,4083.0
1,0.0,0.0,,10.0,,,,6037.0,,,11975.0
2,0.0,0.0,,10.0,,,,6037.0,,,9403.0
3,0.0,0.0,,2108.0,,,,6037.0,,,3817.0
4,4.0,2.0,3633.0,296425.0,,,,6037.0,2005.0,,9826.0


In [34]:
from sklearn.impute import SimpleImputer


In [39]:
# Fill missing pool counts with the constant 0, fitting and transforming in one step.
# `imputer` is reused by later cells for the other count columns.
imputer = SimpleImputer(strategy='constant', fill_value=0)
zil['poolcnt'] = imputer.fit_transform(zil[['poolcnt']])

In [40]:
# Verify that poolcnt no longer shows missing values in the first rows.
zil.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,poolcnt,garagecarcnt,fireplacecnt,fips,yearbuilt,numberofstories,lotsizesquarefeet
0,0.0,0.0,,27516.0,0.0,,,6037.0,,,4083.0
1,0.0,0.0,,10.0,0.0,,,6037.0,,,11975.0
2,0.0,0.0,,10.0,0.0,,,6037.0,,,9403.0
3,0.0,0.0,,2108.0,0.0,,,6037.0,,,3817.0
4,4.0,2.0,3633.0,296425.0,0.0,,,6037.0,2005.0,,9826.0


In [41]:
# Same constant-fill imputer, refit on garage count: missing values are replaced by its fill value.
zil['garagecarcnt'] = imputer.fit_transform(zil[['garagecarcnt']])

In [42]:
# Same constant-fill imputer, refit on fireplace count: missing values are replaced by its fill value.
zil['fireplacecnt'] = imputer.fit_transform(zil[['fireplacecnt']])

In [43]:
# Verify that garagecarcnt and fireplacecnt are filled as well.
zil.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,poolcnt,garagecarcnt,fireplacecnt,fips,yearbuilt,numberofstories,lotsizesquarefeet
0,0.0,0.0,,27516.0,0.0,0.0,0.0,6037.0,,,4083.0
1,0.0,0.0,,10.0,0.0,0.0,0.0,6037.0,,,11975.0
2,0.0,0.0,,10.0,0.0,0.0,0.0,6037.0,,,9403.0
3,0.0,0.0,,2108.0,0.0,0.0,0.0,6037.0,,,3817.0
4,4.0,2.0,3633.0,296425.0,0.0,0.0,0.0,6037.0,2005.0,,9826.0


In [44]:
# Same display as the previous cell; nothing changed in between.
zil.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,poolcnt,garagecarcnt,fireplacecnt,fips,yearbuilt,numberofstories,lotsizesquarefeet
0,0.0,0.0,,27516.0,0.0,0.0,0.0,6037.0,,,4083.0
1,0.0,0.0,,10.0,0.0,0.0,0.0,6037.0,,,11975.0
2,0.0,0.0,,10.0,0.0,0.0,0.0,6037.0,,,9403.0
3,0.0,0.0,,2108.0,0.0,0.0,0.0,6037.0,,,3817.0
4,4.0,2.0,3633.0,296425.0,0.0,0.0,0.0,6037.0,2005.0,,9826.0
