In [6]:
#import statements
import pandas as pd
import numpy as np
import statsmodels.api as stats
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
#opening data file
df = pd.read_csv("./data/kc_house_data.csv")

In [8]:
#taking a look at the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  object 
 9   view           21534 non-null  object 
 10  condition      21597 non-null  object 
 11  grade          21597 non-null  object 
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

"waterfront", "view", and "yr_renovated" have some nulls.

In [9]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,NONE,...,7 Average,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,7 Average,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,NO,NONE,...,6 Low Average,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,NO,NONE,...,7 Average,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,NO,NONE,...,8 Good,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


In [10]:
print(df['waterfront'].isna().sum())
df['waterfront'].value_counts()

2376


NO     19075
YES      146
Name: waterfront, dtype: int64

In [11]:
na_waterfront = df[df['waterfront'].isna()]
na_waterfront

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.00,1180,5650,1.0,,NONE,...,7 Average,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
10,1736800520,4/3/2015,662500.0,3,2.50,3560,9796,1.0,,NONE,...,8 Good,1860,1700.0,1965,0.0,98007,47.6007,-122.145,2210,8925
23,8091400200,5/16/2014,252700.0,2,1.50,1070,9643,1.0,,NONE,...,7 Average,1070,0.0,1985,,98030,47.3533,-122.166,1220,8386
40,5547700270,7/15/2014,625000.0,4,2.50,2570,5520,2.0,,NONE,...,9 Better,2570,0.0,2000,,98074,47.6145,-122.027,2470,5669
55,9822700295,5/12/2014,885000.0,4,2.50,2830,5000,2.0,,NONE,...,9 Better,2830,0.0,1995,0.0,98105,47.6597,-122.290,1950,5000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21578,5087900040,10/17/2014,350000.0,4,2.75,2500,5995,2.0,,NONE,...,8 Good,2500,0.0,2008,0.0,98042,47.3749,-122.107,2530,5988
21582,8956200760,10/13/2014,541800.0,4,2.50,3118,7866,2.0,,AVERAGE,...,9 Better,3118,0.0,2014,0.0,98001,47.2931,-122.264,2673,6500
21586,844000965,6/26/2014,224000.0,3,1.75,1500,11968,1.0,,NONE,...,6 Low Average,1500,0.0,2014,0.0,98010,47.3095,-122.002,1320,11303
21587,7852140040,8/25/2014,507250.0,3,2.50,2270,5536,2.0,,NONE,...,8 Good,2270,0.0,2003,0.0,98065,47.5389,-121.881,2270,5731


In [12]:
na_waterfront['view'].value_counts()

NONE         2110
AVERAGE       121
GOOD           73
FAIR           39
EXCELLENT      27
Name: view, dtype: int64

In [13]:
#comparing view values for waterfront = YES
yes_waterfront = df[df['waterfront']=="YES"]
yes_waterfront['view'].value_counts()

EXCELLENT    123
GOOD          14
AVERAGE        7
FAIR           1
Name: view, dtype: int64

Any property with a waterfront will have a non-NONE view. Since there are 2110 NONE views in our NA waterfront set, it seems safer to assume NO as the default for waterfront.

In [14]:
view_waterfront = df[(df['waterfront'].isna()) & (df['view'] == "NONE")]
view_waterfront

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.00,1180,5650,1.0,,NONE,...,7 Average,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
10,1736800520,4/3/2015,662500.0,3,2.50,3560,9796,1.0,,NONE,...,8 Good,1860,1700.0,1965,0.0,98007,47.6007,-122.145,2210,8925
23,8091400200,5/16/2014,252700.0,2,1.50,1070,9643,1.0,,NONE,...,7 Average,1070,0.0,1985,,98030,47.3533,-122.166,1220,8386
40,5547700270,7/15/2014,625000.0,4,2.50,2570,5520,2.0,,NONE,...,9 Better,2570,0.0,2000,,98074,47.6145,-122.027,2470,5669
55,9822700295,5/12/2014,885000.0,4,2.50,2830,5000,2.0,,NONE,...,9 Better,2830,0.0,1995,0.0,98105,47.6597,-122.290,1950,5000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21567,2025049203,6/10/2014,399950.0,2,1.00,710,1157,2.0,,NONE,...,7 Average,710,0.0,1943,0.0,98102,47.6413,-122.329,1370,1173
21578,5087900040,10/17/2014,350000.0,4,2.75,2500,5995,2.0,,NONE,...,8 Good,2500,0.0,2008,0.0,98042,47.3749,-122.107,2530,5988
21586,844000965,6/26/2014,224000.0,3,1.75,1500,11968,1.0,,NONE,...,6 Low Average,1500,0.0,2014,0.0,98010,47.3095,-122.002,1320,11303
21587,7852140040,8/25/2014,507250.0,3,2.50,2270,5536,2.0,,NONE,...,8 Good,2270,0.0,2003,0.0,98065,47.5389,-121.881,2270,5731


In [15]:
df['waterfront'].fillna("NO", inplace=True)

In [16]:
print(df['yr_renovated'].isna().sum())
df['yr_renovated'].value_counts()

3842


0.0       17011
2014.0       73
2003.0       31
2013.0       31
2007.0       30
          ...  
1946.0        1
1959.0        1
1971.0        1
1951.0        1
1954.0        1
Name: yr_renovated, Length: 70, dtype: int64

The "yr_renovated" column has 3842 null values and thousands of 0.0 values. These 0.0 values might be an indicator for a house that has never been renovated.

In [17]:
df['yr_renovated'].fillna(0, inplace=True)

In [18]:
df['was_renovated'] = df['yr_renovated'] != 0.0

In [19]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,was_renovated
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,NO,NONE,...,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650,False
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639,True
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,NO,NONE,...,770,0.0,1933,0.0,98028,47.7379,-122.233,2720,8062,False
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,NO,NONE,...,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000,False
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,NO,NONE,...,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503,False


In [20]:
df['yr_renovated'].value_counts()

0.0       20853
2014.0       73
2003.0       31
2013.0       31
2007.0       30
          ...  
1946.0        1
1959.0        1
1971.0        1
1951.0        1
1954.0        1
Name: yr_renovated, Length: 70, dtype: int64

In [21]:
df['yr_built'].describe()

count    21597.000000
mean      1970.999676
std         29.375234
min       1900.000000
25%       1951.000000
50%       1975.000000
75%       1997.000000
max       2015.000000
Name: yr_built, dtype: float64

In [22]:
renovation = df[df['yr_renovated'] > 0]
renovation['yr_renovated'].describe()

count     744.000000
mean     1995.928763
std        15.599946
min      1934.000000
25%      1987.000000
50%      2000.000000
75%      2007.250000
max      2015.000000
Name: yr_renovated, dtype: float64

In [23]:
#replace yr_renovated == 0 with the associated year in yr_built

df.loc[df['yr_renovated'] == 0, ['yr_renovated']] = df['yr_built']

In [24]:
df['yr_renovated'].value_counts()

2014.0    632
2005.0    479
2006.0    473
2004.0    455
2003.0    450
         ... 
1901.0     28
1902.0     26
1933.0     24
1935.0     20
1934.0     15
Name: yr_renovated, Length: 116, dtype: int64

In [25]:
#sanity check 
df[df['was_renovated'] == False]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,was_renovated
0,7129300520,10/13/2014,221900.0,3,1.00,1180,5650,1.0,NO,NONE,...,1180,0.0,1955,1955.0,98178,47.5112,-122.257,1340,5650,False
2,5631500400,2/25/2015,180000.0,2,1.00,770,10000,1.0,NO,NONE,...,770,0.0,1933,1933.0,98028,47.7379,-122.233,2720,8062,False
3,2487200875,12/9/2014,604000.0,4,3.00,1960,5000,1.0,NO,NONE,...,1050,910.0,1965,1965.0,98136,47.5208,-122.393,1360,5000,False
4,1954400510,2/18/2015,510000.0,3,2.00,1680,8080,1.0,NO,NONE,...,1680,0.0,1987,1987.0,98074,47.6168,-122.045,1800,7503,False
5,7237550310,5/12/2014,1230000.0,4,4.50,5420,101930,1.0,NO,NONE,...,3890,1530.0,2001,2001.0,98053,47.6561,-122.005,4760,101930,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,5/21/2014,360000.0,3,2.50,1530,1131,3.0,NO,NONE,...,1530,0.0,2009,2009.0,98103,47.6993,-122.346,1530,1509,False
21593,6600060120,2/23/2015,400000.0,4,2.50,2310,5813,2.0,NO,NONE,...,2310,0.0,2014,2014.0,98146,47.5107,-122.362,1830,7200,False
21594,1523300141,6/23/2014,402101.0,2,0.75,1020,1350,2.0,NO,NONE,...,1020,0.0,2009,2009.0,98144,47.5944,-122.299,1020,2007,False
21595,291310100,1/16/2015,400000.0,3,2.50,1600,2388,2.0,NO,NONE,...,1600,0.0,2004,2004.0,98027,47.5345,-122.069,1410,1287,False


In [26]:
df['was_renovated'].value_counts()

False    20853
True       744
Name: was_renovated, dtype: int64

In [27]:
print(df['view'].isna().sum())
df['view'].value_counts()

63


NONE         19422
AVERAGE        957
GOOD           508
FAIR           330
EXCELLENT      317
Name: view, dtype: int64

In [28]:
null_view = df[df['view'].isna()]
null_view['waterfront'].value_counts()

NO     62
YES     1
Name: waterfront, dtype: int64

In [29]:
df['view'].fillna("NONE", inplace=True)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     21597 non-null  object 
 9   view           21597 non-null  object 
 10  condition      21597 non-null  object 
 11  grade          21597 non-null  object 
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   21597 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

In [31]:
#sqft_basement is an object datatype
df['sqft_basement'].value_counts()

0.0       12826
?           454
600.0       217
500.0       209
700.0       208
          ...  
207.0         1
1798.0        1
2490.0        1
784.0         1
1548.0        1
Name: sqft_basement, Length: 304, dtype: int64

In [32]:
question_basement = df[df['sqft_basement'] == "?"]
question_basement

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,was_renovated
6,1321400060,6/27/2014,257500.0,3,2.25,1715,6819,2.0,NO,NONE,...,1715,?,1995,1995.0,98003,47.3097,-122.327,2238,6819,False
18,16000397,12/5/2014,189000.0,2,1.00,1200,9850,1.0,NO,NONE,...,1200,?,1921,1921.0,98002,47.3089,-122.210,1060,5095,False
42,7203220400,7/7/2014,861990.0,5,2.75,3595,5639,2.0,NO,NONE,...,3595,?,2014,2014.0,98053,47.6848,-122.016,3625,5639,False
79,1531000030,3/23/2015,720000.0,4,2.50,3450,39683,2.0,NO,NONE,...,3450,?,2002,2002.0,98010,47.3420,-122.025,3350,39750,False
112,2525310310,9/16/2014,272500.0,3,1.75,1540,12600,1.0,NO,NONE,...,1160,?,1980,1980.0,98038,47.3624,-122.031,1540,11656,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21442,3226049565,7/11/2014,504600.0,5,3.00,2360,5000,1.0,NO,NONE,...,1390,?,2008,2008.0,98103,47.6931,-122.330,2180,5009,False
21447,1760650900,7/21/2014,337500.0,4,2.50,2330,4907,2.0,NO,NONE,...,2330,?,2013,2013.0,98042,47.3590,-122.081,2300,3836,False
21473,6021503707,1/20/2015,352500.0,2,2.50,980,1010,3.0,NO,NONE,...,980,?,2008,2008.0,98117,47.6844,-122.387,980,1023,False
21519,2909310100,10/15/2014,332000.0,4,2.50,2380,5737,2.0,NO,NONE,...,2380,?,2010,2010.0,98023,47.2815,-122.356,2380,5396,False


In [33]:
df['sqft_basement2'] = df['sqft_living'] - df['sqft_above']
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,was_renovated,sqft_basement2
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,NO,NONE,...,0.0,1955,1955.0,98178,47.5112,-122.257,1340,5650,False,0
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639,True,400
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,NO,NONE,...,0.0,1933,1933.0,98028,47.7379,-122.233,2720,8062,False,0
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,NO,NONE,...,910.0,1965,1965.0,98136,47.5208,-122.393,1360,5000,False,910
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,NO,NONE,...,0.0,1987,1987.0,98074,47.6168,-122.045,1800,7503,False,0


In [36]:
df['sqft_basement2'].value_counts()

0       13110
600       221
700       218
500       214
800       206
        ...  
792         1
2590        1
935         1
2390        1
248         1
Name: sqft_basement2, Length: 306, dtype: int64

In [None]:
#454 counts of ? values for sqft_basement
#convert these to 0.0 
#df[df['sqft_basement'] == "?"] = "0.0"

#df['sqft_basement'] = df['sqft_basement'].replace("?", "0.0").astype(float)


In [None]:
#check that ? has been replaced with 0.0
df['sqft_basement'].value_counts()

In [51]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,was_renovated,sqft_basement2
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,NO,NONE,...,0.0,1955,1955.0,98178,47.5112,-122.257,1340,5650,False,0
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639,True,400
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,NO,NONE,...,0.0,1933,1933.0,98028,47.7379,-122.233,2720,8062,False,0
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,NO,NONE,...,910.0,1965,1965.0,98136,47.5208,-122.393,1360,5000,False,910
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,NO,NONE,...,0.0,1987,1987.0,98074,47.6168,-122.045,1800,7503,False,0


In [None]:
check_zero_basement = df[df['sqft_basement'] == "700.0"]
check_zero_basement.shape
#why are these strings?

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              21597 non-null  int64  
 1   date            21597 non-null  object 
 2   price           21597 non-null  float64
 3   bedrooms        21597 non-null  int64  
 4   bathrooms       21597 non-null  float64
 5   sqft_living     21597 non-null  int64  
 6   sqft_lot        21597 non-null  int64  
 7   floors          21597 non-null  float64
 8   waterfront      21597 non-null  object 
 9   view            21597 non-null  object 
 10  condition       21597 non-null  object 
 11  grade           21597 non-null  object 
 12  sqft_above      21597 non-null  int64  
 13  sqft_basement   21597 non-null  object 
 14  yr_built        21597 non-null  int64  
 15  yr_renovated    21597 non-null  float64
 16  zipcode         21597 non-null  int64  
 17  lat             21597 non-null 

In [44]:
df['sqft_living'].value_counts()

1300    138
1400    135
1440    133
1660    129
1010    129
       ... 
4970      1
2905      1
2793      1
4810      1
1975      1
Name: sqft_living, Length: 1034, dtype: int64

In [45]:
df['sqft_lot'].value_counts()

5000      358
6000      290
4000      251
7200      220
7500      119
         ... 
1448        1
38884       1
17313       1
35752       1
315374      1
Name: sqft_lot, Length: 9776, dtype: int64

In [None]:
print(df['view'].isna().sum())
df['view'].value_counts()

In [46]:
df['sqft_above'].value_counts()

1300    212
1010    210
1200    206
1220    192
1140    184
       ... 
2601      1
440       1
2473      1
2441      1
1975      1
Name: sqft_above, Length: 942, dtype: int64

In [47]:
df['sqft_basement'].value_counts()

0.0       12826
?           454
600.0       217
500.0       209
700.0       208
          ...  
207.0         1
1798.0        1
2490.0        1
784.0         1
1548.0        1
Name: sqft_basement, Length: 304, dtype: int64

In [48]:
df['yr_built'].value_counts()

2014    559
2006    453
2005    450
2004    433
2003    420
       ... 
1933     30
1901     29
1902     27
1935     24
1934     21
Name: yr_built, Length: 116, dtype: int64

In [49]:
df['yr_renovated'].value_counts()

2014.0    632
2005.0    479
2006.0    473
2004.0    455
2003.0    450
         ... 
1901.0     28
1902.0     26
1933.0     24
1935.0     20
1934.0     15
Name: yr_renovated, Length: 116, dtype: int64

In [37]:
df['bedrooms'].value_counts()

3     9824
4     6882
2     2760
5     1601
6      272
1      196
7       38
8       13
9        6
10       3
11       1
33       1
Name: bedrooms, dtype: int64

In [41]:
df[df['bedrooms'] > 8]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,was_renovated,sqft_basement2
4092,1997200215,5/7/2014,599999.0,9,4.5,3830,6988,2.5,NO,NONE,...,1380.0,1938,1938.0,98103,47.6927,-122.338,1460,6291,False,1380
4231,2902200015,1/6/2015,700000.0,9,3.0,3680,4400,2.0,NO,NONE,...,850.0,1908,1908.0,98102,47.6374,-122.324,1960,2450,False,850
6073,9822700190,8/8/2014,1280000.0,9,4.5,3650,5000,2.0,NO,NONE,...,1120.0,1915,2010.0,98105,47.6604,-122.289,2510,5000,True,1120
8537,424049043,8/11/2014,450000.0,9,7.5,4050,6504,2.0,NO,NONE,...,0.0,1996,1996.0,98144,47.5923,-122.301,1448,3866,False,0
8748,1773100755,8/21/2014,520000.0,11,3.0,3000,4960,2.0,NO,NONE,...,600.0,1918,1999.0,98106,47.556,-122.363,1420,4960,True,600
13301,627300145,8/14/2014,1150000.0,10,5.25,4590,10920,1.0,NO,AVERAGE,...,2090.0,2008,2008.0,98004,47.5861,-122.113,2730,10400,False,2090
15147,5566100170,10/29/2014,650000.0,10,2.0,3610,11914,2.0,NO,NONE,...,600.0,1958,1958.0,98006,47.5705,-122.175,2040,11914,False,600
15856,2402100895,6/25/2014,640000.0,33,1.75,1620,6000,1.0,NO,NONE,...,580.0,1947,1947.0,98103,47.6878,-122.331,1330,4700,False,580
16830,8823900290,3/17/2015,1400000.0,9,4.0,4620,5508,2.5,NO,NONE,...,750.0,1915,1915.0,98105,47.6684,-122.309,2710,4320,False,750
18428,8823901445,3/13/2015,934000.0,9,3.0,2820,4480,2.0,NO,NONE,...,940.0,1918,1918.0,98105,47.6654,-122.307,2460,4400,False,940


In [42]:
df['bedrooms'] = df['bedrooms'].replace(33, 3)

In [43]:
df['bedrooms'].value_counts()

3     9825
4     6882
2     2760
5     1601
6      272
1      196
7       38
8       13
9        6
10       3
11       1
Name: bedrooms, dtype: int64

In [38]:
df['bathrooms'].value_counts()

2.50    5377
1.00    3851
1.75    3048
2.25    2047
2.00    1930
1.50    1445
2.75    1185
3.00     753
3.50     731
3.25     589
3.75     155
4.00     136
4.50     100
4.25      79
0.75      71
4.75      23
5.00      21
5.25      13
5.50      10
1.25       9
6.00       6
5.75       4
0.50       4
8.00       2
6.25       2
6.75       2
6.50       2
7.50       1
7.75       1
Name: bathrooms, dtype: int64

The number 454 keeps appearing in our value counts for the value of 0.0. If these 0.0 values are all in the same rows, it might indicate that there is missing data for those particular properties.

In [None]:
no_sqftliving = df[df["sqft_living"] == 0.0]
no_sqftliving.head()

These rows seem like extraneous entries. We are going to remove them since these rows are essentially meaningless.

id is a good column to filter out these rows by. A house might have a sqft_basement of 0.0 indicating no basement, but a id of 0.0 is indicative of an extraneous row.

In [None]:
df["id"].value_counts()

In [None]:
no_id = df[df["id"] == 0.0]
no_id.shape

In [None]:
no_id.head()

As we gleaned earlier, there are 454 rows of complete 0.0 values. 

In [None]:
df_cleaned = df.drop(df[df['id'] == 0.0].index)
df_cleaned.head()

In [None]:
#check to see if 454 rows have been dropped
df_cleaned.shape

Our cleaned dataframe has 21143 rows. Our starting dataframe had 21597 rows, which is a 454 row difference. 

In [53]:
#checking id values to confirm there are no more 0.0
df['id'].value_counts()

795000620     3
1825069031    2
2019200220    2
7129304540    2
1781500435    2
             ..
7812801125    1
4364700875    1
3021059276    1
880000205     1
1777500160    1
Name: id, Length: 21420, dtype: int64

There are multiple entries with the same id. These are probably the same properties under multiple transactions. We should check an example to see if this is the case.

In [52]:
#id = 795000620 appears 3 times in our dataset
multiple_id = df[df['id'] == 795000620]
multiple_id

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,was_renovated,sqft_basement2
17588,795000620,9/24/2014,115000.0,3,1.0,1080,6250,1.0,NO,NONE,...,0.0,1950,1950.0,98168,47.5045,-122.33,1070,6250,False,0
17589,795000620,12/15/2014,124000.0,3,1.0,1080,6250,1.0,NO,NONE,...,0.0,1950,1950.0,98168,47.5045,-122.33,1070,6250,False,0
17590,795000620,3/11/2015,157000.0,3,1.0,1080,6250,1.0,NO,NONE,...,0.0,1950,1950.0,98168,47.5045,-122.33,1070,6250,False,0


There are some differences in the entries with the same id. We can see the date is different as we should expect if every row is an individual transaction. The price is also different which makes sense. The "waterfront" column has both "NO" and NaN values, reinforcing the idea that we should treat NaN in that column as "NO". The "yr_renovated" column also shows the same behavior.

In [None]:
#replace nulls
#waterfront: null -> NO
#yr_renovated: null -> 0
#view: null -> NONE

df_cleaned['waterfront'].fillna("NO", inplace=True)
df_cleaned['yr_renovated'].fillna(0, inplace=True)
df_cleaned['view'].fillna("NONE", inplace=True)

In [None]:
df_cleaned.info()

All nulls have been replaced with appropriate alternate values.

In [None]:
df_cleaned['price'].describe()

In [None]:
fig,ax = plt.subplots()
ax.hist(df_cleaned['price'], bins = 75);

The histogram of price shows a right-skewed distribution.