In [22]:
import pandas as pd
import numpy as np
import seaborn as sns

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

import matplotlib.pyplot as plt
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases dropped the old names, so use whichever name this install ships.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

from scipy.stats import kurtosis, skew
import scipy.stats as stats

from math import sqrt

import statsmodels.formula.api as smf

### Import the cleaned data

In [23]:
# Load the cleaned house-sales data (price plus per-house attributes such as
# bedrooms, sqft_living and zipcode) from a path relative to the notebook.
df = pd.read_csv("cleaned_data_kc_house_data")

In [24]:
# Preview the first two rows to check the columns that were loaded.
df.head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639


### Import the zipcode-to-town lookup CSV

In [32]:
# Load the lookup table that pairs each zipcode with a town name.
df1 = pd.read_csv("zipcode_town.csv")

In [33]:
# Preview the zipcode/town lookup rows.
df1.head()

Unnamed: 0,zipcode,town
0,98001,Auburn
1,98002,Auburn
2,98003,Federal Way
3,98004,Bellevue
4,98005,Bellevue


### merge both dataframes on zip code and pull in town

In [34]:
# Left-join the town name onto every sale by zipcode, keeping all rows of df.
# validate="many_to_one" makes pandas raise a MergeError if the lookup table
# lists any zipcode more than once; without it such duplicates would silently
# multiply the matching sale rows.
df2 = df.merge(df1, on="zipcode", how="left", validate="many_to_one")

In [35]:
# Check row count, dtypes and non-null counts after the merge; a town non-null
# count below the row count would mean some zipcodes had no match in the lookup.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21082 entries, 0 to 21081
Data columns (total 22 columns):
id               21082 non-null int64
date             21082 non-null object
price            21082 non-null int64
bedrooms         21082 non-null int64
bathrooms        21082 non-null float64
sqft_living      21082 non-null int64
sqft_lot         21082 non-null int64
floors           21082 non-null float64
waterfront       21082 non-null int64
view             21082 non-null int64
condition        21082 non-null int64
grade            21082 non-null int64
sqft_above       21082 non-null int64
sqft_basement    21082 non-null int64
yr_built         21082 non-null int64
yr_renovated     21082 non-null int64
zipcode          21082 non-null int64
lat              21082 non-null float64
long             21082 non-null float64
sqft_living15    21082 non-null int64
sqft_lot15       21082 non-null int64
town             21082 non-null object
dtypes: float64(4), int64(16), object(2)
memo

In [38]:
# Inspect the last rows to see the town values attached by the merge.
df2.tail()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,town
21077,263000018,5/21/2014,360000,3,2.5,1530,1131,3.0,0,0,3,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509,Seattle
21078,6600060120,2/23/2015,400000,4,2.5,2310,5813,2.0,0,0,3,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200,Seattle
21079,1523300141,6/23/2014,402101,2,0.75,1020,1350,2.0,0,0,3,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007,Seattle
21080,291310100,1/16/2015,400000,3,2.5,1600,2388,2.0,0,0,3,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287,Issaquah
21081,1523300157,10/15/2014,325000,2,0.75,1020,1076,2.0,0,0,3,7,1020,0,2008,0,98144,47.5941,-122.299,1020,1357,Seattle


### remove zipcode

In [49]:
# Build df3 as df2 without the zipcode column; df2 itself is left untouched.
df3 = df2.drop("zipcode", axis="columns")

In [50]:
# Confirm zipcode is gone and review the remaining columns and dtypes.
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21082 entries, 0 to 21081
Data columns (total 21 columns):
id               21082 non-null int64
date             21082 non-null object
price            21082 non-null int64
bedrooms         21082 non-null int64
bathrooms        21082 non-null float64
sqft_living      21082 non-null int64
sqft_lot         21082 non-null int64
floors           21082 non-null float64
waterfront       21082 non-null int64
view             21082 non-null int64
condition        21082 non-null int64
grade            21082 non-null int64
sqft_above       21082 non-null int64
sqft_basement    21082 non-null int64
yr_built         21082 non-null int64
yr_renovated     21082 non-null int64
lat              21082 non-null float64
long             21082 non-null float64
sqft_living15    21082 non-null int64
sqft_lot15       21082 non-null int64
town             21082 non-null object
dtypes: float64(4), int64(15), object(2)
memory usage: 4.2+ MB


In [52]:
# NOTE(review): the saved output of this cell already lists town as category,
# although the cast to category only appears in the conversion cell that follows.
# The cell therefore looks like it was run out of order; re-run (or move) it after
# the dtype conversion so Restart & Run All reproduces the displayed output.
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21082 entries, 0 to 21081
Data columns (total 21 columns):
id               21082 non-null int64
date             21082 non-null object
price            21082 non-null int64
bedrooms         21082 non-null int64
bathrooms        21082 non-null float64
sqft_living      21082 non-null int64
sqft_lot         21082 non-null int64
floors           21082 non-null float64
waterfront       21082 non-null int64
view             21082 non-null int64
condition        21082 non-null int64
grade            21082 non-null int64
sqft_above       21082 non-null int64
sqft_basement    21082 non-null int64
yr_built         21082 non-null int64
yr_renovated     21082 non-null int64
lat              21082 non-null float64
long             21082 non-null float64
sqft_living15    21082 non-null int64
sqft_lot15       21082 non-null int64
town             21082 non-null category
dtypes: category(1), float64(4), int64(15), object(1)
memory usage: 4.0+ MB


### convert categorical types to category and date to datetime

In [53]:
# Cast the sale date to datetime and the discrete attributes to pandas categoricals.
#
# Every conversion reads from df3 itself. Taking the columns from `df` (the
# pre-merge frame) is only correct while df and df3 share an identical row index;
# if the merge or any later filtering changes the rows, index alignment would
# silently assign values to the wrong rows or introduce NaN.
categorical_cols = [
    "bedrooms",
    "bathrooms",
    "floors",
    "waterfront",
    "view",
    "condition",
    "grade",
    "town",
]

# pd.to_datetime replaces astype('datetime64'): pandas 2.x rejects the unit-less
# 'datetime64' dtype string.
df3["date"] = pd.to_datetime(df3["date"])

# Column-by-column assignment keeps the categorical dtype on every pandas version
# and makes the cell safe to re-run (casting a category to category is a no-op).
for col in categorical_cols:
    df3[col] = df3[col].astype("category")
