In [1]:
# Your code here - remember to use markdown cells for comments as well!
import os
import re
import math
import json

import pandas as pd
import numpy as np

from scipy import stats, linalg

from sklearn import linear_model
from sklearn import neighbors
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error


import missingno as msno

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# set style
sns.set_style('whitegrid')
# overriding font size and line width
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

# map visualization
import folium

# don't print matching warnings
import warnings
warnings.filterwarnings('ignore') 



In [2]:
# Load kc_house_data.csv from the working directory into kc_df.
# parse_dates makes pandas parse the 'date' column as datetimes instead of strings.
kc_df = pd.read_csv("kc_house_data.csv", parse_dates = ['date'])

In [None]:
# Heatmap of kc_df.isnull(): one row per record, one column per field,
# so missing cells stand out from present ones.
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(
    kc_df.isnull(),
    yticklabels=False,
    cbar=False,
    cmap='viridis',
    ax=ax,
)

In [None]:
# Distribution of sqft_living15: 50-bin histogram with a KDE curve on top.
# NOTE(review): distplot is deprecated in recent seaborn releases.
sns.distplot(
    kc_df['sqft_living15'],
    bins=50,
    kde=True,
    label='sqft_living15',
)

In [None]:
# Distribution of price: 50-bin histogram with a KDE curve on top.
# NOTE(review): distplot is deprecated in recent seaborn releases.
sns.distplot(
    kc_df['price'],
    bins=50,
    kde=True,
    label='price',
)

In [None]:

# Normalise yr_built to a plain calendar year.
# pd.to_datetime() on bare integers treats them as nanosecond offsets from the
# Unix epoch, which would turn every build year into 1970. Parsing the value as
# a '%Y' string keeps the real year, and the step is safe to run more than once.
# Assumes yr_built holds whole years (no missing values) - TODO confirm.
kc_df['yr_built'] = pd.to_datetime(
    kc_df['yr_built'].astype(str), format='%Y'
).dt.year
# yr_renovated is intentionally left as loaded.
kc_df.head()

In [None]:
# (rows, columns) of kc_df at this point in the notebook
kc_df.shape

In [None]:
# Start the per-zipcode summary table: one row per distinct zipcode, sorted.
# Only the key column is kept at this stage, so it is built directly instead of
# averaging every column of kc_df first (a mean over non-numeric columns raises
# in recent pandas and the averaged values were discarded anyway).
zipcode_data = (
    kc_df[['zipcode']]
    .dropna()
    .drop_duplicates()
    .sort_values('zipcode')
    .reset_index(drop=True)
)

In [None]:
# Mean sale price per zipcode, rounded down to a whole number,
# joined onto the summary table as a 'price' column.
mean_price = kc_df.groupby('zipcode', as_index=False)['price'].mean()
mean_price['price'] = np.floor(mean_price['price'])
zipcode_data = zipcode_data.merge(mean_price, on='zipcode')

In [None]:
# Mean sqft_living15 per zipcode, rounded down to a whole number,
# joined onto the summary table as a 'sqft_living15' column.
mean_sqft15 = kc_df.groupby('zipcode', as_index=False)['sqft_living15'].mean()
mean_sqft15['sqft_living15'] = np.floor(mean_sqft15['sqft_living15'])
zipcode_data = zipcode_data.merge(mean_sqft15, on='zipcode')

In [None]:
# Number of rows (listings) per zipcode.
# groupby(...).size() counts rows directly, so kc_df does not need a temporary
# all-ones 'count' column and no unrelated columns (dates, text) get summed.
temp_df = kc_df.groupby('zipcode').size().reset_index(name='count')

# Attach the counts to the per-zipcode summary table
zipcode_data = pd.merge(zipcode_data, temp_df, on='zipcode')

In [None]:
# Peek at the first two rows of the per-zipcode summary table
zipcode_data.head(2)

In [None]:
# Drop the per-zipcode count frame; its values were already merged into zipcode_data
del temp_df

In [None]:
# Keep yr_built as a plain calendar year.
# pd.to_datetime() on bare integers reads them as nanoseconds since the Unix
# epoch (every year would become 1970), so the value is parsed as a '%Y' string
# instead. The conversion maps a year onto itself, so re-running it is harmless.
# Assumes yr_built holds whole years (no missing values) - TODO confirm.
kc_df['yr_built'] = pd.to_datetime(
    kc_df['yr_built'].astype(str), format='%Y'
).dt.year
# yr_renovated is intentionally left as loaded.

kc_df.head()

In [None]:
# Tag every row with 1 so that the per-zipcode sum of 'count' is the number of
# listings. The column stays on kc_df; later cells select it.
kc_df['count'] = 1

# Per-zipcode totals. Restricted to numeric columns: summing datetime or text
# columns either raises or produces meaningless output.
kc_df.groupby('zipcode').sum(numeric_only=True)



In [None]:
# total = kc_df.isnull().sum().sort_values(ascending=False)

# calc percent of total null values

# missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# missing_data.head()

In [None]:
# Column dtypes, inspected before the type conversions further down
kc_df.dtypes

In [None]:
# First five rows of kc_df
kc_df.head()

In [None]:
# (rows, columns) of kc_df, now including the helper 'count' column added above
kc_df.shape

In [None]:
# Display all missing data
msno.matrix(kc_df)

In [None]:
# Treat zipcode as a label (string) rather than a number
kc_df['zipcode'] = kc_df['zipcode'].astype('str')

# Parse sqft_basement as numeric. Entries that cannot be parsed become NaN,
# so the resulting column is float, not int.
kc_df['sqft_basement'] = pd.to_numeric(kc_df['sqft_basement'], errors='coerce')

# Categorical columns. `astype('category', ordered=...)` is no longer accepted
# by pandas (TypeError); the ordering flag has to be carried by a
# CategoricalDtype. Categories are inferred from the values present.
kc_df['grade'] = kc_df['grade'].astype(
    pd.api.types.CategoricalDtype(ordered=False)
)
kc_df['condition'] = kc_df['condition'].astype(
    pd.api.types.CategoricalDtype(ordered=True)
)

# sqft_living15 / sqft_lot15 are deliberately kept; later cells use them.

# Sort rows chronologically by sale date
kc_df = kc_df.sort_values(by=['date'])

In [None]:
# Mean of every numeric column per zipcode.
# - numeric_only=True: zipcode is a string and grade/condition are categorical
#   by now, and averaging those raises in recent pandas.
# - the all-ones helper column 'count' (added to kc_df earlier) is excluded so
#   it cannot collide with the real listing count merged in below; a collision
#   would yield 'count_x'/'count_y' and leave no 'count' column for the map.
zipcode_data = (
    kc_df.drop(columns=['count'], errors='ignore')
    .groupby('zipcode')
    .mean(numeric_only=True)
    .reset_index()
)

# Later cells select kc_df['count'], so (re)create the helper column here.
kc_df['count'] = 1

# Number of rows (listings) per zipcode
temp_df = kc_df.groupby('zipcode').size().reset_index(name='count')

# merge the count values into the zipcode data
zipcode_data = pd.merge(zipcode_data, temp_df, how='left', on=['zipcode'])
zipcode_data.head(2)

In [None]:
# Path of the zipcode-area GeoJSON
geo_data_file = os.path.join('data', 'king_county_wa_zipcode_area.geojson')

# Load the GeoJSON
with open(geo_data_file, 'r') as jsonFile:
    geo_data = json.load(jsonFile)

# Keep only the features whose ZIPCODE also occurs in zipcode_data.
# The lookup set is built once instead of once per feature.
known_zipcodes = set(zipcode_data['zipcode'].unique())
geozips = [
    feature
    for feature in geo_data['features']
    if feature['properties']['ZIPCODE'] in known_zipcodes
]

# Wrap the surviving features in a new FeatureCollection
new_json = {'type': 'FeatureCollection', 'features': geozips}

# Save the updated JSON object; the context manager guarantees the file is
# flushed and closed before the map cell reads it back.
with open("cleaned_geodata.json", "w") as cleaned_file:
    json.dump(new_json, cleaned_file,
              sort_keys=True, indent=4, separators=(',', ': '))

In [None]:
def show_zipcode_map(col):
    """Build, save and return a zipcode choropleth coloured by ``col``.

    Reads the notebook-level ``zipcode_data`` frame and the
    ``cleaned_geodata.json`` file in the working directory.

    Parameters
    ----------
    col : str
        Column of ``zipcode_data`` that drives the fill colour. It is also
        used as the legend title and as the stem of the saved HTML file
        (``col + '.html'``).

    Returns
    -------
    The folium map object, so the notebook renders it as cell output.
    """
    # GeoJSON with zipcode outlines; features are matched on properties.ZIPCODE
    king_geo = "cleaned_geodata.json"

    # Base map with a fixed centre and zoom level
    m = folium.Map(location=[47.35, -121.9], zoom_start=9,
                   detect_retina=True, control_scale=False)

    # Choropleth layer. The requested column is passed through; it used to be
    # hard-coded to 'count', which silently ignored `col`.
    folium.Choropleth(
        geo_data=king_geo,
        name='choropleth',
        data=zipcode_data,
        columns=['zipcode', col],
        key_on='feature.properties.ZIPCODE',
        fill_color='OrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name=col,
    ).add_to(m)

    folium.LayerControl().add_to(m)
    # Save map based on feature of interest
    m.save(col + '.html')

    return m
    
# Render the map for the 'count' column; the returned map is this cell's output
# and a copy is written to count.html
show_zipcode_map('count')

In [None]:
# Modelling frame: a copy of kc_df without the 'id' and 'date' columns
# (kc_df itself is left untouched).
data = kc_df.drop(columns=['id', 'date'])

In [None]:
# Scale every column of `data` with RobustScaler.
# fit_transform returns a bare array, so the result is wrapped back into a
# DataFrame that keeps data's row index and column names; without them the
# scaled columns would only be addressable as 0..n-1.
scaler = preprocessing.RobustScaler()
robust_scaled_df = pd.DataFrame(
    scaler.fit_transform(data),
    index=data.index,
    columns=data.columns,
)

In [None]:
import statsmodels.api as sms

# Ordinary least squares: price regressed on sqft_living15.
# add_constant supplies the intercept column.
y = data['price']
X = sms.add_constant(data['sqft_living15'])

model = sms.OLS(y, X).fit()
model.summary()

In [None]:
# Distinct zipcode values present in kc_df
kc_df['zipcode'].unique()

In [None]:
# Pairwise relationships between price and a few candidate predictors.
# - No `hue`: price is continuous, so colouring by it created one hue level per
#   distinct price and removed price itself from the plotted variables.
# - `height` is the current name of the former `size` argument.
# NOTE(review): 'count' is all ones and 'zipcode' is a string by this point,
# so neither adds information to the grid - consider dropping them.
cols = ['price', 'sqft_living15', 'sqft_lot15', 'count', 'yr_built', 'zipcode']
sns.pairplot(kc_df[cols], diag_kind='kde',
             plot_kws={'alpha': 0.6, 's': 80, 'edgecolor': 'k'},
             height=4)

In [None]:
# Heat-map input: one [latitude, longitude, weight] triple per listing.
# The weight is the sale price divided by 10,000,000.
# Plain float arrays are used; `dtype=pd.Series` is not a real dtype and made
# numpy fall back to generic object arrays.
lat = kc_df['lat'].to_numpy(dtype=float)
lon = kc_df['long'].to_numpy(dtype=float)
mag = kc_df['price'].to_numpy(dtype=float) / 10000000

# Stack the three columns side by side and convert to nested Python lists
heatmap_data = np.column_stack((lat, lon, mag)).tolist()

In [None]:
# pandas and folium are already imported in the first cell; only the plugin is new.
from folium.plugins import HeatMap

# NOTE(review): recent folium releases may no longer accept 'stamentoner' as a
# built-in tile name - confirm against the installed version.
hmap = folium.Map(location=[47.55, -122.0], zoom_start=10, tiles='stamentoner')

# Heat layer over the [lat, lon, weight] triples in heatmap_data.
# max_val uses the largest weight, `mag` from the heat-map data cell
# (the original referenced an undefined name `mg`, which raised NameError).
hm_wide = HeatMap(heatmap_data,
                  min_opacity=0.7,
                  max_val=mag.max(),
                  radius=2, blur=2,
                  max_zoom=1,
                  )

hmap.add_child(hm_wide)

In [3]:
import surprise