In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Data getting, cleaning, and exploring
import wrangle as w
import evaluate as ev
import explore as ex

# Python without these is hard
import pandas as pd
import numpy as np
from pydataset import data
from scipy import stats

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import sklearn.preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
# Regression Modeling
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [2]:
# Acquire and prepare the Zillow data through the project's wrangle module.
# NOTE(review): what wrangle_zillow() actually does (source, cleaning, caching)
# is defined in wrangle.py and is not visible from this notebook.
df = w.wrangle_zillow()

In [3]:
# Sanity check: (rows, columns) of the wrangled frame.
df.shape

(71525, 36)

In [4]:
# Show every column name of the wrangled frame as a plain Python list.
df.columns.tolist()

['heatingorsystemtypeid',
 'id',
 'bathroomcnt',
 'bedroomcnt',
 'buildingqualitytypeid',
 'calculatedbathnbr',
 'calculatedfinishedsquarefeet',
 'finishedsquarefeet12',
 'fips',
 'fullbathcnt',
 'latitude',
 'longitude',
 'lotsizesquarefeet',
 'propertycountylandusecode',
 'propertylandusetypeid',
 'propertyzoningdesc',
 'rawcensustractandblock',
 'regionidcity',
 'regionidcounty',
 'regionidzip',
 'roomcnt',
 'unitcnt',
 'yearbuilt',
 'structuretaxvaluedollarcnt',
 'taxvaluedollarcnt',
 'assessmentyear',
 'landtaxvaluedollarcnt',
 'taxamount',
 'censustractandblock',
 'logerror',
 'transactiondate',
 'heatingorsystemdesc',
 'la_cnty',
 'orange_cnty',
 'ventura_cnty',
 'log_error_class']

In [None]:
# Peek at the first two rows of the wrangled frame.
df.head(2)

In [None]:
# Column groups handed to the split / scale / explore helpers in later cells.
binary_vars = ['la_cnty', 'orange_cnty', 'ventura_cnty']
categorical_vars = ['fips', 'prop_land_type_id']
quant_vars = [
    'bathrooms',
    'bedrooms',
    'property_sqft',
    'latitude',
    'longitude',
    'lot_sqft',
    'year_built',
    'struct_tax_value',
    'tax_value',
    'land_tax_value',
    'tax_amount',
]

# Targets: the binned class label and the raw continuous value.
categorical_target = 'log_error_class'
continuous_target = 'log_error'

# Fail fast if any name above is not a column of `df`. Without this check a
# stale or misspelled name only surfaces later as a KeyError deep inside a
# helper, far from the list that caused it.
_expected_cols = (
    binary_vars
    + categorical_vars
    + quant_vars
    + [categorical_target, continuous_target]
)
_missing_cols = [col for col in _expected_cols if col not in df.columns]
assert not _missing_cols, f"Columns not found in df: {_missing_cols}"

In [None]:
# Split the wrangled frame into train / validate / test using a fixed seed (42).
# NOTE(review): 'log_error' is presumably the target the splitter keys on —
# confirm how it is used in wrangle.train_validate_test_split.
train, validate, test = w.train_validate_test_split(df, 'log_error', seed=42)

In [None]:
# Scale the quantitative features on all three splits, but leave the
# geographic coordinates in their original units so location plots and
# distance-based reasoning remain interpretable.
# NOTE(review): this cell rebinds train/validate/test from themselves, so
# re-running it without re-running the split presumably scales twice — verify
# against wrangle.scale_my_data.
unscaled_geo_vars = ('latitude', 'longitude')
cols_to_scale = [col for col in quant_vars if col not in unscaled_geo_vars]
train, validate, test = w.scale_my_data(train, validate, test, cols_to_scale)

In [None]:
# Confirm the (rows, columns) of each split after scaling.
train.shape, validate.shape, test.shape

In [None]:
# Inspect the first rows of the training split after the scaling step.
train.head()

In [None]:
# Univariate look at each categorical and quantitative feature.
# Uses the training split only, like every other exploration cell, so that
# validate/test rows are not inspected before modeling.
ex.explore_univariate(train, categorical_vars, quant_vars)

- Features with a lot of outliers:
    - bathrooms, property_sqft, lot_sqft, struct_tax_value, tax_value, land_tax_value, tax_amount, and log_error
- Bedrooms appear to be normally distributed
- Latitude and longitude are good indicators of population density by location. Also, latitude and longitude should not be scaled.


In [None]:
# Run the project's statistical tests on the training split, passing both
# targets plus the binary and quantitative feature lists.
# NOTE(review): which tests are run is defined in explore.py — not visible here.
ex.run_stats_on_everything(train, categorical_target, continuous_target, binary_vars, quant_vars)

In [None]:
# DISABLED: correlation heatmap of the training split.
# TODO(review): either re-enable this or delete the cell; a cell containing
# only commented-out code adds nothing to the narrative.
# plt.figure(figsize=(16,12))
# sns.heatmap(train.corr(), cmap='BuGn')
# plt.show()

In [None]:
# Bivariate exploration on the training split: each binary and quantitative
# feature against the categorical and continuous targets.
# NOTE(review): the plots/tests produced are defined in explore.py — not visible here.
ex.explore_bivariate(train, categorical_target, continuous_target, binary_vars, quant_vars)

In [None]:
# DISABLED: violin plot of bathrooms by log-error class.
# NOTE(review): `hue` is given a list of two column names; seaborn expects a
# single column name or a vector, which is presumably why this was turned
# off — confirm, then fix or delete the cell.
# sns.violinplot(x='log_error_class', 
#                y='bathrooms', 
#                data=train, 
#                split=True, 
#                hue=['la_cnty','orange_cnty'], 
#                palette="Set2")

In [None]:
# DISABLED: swarm plot of bathrooms by log-error class.
# NOTE(review): recent seaborn versions appear to use `dodge` rather than
# `split` for swarmplot — verify against the installed version, then fix or
# delete the cell.
# sns.swarmplot(x='log_error_class', y='bathrooms', data=train, split=True, palette="Set2")

In [None]:
# Plot the training observations by longitude/latitude, coloured by the
# categorical target.
# A 16x12 inch canvas replaces the original 80x60: at typical DPI the old size
# produced an image several thousand pixels on each side, which is slow to
# render and bloats the saved notebook.
fig, ax = plt.subplots(figsize=(16, 12))
sns.scatterplot(data=train, x='longitude', y='latitude', hue=categorical_target, ax=ax)
# Title the figure so it stands alone when the notebook is skimmed; the
# trailing semicolon suppresses the text repr in the cell output.
ax.set_title(f"Training observations by location, coloured by {categorical_target}");