- Intro
- Acquire
- Prepare
- Univariate Explore
- Split
- Explore/Stats
- Kmeans, binning, feature engineering
    combining features:
    - rooms (bed, bath) - ratio in one col?
    - size (square, lot, garage)
    - extras (pool, garage)
    - year
    - location & (code)
- Additional explore/stats
- takeaways about log error drivers
- modeling pre (different codes)
- modeling
- model summary
- conclusion

### INTRO

### IMPORTS/ACQUIRE

In [None]:
# Imports necessary to acquire, prepare, explore, visualize, analyze, and model data

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import env
import acquire
import prepare

from tabulate import tabulate
from IPython.display import Markdown, display

from math import sqrt
from scipy import stats

import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer

# Seed NumPy's global RNG so that estimators created later without an explicit
# random_state (e.g. the KMeans calls) give repeatable results across re-runs.
np.random.seed(123)

In [None]:
# Pull the raw Zillow data, then run it through the prepare steps in order:
# outlier removal -> null handling -> column renaming -> location prep.
df = acquire.get_zillow_data()
for prepare_step in (
    prepare.remove_outliers,
    prepare.handle_nulls,
    prepare.rename_columns,
    prepare.prepare_locs,
):
    df = prepare_step(df)

### note: move the land-use-code filter below into prepare

In [None]:
# Keep only the rows whose county land-use code is one of the codes listed
# below; every other code is dropped from the analysis.
kept_land_use_codes = ['0100', '122', '0101', '1111', '1', '1110', '0104']
df = df[df.propertycountylandusecode.isin(kept_land_use_codes)]

### note: discuss what % of the data remains, what was dropped and why, and how nulls and outliers were handled

### SPLIT
### note: move the split into a function

In [None]:
# Two-stage split into train / validate / test: first hold out 20% of df as
# test, then take 30% of what remains as validate (roughly 56% / 24% / 20% of
# df overall). random_state pins the shuffle so the same records land in each
# subset on every re-run, which is useful for exploration and modeling; the
# author suggests dropping it when employing the model.
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)


def printmd(string):
    """Render *string* as Markdown in the notebook output."""
    rendered = Markdown(string)
    display(rendered)


print()
# Report how many records each subset holds after splitting.
n_train, n_validate, n_test = len(train), len(validate), len(test)
printmd('**Number of Train Records:** {:,}'.format(n_train))
printmd('**Number of Validate Records:** {:,}'.format(n_validate))
printmd('**Number of Test Records:** {:,}'.format(n_test))

### EXPLORE

- Ask questions
- what is driving logerror?
- hypothesis: homes that don't fit normal patterns are harder to predict (taking the drivers of tax value into account) - i.e. an older home with a very high value and/or square footage, a very small home with a high value, or an unusual bed-to-bath ratio
- does location impact error (are some locations harder to predict?)

### STATS

### Feature Eng

In [None]:
from sklearn.cluster import KMeans
import sklearn.preprocessing

In [None]:
# Cluster homes on bedroom/bathroom counts. KMeans is fit on train only and
# the fitted model is then used to label all three subsets.
room_cols = ['bedrooms', 'bathrooms']
X = train[room_cols]
XVAL = validate[room_cols]
XTEST = test[room_cols]

kmeans = KMeans(n_clusters=6).fit(X)

for subset, room_features in ((train, X), (validate, XVAL), (test, XTEST)):
    subset['room_cluster'] = kmeans.predict(room_features)

# One row per cluster centre, expressed in the original feature columns.
centroids = pd.DataFrame(data=kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
# Elbow plot: fit KMeans on the current X for k = 2..11 and chart the inertia
# of each fit, to help judge a reasonable number of clusters.
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(9, 6))
    candidate_ks = range(2, 12)
    inertia_by_k = {}
    for n_clusters in candidate_ks:
        inertia_by_k[n_clusters] = KMeans(n_clusters).fit(X).inertia_
    pd.Series(inertia_by_k).plot(marker='x')
    plt.xticks(candidate_ks)
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
# Min-max scale the two size features (fit on train) and keep the result as a
# DataFrame that shares train's index and column names.
size_cols = ['square_feet', 'lot_size']
train_s = train[size_cols]
scaler = sklearn.preprocessing.MinMaxScaler().fit(train_s)
X = pd.DataFrame(
    scaler.transform(train_s),
    index=train_s.index,
    columns=train_s.columns,
)

In [None]:
# Cluster the current X into 5 groups and store the label on train.
# Only train receives a size_cluster column in this cell.
kmeans = KMeans(n_clusters=5).fit(X)

train['size_cluster'] = kmeans.predict(X)

# One row per cluster centre, in the columns of X.
centroids = pd.DataFrame(data=kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
from sklearn.cluster import AgglomerativeClustering

%matplotlib inline
# Print NumPy arrays with 5 decimal places and without scientific notation
# for small values.
np.set_printoptions(precision=5, suppress=True)

In [None]:
# Size and value columns from train, gathered into one frame.
num_cols = ['square_feet', 'lot_size', 'taxvaluedollarcnt', 'landtaxvaluedollarcnt']
num_vars = train[num_cols]

In [None]:
# Min-max scale num_vars and keep the result as a DataFrame with the same
# index and column names.
train_s = num_vars
scaler = sklearn.preprocessing.MinMaxScaler().fit(train_s)
X = pd.DataFrame(
    scaler.transform(train_s),
    index=train_s.index,
    columns=train_s.columns,
)

In [None]:
from sklearn.preprocessing import Normalizer

# Rescale every row of X to unit norm (Normalizer works per sample, not per
# column) and write the values back into the same columns.
normalizer = Normalizer()
unit_rows = normalizer.fit_transform(X[X.columns])
X[X.columns] = unit_rows

In [None]:
# Replace the DataFrame with a plain ndarray (index and column labels are lost).
X = np.array(X)

In [None]:
# Hierarchical clustering of the rows of X using single linkage.
Z = linkage(X, method='single')

In [None]:
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

# Cophenetic correlation: compares the pairwise distances between rows of X
# with the distances implied by the linkage Z.
pairwise_dists = pdist(X)
c, coph_dists = cophenet(Z, pairwise_dists)
c

In [None]:
# Cluster train on size, land-use code, garages and pool flag (4 clusters).
# NOTE(review): the features are passed unscaled and the land-use code is
# treated as a number - confirm that this is intended.
extras_cols = ['square_feet', 'propertycountylandusecode', 'garages', 'has_pool']
X = train[extras_cols]

kmeans = KMeans(n_clusters=4).fit(X)

train['extras_cluster'] = kmeans.predict(X)

# One row per cluster centre, in the columns of X.
centroids = pd.DataFrame(data=kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
# Default figure size for every plot created after this cell.
plt.rcParams["figure.figsize"] = (15,8)

In [None]:
# Bucket log_error into four negative (N4..N1) and four positive (P1..P4)
# bands. pd.cut intervals are right-closed, so a log_error of exactly 0 falls
# in N1, and values outside (-0.6, 0.6] become NaN.
bins = [-.6, -.0475, -.0244, -.0106, 0, .0125, .0283, .0586, .6]
labels = ['N4', 'N3', 'N2', 'N1', 'P1', 'P2', 'P3', 'P4']
for subset in (train, validate, test):
    subset['log_error_bin'] = pd.cut(subset['log_error'], bins=bins, labels=labels)

In [None]:
# Share of each log-error band within every room cluster (rows sum to 1),
# drawn as stacked bars. DataFrame.plot opens its own figure, so the size is
# passed to the plot call; a preceding plt.figure() would only leave an empty
# extra figure behind.
room_vs_error = pd.crosstab(train['room_cluster'], train['log_error_bin'],
                            normalize='index')
room_vs_error.plot.bar(stacked=True, figsize=(9, 6))
plt.show()

In [None]:
# Share of each log-error band within every land-use code (rows sum to 1),
# drawn as stacked bars. DataFrame.plot opens its own figure, so the size is
# passed to the plot call; a preceding plt.figure() would only leave an empty
# extra figure behind.
code_vs_error = pd.crosstab(train['propertycountylandusecode'], train['log_error_bin'],
                            normalize='index')
code_vs_error.plot.bar(stacked=True, figsize=(9, 6))
plt.show()

In [None]:
# Split train by the sign of log_error. where() blanks out non-matching rows
# and dropna() then removes every row containing any NaN.
# NOTE(review): this also drops rows that already had a NaN in some other
# column - confirm that is intended.
train_neg = train.where(train.log_error < 0).dropna()
train_pos = train.where(train.log_error > 0).dropna()

In [None]:
# Map of train properties (longitude vs latitude) coloured by land-use code.
sns.scatterplot(
    x='longitude',
    y='latitude',
    hue='propertycountylandusecode',
    data=train,
)

In [None]:
# Bucket square_feet into five ordered size bands (1 = smallest) for each
# subset, using the same fixed edges everywhere.
bins = [0, 1214, 1497, 1850, 2430, 5600]
labels = [1, 2, 3, 4, 5]
for subset in (train, validate, test):
    subset['square_feet_bin'] = pd.cut(subset['square_feet'], bins=bins, labels=labels)

In [None]:
# Bucket year_built into five ordered eras (1 = oldest) for each subset.
# Every subset is binned from its OWN year_built column: binning
# train['year_built'] and assigning the result to validate/test aligns on the
# index and leaves rows whose index is not in train as NaN.
# pd.cut intervals are right-closed, so years outside (1900, 2020] - including
# exactly 1900 - become NaN.
bins = [1900, 1930, 1950, 1970, 2000, 2020]
labels = [1, 2, 3, 4, 5]
for subset in (train, validate, test):
    subset['year_bin'] = pd.cut(subset['year_built'], bins=bins, labels=labels)

In [None]:
# Share of each land-use code within every construction-era bin (rows sum to
# 1), drawn as stacked bars. DataFrame.plot opens its own figure, so the size
# is passed to the plot call; a preceding plt.figure() would only leave an
# empty extra figure behind.
era_vs_code = pd.crosstab(train['year_bin'], train['propertycountylandusecode'],
                          normalize='index')
era_vs_code.plot.bar(stacked=True, figsize=(9, 6))
plt.show()

In [None]:
# Collapse garages and has_pool to True/False (any non-zero value -> True).
# Only train is converted in this cell.
for flag_col in ('garages', 'has_pool'):
    train[flag_col] = train[flag_col].astype(bool)

In [None]:
# Cast the garages and has_pool columns of train to int so they can be summed.
for flag_col in ('garages', 'has_pool'):
    train[flag_col] = train[flag_col].astype(int)

In [None]:
# extras = garages + has_pool, row by row.
garage_flag = train['garages']
pool_flag = train['has_pool']
train['extras'] = garage_flag + pool_flag

In [None]:
# How many train homes have each extras value.
train['extras'].value_counts()

In [None]:
# Elbow plot for location clustering: fit KMeans on latitude/longitude for
# k = 2..11 and chart the inertia of each fit.
X = train[['latitude', 'longitude']]

with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(9, 6))
    candidate_ks = range(2, 12)
    inertia_by_k = {}
    for n_clusters in candidate_ks:
        inertia_by_k[n_clusters] = KMeans(n_clusters).fit(X).inertia_
    pd.Series(inertia_by_k).plot(marker='x')
    plt.xticks(candidate_ks)
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
# Cluster homes into 20 location groups. KMeans is fit on train coordinates
# only and the fitted model is then used to label all three subsets.
loc_cols = ['latitude', 'longitude']
X = train[loc_cols]
XV = validate[loc_cols]
XT = test[loc_cols]

kmeans = KMeans(n_clusters=20).fit(X)

for subset, coords in ((train, X), (validate, XV), (test, XT)):
    subset['cluster_locs'] = kmeans.predict(coords)

# One row per cluster centre, in the columns of X.
centroids = pd.DataFrame(data=kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
# Peek at the first few location-cluster labels.
train['cluster_locs'].head()

In [None]:
# Map of train properties (longitude vs latitude) coloured by location cluster.
sns.scatterplot(
    x='longitude',
    y='latitude',
    hue='cluster_locs',
    data=train,
)

In [None]:
# Bucket taxvaluedollarcnt into five ordered value bands (1 = lowest) for each
# subset. Every subset is binned from its OWN taxvaluedollarcnt column:
# binning train's column and assigning the result to validate/test aligns on
# the index and leaves rows whose index is not in train as NaN.
bins = [500, 160726, 299498, 445086, 667000, 3000000]
labels = [1, 2, 3, 4, 5]
for subset in (train, validate, test):
    subset['value_bin'] = pd.cut(subset['taxvaluedollarcnt'], bins=bins, labels=labels)

In [None]:
# How many train homes fall into each value band.
train['value_bin'].value_counts()

In [None]:
# Bucket lot_size into six ordered bands (1 = smallest) for each subset,
# using the same fixed edges everywhere.
bins = [0, 5272, 6299, 7368, 9518, 56700, 100000000]
labels = [1, 2, 3, 4, 5, 6]
for subset in (train, validate, test):
    subset['lot_size_bin'] = pd.cut(subset['lot_size'], bins=bins, labels=labels)

In [None]:
# The 100th percentile, i.e. the largest land tax value in train.
train['landtaxvaluedollarcnt'].quantile(q=1)

In [None]:
# Bucket landtaxvaluedollarcnt into five ordered bands (1 = lowest) for each
# subset, using the same fixed edges everywhere.
bins = [0, 54240, 159758, 279118, 448013, 3817215]
labels = [1, 2, 3, 4, 5]
for subset in (train, validate, test):
    subset['lot_value_bin'] = pd.cut(subset['landtaxvaluedollarcnt'], bins=bins, labels=labels)

In [None]:
# Recompute square_feet_bin (five ordered bands, 1 = smallest) on every
# subset; this overwrites any existing column of that name.
bins = [0, 1214, 1497, 1850, 2430, 5600]
labels = [1, 2, 3, 4, 5]
for subset in (train, validate, test):
    subset['square_feet_bin'] = pd.cut(subset['square_feet'], bins=bins, labels=labels)

In [None]:
# Recompute lot_size_bin (six ordered bands, 1 = smallest) on every subset;
# this overwrites any existing column of that name.
bins = [0, 5272, 6299, 7368, 9518, 56700, 100000000]
labels = [1, 2, 3, 4, 5, 6]
for subset in (train, validate, test):
    subset['lot_size_bin'] = pd.cut(subset['lot_size'], bins=bins, labels=labels)

In [None]:
# Cast the two size-bin columns of train to plain ints.
# NOTE(review): presumably fails if any row fell outside the bin edges
# (NaN label) - confirm no such rows exist.
for bin_col in ('lot_size_bin', 'square_feet_bin'):
    train[bin_col] = train[bin_col].astype(int)

In [None]:
# add_size = lot_size + square_feet (raw values), row by row.
# NOTE(review): a later cell filters on add_size == 2, which looks like it
# expects a sum of the *_bin labels rather than raw sizes - confirm intent.
lot_area = train['lot_size']
living_area = train['square_feet']
train['add_size'] = lot_area + living_area

In [None]:
# Mean log_error over the train rows whose add_size equals 2.
# NOTE(review): add_size is a sum of raw lot_size and square_feet, so few or
# no rows may match - confirm intent.
train.log_error[train.add_size == 2].mean()

In [None]:
# Check column dtypes and non-null counts before casting the bin columns.
train.info()

In [None]:
# Cast train's year_bin to plain int.
train['year_bin'] = train['year_bin'].astype(int)

In [None]:
# Cast train's value_bin to plain int.
train['value_bin'] = train['value_bin'].astype(int)

In [None]:
# Cast train's lot_value_bin to plain int.
train['lot_value_bin'] = train['lot_value_bin'].astype(int)

In [None]:
# Integer version of the land-use code, stored in a new 'code' column.
train['code'] = train['propertycountylandusecode'].astype(int)

In [None]:
# Re-check dtypes after the casts above.
train.info()

In [None]:
# Alternative feature set, kept for reference:
#   ['taxvaluedollarcnt', 'landtaxvaluedollarcnt', 'square_feet', 'lot_size',
#    'year_built', 'room_cluster', 'square_feet_bin', 'year_bin',
#    'cluster_locs', 'extras', 'code', 'lot_size_bin', 'add_size',
#    'value_bin', 'lot_value_bin']
model_features = [
    'garages', 'has_pool', 'code', 'year_built', 'taxvaluedollarcnt',
    'landtaxvaluedollarcnt', 'extras', 'cluster_locs', 'add_size',
    'extras_cluster',
]
X_train = train[model_features]
# Target: the binned log error.
y_train = train['log_error_bin']

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination: repeatedly fit a decision tree and discard
# features until only three remain.
lm = DecisionTreeClassifier()
rfe = RFE(estimator=lm, n_features_to_select=3)
rfe.fit(X_train, y_train)

# Boolean mask over X_train's columns: True for the features RFE kept.
feature_mask = rfe.support_

# Names of the kept features.
rfe_feature = X_train.columns[feature_mask].tolist()

In [None]:
# Show the feature names selected by RFE.
rfe_feature

In [None]:
# Depth-4 decision tree on the full feature set; report in-sample accuracy.
clf = DecisionTreeClassifier(max_depth=4, random_state=5).fit(X_train, y_train)
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(train_accuracy))

In [None]:
# Mask train by the sign of log_error; non-matching rows become all-NaN.
is_negative = train.log_error < 0
is_positive = train.log_error > 0
train_neg = train.where(is_negative)
train_pos = train.where(is_positive)

In [None]:
# Drop every row that contains at least one NaN.
train_neg = train_neg.dropna(axis=0, how='any')
train_pos = train_pos.dropna(axis=0, how='any')

In [None]:
# Horizontal bar chart of each numeric feature's correlation with log_error,
# sorted from most negative to most positive.
plt.figure(figsize=(10, 4))
plt.title("Feature Correlation (continuous variables) with Log Error")
# Restrict to numeric/bool columns explicitly: string and categorical columns
# cannot be correlated, and newer pandas no longer drops them silently.
numeric_features = (
    train.select_dtypes(include=['number', 'bool'])
    .drop(columns=['log_error'])
)
# Series.plot draws on the current figure, so the size and title set above apply.
corr_chart = numeric_features.corrwith(train['log_error']).sort_values().plot.barh()
corr_chart