# Modeling MVP

## Pre-Processing

In [1]:
import acquire as ac
import prepare as prep
import evaluate as ev

import pandas as pd
import numpy as np

# For modeling
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
zil = ac.zillow_data()

In [3]:
zil.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt
0,4.0,2.0,3633.0,296425.0
1,3.0,4.0,1620.0,847770.0
2,3.0,2.0,2077.0,646760.0
3,0.0,0.0,1200.0,5328.0
4,0.0,0.0,171.0,6920.0


In [4]:
zil = prep.prep_zillow(zil)

In [5]:
# prep.scale hands back three frames, unpacked here as train / val / test.
# NOTE(review): presumably it both splits the data and scales sq_ft — confirm in prepare.py.
train, val, test = prep.scale(zil)

# Row and column counts of each split.
train.shape, val.shape, test.shape

((39013, 4), (8360, 4), (8360, 4))

In [6]:
train.head()

Unnamed: 0,bedrooms,bathrooms,sq_ft,price
13867,2,1.0,0.174067,404049.0
20306,3,2.0,0.167236,538982.0
48331,2,1.0,0.146237,360751.0
13416,2,1.0,0.091335,176663.0
2951,4,3.0,0.355218,3113999.0


In [7]:
pd.get_dummies(train, columns=['bedrooms', 'bathrooms']).head(3)

Unnamed: 0,sq_ft,price,bedrooms_0,bedrooms_1,bedrooms_2,bedrooms_3,bedrooms_4,bedrooms_5,bedrooms_6,bedrooms_7,...,bathrooms_2.0,bathrooms_2.5,bathrooms_3.0,bathrooms_3.5,bathrooms_4.0,bathrooms_4.5,bathrooms_5.0,bathrooms_5.5,bathrooms_6.0,bathrooms_7.0
13867,0.174067,404049.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20306,0.167236,538982.0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
48331,0.146237,360751.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# One-hot encode the room counts. Each split is encoded on its own, so a
# bedroom/bathroom value that is missing from one split would otherwise give
# that split a different set of columns (equal shapes do not guarantee equal
# columns). Align val and test to train's columns: categories unseen in a split
# become all-zero columns, categories unseen in train are dropped.
dummy_cols = ['bedrooms', 'bathrooms']
train = pd.get_dummies(train, columns=dummy_cols)
val = pd.get_dummies(val, columns=dummy_cols).reindex(columns=train.columns, fill_value=0)
test = pd.get_dummies(test, columns=dummy_cols).reindex(columns=train.columns, fill_value=0)

In [9]:
train.shape, val.shape, test.shape

((39013, 23), (8360, 23), (8360, 23))

TO DO:

1. Data is split into train, val, and test.
2. Bedroom and bathroom counts are one-hot encoded with `pd.get_dummies`.
3. Split each set into features (x) and target (y).

In [10]:
train.head()

Unnamed: 0,sq_ft,price,bedrooms_0,bedrooms_1,bedrooms_2,bedrooms_3,bedrooms_4,bedrooms_5,bedrooms_6,bedrooms_7,...,bathrooms_2.0,bathrooms_2.5,bathrooms_3.0,bathrooms_3.5,bathrooms_4.0,bathrooms_4.5,bathrooms_5.0,bathrooms_5.5,bathrooms_6.0,bathrooms_7.0
13867,0.174067,404049.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20306,0.167236,538982.0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
48331,0.146237,360751.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13416,0.091335,176663.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2951,0.355218,3113999.0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [11]:
# Separate the 'price' target from the features for every split, in
# train -> val -> test order.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = [
    prep.split_xy(split, 'price') for split in (train, val, test)
]

## Baseline Model

In [12]:
# Start the results frame from a copy of the training target.
results = pd.DataFrame(y_train).copy()
# NOTE(review): ev.baseline appears to add the base_median / base_mean columns
# seen in the output below — confirm in evaluate.py.
results = ev.baseline(results, 'price')

In [13]:
results.head()

Unnamed: 0,price,base_median,base_mean
13867,404049.0,327477.0,443516.048907
20306,538982.0,327477.0,443516.048907
48331,360751.0,327477.0,443516.048907
13416,176663.0,327477.0,443516.048907
2951,3113999.0,327477.0,443516.048907


In [14]:
SSE, MSE, RMSE = ev.eval_model(results.price, results.base_median)

In [15]:
'''def eval_model(data, actual='', model=''):
     88     df = data.copy
---> 89     df['m_res'] = df[model] - df[actual]
     90     SSE = (df['m_res'] ** 2).sum()
     91     MSE = SSE / len(data)'''

"def eval_model(data, actual='', model=''):\n     88     df = data.copy\n---> 89     df['m_res'] = df[model] - df[actual]\n     90     SSE = (df['m_res'] ** 2).sum()\n     91     MSE = SSE / len(data)"

In [16]:
SSE, MSE, round(RMSE,2) 

(1.043160806276456e+16, 267388000480.98224, 517095.74)

In [17]:
SSE, MSE, RMSE = ev.eval_model(results.price, results.base_mean)

In [18]:
SSE, MSE, round(RMSE,2) 

(9906295642996870.0, 253922939609.7934, 503907.67)

The mean is the better baseline, with an RMSE of about 503,908 on train (versus about 517,096 for the median). That is the number every model below has to beat.

In [19]:
RMSE

503907.6697270973

## Linear Regression

In [20]:
lm = LinearRegression()
ev.train_model(lm, x_train, y_train, x_val, y_val)

The train RMSE is 379751.97851383884.
The validate RMSE is 456022.16743018775.


## Lasso Lars

In [21]:
# alpha=0 switches the L1 penalty off. Per the scikit-learn docs this is
# equivalent to ordinary least squares, and LinearRegression is preferred
# for numerical reasons; kept here as a reference point for alpha=1 below.
ll = LassoLars(alpha=0)
ev.train_model(ll, x_train, y_train, x_val, y_val)

The train RMSE is 379614.5546085301.
The validate RMSE is 456234.77070727036.


In [22]:
ll = LassoLars(alpha=1)
ev.train_model(ll, x_train, y_train, x_val, y_val)

The train RMSE is 379614.5641167404.
The validate RMSE is 456236.44237606437.


## PolynomialFeatures

In [23]:
poly = PolynomialFeatures()
x_train_s = poly.fit_transform(x_train)
x_val_s = poly.transform(x_val)

In [24]:
lm = LinearRegression()
ev.train_model(lm, x_train_s, y_train, x_val_s, y_val)

The train RMSE is 374671.4035410802.
The validate RMSE is 450123.0479677399.


## Tweedie

In [25]:
# Library defaults: power=0 (normal distribution) and alpha=1 (L2 penalty),
# per the scikit-learn docs.
# NOTE(review): the untuned penalty may be why this RMSE is worse than the
# baseline — worth trying other power/alpha values before ruling the model out.
tweedie = TweedieRegressor()

# Fit on the polynomial features built in the PolynomialFeatures section.
ev.train_model(tweedie, x_train_s, y_train, x_val_s, y_val)

The train RMSE is 480385.85163921217.
The validate RMSE is 552265.5081923254.


## Random Forest Regressor

In [26]:
# Seed the forest so the reported RMSEs are reproducible from run to run;
# 100 matches the seed used for the tuned forest in the second round.
rf = RandomForestRegressor(random_state=100)

# Same estimator on the plain features, then on the polynomial features.
ev.train_model(rf, x_train, y_train, x_val, y_val)
print()
ev.train_model(rf, x_train_s, y_train, x_val_s, y_val)

The train RMSE is 233002.17229373613.
The validate RMSE is 485586.6178623238.

The train RMSE is 234788.1988016254.
The validate RMSE is 487244.60506115895.


## XGBRegressor

In [47]:
xgbr = XGBRegressor()

ev.train_model(xgbr, x_train, y_train, x_val, y_val)

The train RMSE is 264355.33652801154.
The validate RMSE is 429345.90345456084.


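Round 1 conclusions: none of these models is much better than the baseline. The best validation RMSE is about 429,000 (XGBRegressor) against a baseline RMSE of about 504,000, and the Tweedie regressor is worse than the baseline. Next, I will treat bedrooms and bathrooms as numeric features and min-max scale them instead of one-hot encoding them.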

# Second Try

In [28]:
zil = ac.zillow_data()
zil = prep.prep_zillow(zil, mvp=True)
train, val, test = prep.scale(zil, scaled_cols=['bedrooms', 'bathrooms', 'sq_ft'])

x_train, y_train = prep.split_xy(train, 'price')
x_val, y_val = prep.split_xy(val, 'price')
x_test, y_test = prep.split_xy(test, 'price')

In [29]:
lm = LinearRegression()
ev.train_model(lm, x_train, y_train, x_val, y_val)

The train RMSE is 389752.8517363422.
The validate RMSE is 465644.660547253.


In [30]:
ll = LassoLars(alpha=0)
ev.train_model(ll, x_train, y_train, x_val, y_val)

The train RMSE is 389752.8517363422.
The validate RMSE is 465644.660547253.


In [31]:
ll = LassoLars(alpha=1)
ev.train_model(ll, x_train, y_train, x_val, y_val)

The train RMSE is 389752.8522221678.
The validate RMSE is 465645.26573188027.


In [32]:
poly = PolynomialFeatures()
x_train_s = poly.fit_transform(x_train)
x_val_s = poly.transform(x_val)

lm = LinearRegression()
ev.train_model(lm, x_train_s, y_train, x_val_s, y_val)

The train RMSE is 376939.4056023681.
The validate RMSE is 450083.09070667933.


In [33]:
tweedie = TweedieRegressor()

ev.train_model(tweedie, x_train_s, y_train, x_val_s, y_val)

The train RMSE is 491151.14386093384.
The validate RMSE is 561782.2861085506.


In [34]:
rf = RandomForestRegressor(max_depth=5, min_samples_split=5, random_state=100)

ev.train_model(rf, x_train, y_train, x_val, y_val)
print()
ev.train_model(rf, x_train_s, y_train, x_val_s, y_val)

The train RMSE is 369160.3857479571.
The validate RMSE is 455338.4519403009.

The train RMSE is 364551.19135544065.
The validate RMSE is 451754.30070202367.


In [35]:
xgbr = XGBRegressor(max_depth=6, n_estimators=10, random_state=100)

ev.train_model(xgbr, x_train, y_train, x_val, y_val)

The train RMSE is 351683.49763802794.
The validate RMSE is 456447.3480018607.


## Testing: rename only (skipping `prep_zillow`), then scale

In [36]:
zil = ac.zillow_data()

In [37]:
# Only rename the raw columns here; prep.prep_zillow is deliberately not called in this experiment.
rename = {'bedroomcnt': 'bedrooms',  # Map raw zillow column names to short names
                  'bathroomcnt': 'bathrooms',
                  'calculatedfinishedsquarefeet': 'sq_ft',
                  'taxvaluedollarcnt': 'price'}
zil = zil.rename(columns=rename)  # Rename columns using the dictionary

In [38]:
train, val, test = prep.scale(zil, scaled_cols=['bedrooms', 'bathrooms', 'sq_ft'])

x_train, y_train = prep.split_xy(train, 'price')
x_val, y_val = prep.split_xy(val, 'price')
x_test, y_test = prep.split_xy(test, 'price')

In [39]:
lm = LinearRegression()
ev.train_model(lm, x_train, y_train, x_val, y_val)

The train RMSE is 448798.8824160345.
The validate RMSE is 409499.92646544025.


In [40]:
ll = LassoLars(alpha=1)
ev.train_model(ll, x_train, y_train, x_val, y_val)

The train RMSE is 448798.886145746.
The validate RMSE is 409499.66675874504.


In [41]:
x_train

Unnamed: 0,bedrooms,bathrooms,sq_ft
50317,0.20,0.150,0.111596
45686,0.12,0.050,0.040660
10067,0.20,0.125,0.104051
44901,0.16,0.150,0.105619
1705,0.12,0.100,0.060844
...,...,...,...
16304,0.12,0.050,0.047470
79,0.12,0.100,0.051242
12119,0.08,0.100,0.080586
14147,0.08,0.050,0.054279


In [53]:
def prep_zillow(df, mvp=True):
    """Prepare the raw zillow frame for the MVP models.

    Works on a renamed copy of ``df`` (the input frame is not modified) and:

    * renames the raw columns to ``bedrooms``, ``bathrooms``, ``sq_ft`` and ``price``;
    * bins ``bedrooms`` into the integers 0-6, where 6 covers everything
      above 5.5 (up to 1000);
    * recodes every ``bathrooms`` value above 3 as 4;
    * recodes every ``sq_ft`` value above 4000 as 4001.

    Large values are capped rather than dropped, so no rows are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns ``bedroomcnt``, ``bathroomcnt``,
        ``calculatedfinishedsquarefeet`` and ``taxvaluedollarcnt``.
    mvp : bool, default True
        Only the MVP preparation is implemented in this function.

    Returns
    -------
    pandas.DataFrame
        The prepared copy.

    Raises
    ------
    NotImplementedError
        If ``mvp`` is falsy. Previously the function fell off the end and
        returned ``None`` in that case, which only surfaced later as a
        confusing error in whatever consumed the result.
    """
    if not mvp:
        raise NotImplementedError(
            "prep_zillow only implements the MVP preparation; call it with mvp=True."
        )

    rename = {'bedroomcnt': 'bedrooms',
              'bathroomcnt': 'bathrooms',
              'calculatedfinishedsquarefeet': 'sq_ft',
              'taxvaluedollarcnt': 'price'}
    # Half-integer edges put each whole bedroom count in its own bin; the last
    # bin lumps together everything above 5.5.
    bedroom_edges = [-0.5, .5, 1.5, 2.5, 3.5, 4.5, 5.5, 1000]
    bedroom_labels = [0, 1, 2, 3, 4, 5, 6]

    # rename returns a new frame, so the caller's df is left untouched.
    zil = df.rename(columns=rename)

    # NOTE(review): a missing or out-of-range bedroom count falls outside every
    # bin and becomes NaN, which presumably makes the int cast raise — confirm
    # the raw data has no such rows.
    zil['bedrooms'] = pd.cut(zil['bedrooms'], bins=bedroom_edges,
                             labels=bedroom_labels).astype(int)
    # Anything above 3 baths (3.5, 4, 4.5, ...) is collapsed into a single "4".
    zil['bathrooms'] = np.where(zil['bathrooms'] > 3, 4, zil['bathrooms'])
    # Homes larger than 4000 sq ft share the sentinel value 4001.
    zil['sq_ft'] = np.where(zil['sq_ft'] > 4000, 4001, zil['sq_ft'])
    return zil

In [66]:
# Reload the raw data and prepare it with the notebook-local prep_zillow
# defined in the previous cell (not prep.prep_zillow).
zil = ac.zillow_data()
zil = prep_zillow(zil)

train, val, test = prep.scale(zil, scaled_cols=['bedrooms', 'bathrooms', 'sq_ft'])
# Disabled experiment: keep only training rows priced below 15,000,000.
# train = train[train.price < 15000_000]

# Separate the 'price' target from the features for each split.
x_train, y_train = prep.split_xy(train, 'price')
x_val, y_val = prep.split_xy(val, 'price')
x_test, y_test = prep.split_xy(test, 'price')

In [69]:
xgbr = XGBRegressor()

ev.train_model(xgbr, x_train, y_train, x_val, y_val)

The train RMSE is 437894.69086208945.
The validate RMSE is 420180.79140511603.


# MVP Conclusions

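Most models perform about the same, with validation RMSEs of roughly 410,000–470,000, and they beat the mean baseline (RMSE of about 504,000, computed on train). The exception is the Tweedie regressor, whose validation RMSE is above 550,000.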