In [30]:
import numpy as np
import pandas as pd
import wrangle
import matplotlib.pyplot as plt
from scipy import stats
import sklearn.preprocessing
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


## Bring in zillow data

In [31]:
df = wrangle.wrangle_zillow()

In [33]:
df.head(5)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
4,4,2.0,3633.0,296425.0,2005,6941.39,6037
6,3,4.0,1620.0,847770.0,2011,10244.94,6037
7,3,2.0,2077.0,646760.0,1926,7924.68,6037
11,0,0.0,1200.0,5328.0,1972,91.6,6037
14,0,0.0,171.0,6920.0,1973,255.17,6037


In [35]:
df.taxvaluedollarcnt.mean()

398621.43733669183

In [37]:
df['baseline_mean'] = df.taxvaluedollarcnt.mean()

In [38]:
df.head(1)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips,baseline_mean
4,4,2.0,3633.0,296425.0,2005,6941.39,6037,398621.437337


## Split Dataset

In [49]:
zillow_train, zillow_test = train_test_split(df, test_size=.2, random_state=123)
zillow_train, zillow_validate = train_test_split(zillow_train, test_size=.3, random_state=123)

## Scaling the data set (MinMax)

In [50]:
scaler = sklearn.preprocessing.MinMaxScaler()
# Note that we only call .fit with the training data,
# but we use .transform to apply the scaling to all the data splits.
scaler.fit(zillow_train)

zillow_train_scaled = scaler.transform(zillow_train)
zillow_validate_scaled = scaler.transform(zillow_validate)
zillow_test_scaled = scaler.transform(zillow_test)

In [51]:
# modeling methods

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

In [52]:
pd.DataFrame(zillow_train_scaled)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.500000,0.333333,0.002916,0.178516,0.744186,0.047274,0.000000,0.0
1,0.500000,0.583333,0.003387,0.143884,0.906977,0.029884,0.297297,0.0
2,0.500000,0.333333,0.002901,0.294375,0.567442,0.077232,0.000000,0.0
3,0.500000,0.166667,0.002122,0.145489,0.702326,0.038724,0.000000,0.0
4,0.000000,0.166667,0.000744,0.017260,0.655814,0.006076,0.000000,0.0
...,...,...,...,...,...,...,...,...
1172961,0.500000,0.416667,0.004630,0.231666,0.767442,0.055477,0.297297,0.0
1172962,0.333333,0.250000,0.002241,0.258892,0.865116,0.057308,0.297297,0.0
1172963,0.500000,0.333333,0.003302,0.239030,0.665116,0.061935,0.000000,0.0
1172964,0.500000,0.166667,0.002235,0.036479,0.679070,0.005036,0.000000,0.0


In [53]:
zillow_train_scaled = pd.DataFrame(zillow_train_scaled, columns = zillow_train.columns.values)

In [54]:
pd.DataFrame(zillow_train_scaled.set_index([zillow_train.index.values]))

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips,baseline_mean
421888,0.500000,0.333333,0.002916,0.178516,0.744186,0.047274,0.000000,0.0
548087,0.500000,0.583333,0.003387,0.143884,0.906977,0.029884,0.297297,0.0
714717,0.500000,0.333333,0.002901,0.294375,0.567442,0.077232,0.000000,0.0
749704,0.500000,0.166667,0.002122,0.145489,0.702326,0.038724,0.000000,0.0
539577,0.000000,0.166667,0.000744,0.017260,0.655814,0.006076,0.000000,0.0
...,...,...,...,...,...,...,...,...
83961,0.500000,0.416667,0.004630,0.231666,0.767442,0.055477,0.297297,0.0
828457,0.333333,0.250000,0.002241,0.258892,0.865116,0.057308,0.297297,0.0
722901,0.500000,0.333333,0.003302,0.239030,0.665116,0.061935,0.000000,0.0
821001,0.500000,0.166667,0.002235,0.036479,0.679070,0.005036,0.000000,0.0


## Create baseline prediction of home value by using the mean

In [55]:
predictions = pd.DataFrame({
    'actual': zillow_train_scaled.taxvaluedollarcnt
}) 

In [46]:
predictions.head()

Unnamed: 0,actual
0,0.178516
1,0.143884
2,0.294375
3,0.145489
4,0.01726


In [56]:
predictions['baseline_mean'] = zillow_train_scaled.baseline_mean

## Simple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# X must be 2-d
X_train = zillow_train_scaled[['calculatedfinishedsquarefeet']]
# y can be 1-d
y_train = zillow_train_scaled.taxvaluedollarcnt

# 1. make the thing
lm = LinearRegression()
# 2. fit the thing
lm.fit(X_train, y_train)
# 3. use the thing (make predictions)
predictions['simple_lm'] = lm.predict(X_validate)