# Feature Sweeps
Warning: do not run this notebook unless you have at least an hour on your hands.

In [1]:
import os
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2
import modeling

## Unscaled Dataset

In [3]:
df = pd.read_pickle(os.path.join("data", "scrubbed_kc_house_data.pkl.bz2"))
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21420 entries, 7129300520 to 1523300157
Data columns (total 27 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   price          21420 non-null  float64 
 1   bedrooms       21420 non-null  int64   
 2   bathrooms      21420 non-null  float64 
 3   sqft_living    21420 non-null  int64   
 4   sqft_lot       21420 non-null  int64   
 5   floors         21420 non-null  float64 
 6   waterfront     21420 non-null  category
 7   view           21420 non-null  int64   
 8   condition      21420 non-null  int64   
 9   grade          21420 non-null  int64   
 10  sqft_above     21420 non-null  int64   
 11  zipcode        21420 non-null  category
 12  sqft_living15  21420 non-null  int64   
 13  sqft_lot15     21420 non-null  int64   
 14  cat_bedrooms   21420 non-null  category
 15  cat_bathrooms  21420 non-null  category
 16  cat_condition  21420 non-null  category
 17  cat_grade      21

I exclude variables that have poor linear relationships with price. I also exclude "cat_bathrooms", because it often results in coefficients with bad P-values.

In [4]:
df.drop(columns=["lot_area", "lot15_area", "age", "cat_age", "zip_pop", "cat_condition", "condition", "cat_bathrooms", "cat_floors", "sqft_lot15", "sqft_lot"], inplace=True)

In [5]:
modeling.ols_sweep(df, "price", n_vars=1)
modeling.consolidate_results("test_models/ols_sweep/price~1/*")

0:00:01.249614
0:00:00.060972


In [6]:
modeling.ols_sweep(df, "price", n_vars=2)
modeling.consolidate_results("test_models/ols_sweep/price~2/*")

0:00:12.243177
0:00:00.369810


In [7]:
modeling.ols_sweep(df, "price", n_vars=3)
modeling.consolidate_results("test_models/ols_sweep/price~3/*")

0:01:09.302377
0:00:01.408661


In [8]:
modeling.ols_sweep(df, "price", n_vars=4)
modeling.consolidate_results("test_models/ols_sweep/price~4/*")

0:04:10.209376
0:00:04.071029


In [9]:
modeling.ols_sweep(df, "price", n_vars=5)
modeling.consolidate_results("test_models/ols_sweep/price~5/*")

0:10:54.971237
0:00:09.036488


## Scaled Dataset

In [10]:
df = pd.read_pickle(os.path.join("data", "scaled_kc_house_data.pkl.bz2"))
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21420 entries, 7129300520 to 1523300157
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   price          21420 non-null  float64 
 1   bedrooms       21420 non-null  float64 
 2   bathrooms      21420 non-null  float64 
 3   sqft_living    21420 non-null  float64 
 4   sqft_lot       21420 non-null  float64 
 5   floors         21420 non-null  float64 
 6   waterfront     21420 non-null  category
 7   grade          21420 non-null  float64 
 8   sqft_above     21420 non-null  float64 
 9   zipcode        21420 non-null  category
 10  sqft_living15  21420 non-null  float64 
 11  sqft_lot15     21420 non-null  float64 
 12  cat_bedrooms   21420 non-null  category
 13  cat_bathrooms  21420 non-null  category
 14  cat_condition  21420 non-null  category
 15  cat_grade      21420 non-null  category
 16  cat_floors     21420 non-null  category
 17  cat_view       21

In [11]:
df.drop(columns=["lot_area", "lot15_area", "age", "cat_age" "zip_pop", "cat_floors", "cat_condition", "cat_bathrooms", "sqft_lot15", "sqft_lot"], inplace=True)

KeyError: "['cat_agezip_pop'] not found in axis"

In [26]:
scaled_dir = os.path.join("test_models", "ols_sweep_scaled")

In [27]:
modeling.ols_sweep(df, "price", n_vars=1, dst=scaled_dir)
modeling.consolidate_results("test_models/ols_sweep_scaled/price~1/*")

0:00:01.405419
0:00:00.054756


In [28]:
modeling.ols_sweep(df, "price", n_vars=2, dst=scaled_dir)
modeling.consolidate_results("test_models/ols_sweep_scaled/price~2/*")

0:00:15.809499
0:00:00.379313


In [29]:
modeling.ols_sweep(df, "price", n_vars=3, dst=scaled_dir)
modeling.consolidate_results("test_models/ols_sweep_scaled/price~3/*")

0:01:38.136824
0:00:01.701400


In [30]:
modeling.ols_sweep(df, "price", n_vars=4, dst=scaled_dir)
modeling.consolidate_results("test_models/ols_sweep_scaled/price~4/*")

0:06:12.363347
0:00:05.260165


In [31]:
modeling.ols_sweep(df, "price", n_vars=5, dst=scaled_dir)
modeling.consolidate_results("test_models/ols_sweep_scaled/price~5/*")

0:17:07.187111
0:00:13.050460
