# California Housing Data Frame

## Import Libraries

In [0]:
! pip install memory_profiler

Collecting memory_profiler
[?25l  Downloading https://files.pythonhosted.org/packages/9f/fe/1fca7273dd111108f204a686b12a12b6422d405fe4614087aa7d5a66ea87/memory_profiler-0.55.0.tar.gz (40kB)
[K    100% |████████████████████████████████| 40kB 3.2MB/s 
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/f0/ff/63/fdbff3f1e1b76ad4eae491dd5b190902906b093e93eb86dd5a
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.55.0


In [0]:
%load_ext memory_profiler

In [0]:
! apt-get install default-jre
! java -version
! pip install h2o

Reading package lists... Done
Building dependency tree       
Reading state information... Done
default-jre is already the newest version (2:1.10-63ubuntu1~02).
default-jre set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.
openjdk version "10.0.2" 2018-07-17
OpenJDK Runtime Environment (build 10.0.2+13-Ubuntu-1ubuntu0.18.04.4)
OpenJDK 64-Bit Server VM (build 10.0.2+13-Ubuntu-1ubuntu0.18.04.4, mixed mode)
Collecting h2o
[?25l  Downloading https://files.pythonhosted.org/packages/2a/00/87db2f4c0a8797c0eddb122b7d99b6a007e59659b2eebf027ef28b75e2ce/h2o-3.22.1.3.tar.gz (120.9MB)
[K    100% |████████████████████████████████| 120.9MB 188kB/s 
Collecting colorama>=0.3.8 (from h2o)
  Downloading https://files.pythonhosted.org/packages/4f/a6/728666f39bfff1719fc94c481890b2106837da9318031f71a8424b662e12/colorama-0.4.1-py2.py3-none-any.whl
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25ldone
[?25h  Stored in director

In [0]:
from IPython import display
import h2o
from sklearn.preprocessing import MinMaxScaler
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# Pretty Display of Variables
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [0]:
def process_california_data(ratios=[0.8, 0.1]):
    """ Downloads the california housing dataset, preprocess the data and splits it.
    Args:
        ratio: Split ratio. Default is 0.8
    
    """

    print('Downloading data from web..')
    # Load training & test data set.
    train_df = h2o.import_file("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",")
    test_df = h2o.import_file("https://download.mlcc.google.com/mledu-datasets/california_housing_test.csv", sep=",")

    print('Merging data..')
    # Merge data
    df = train_df.rbind(test_df)
    
    print('Creating a synthetic feature..')
    # Create a synthetic feature.
    df["rooms_per_person"] = df["total_rooms"] / df["population"]
    
    # Scale the huge value columns
    df["median_house_value"] = df["median_house_value"] / 1000 

    # TODO heatmap and scaling
    # print('Heatmap of our data..')
    # sample = df.sample(n=500)
    # sns.heatmap(
    #     sample.corr(), 
    #     xticklabels=sample.columns.values,
    #     yticklabels=sample.columns.values,
    #     fmt=".2f",
    #     annot=True
    # )
    # print('Scaling the data between 0-1..')
    # Scale the data between 0-1
    # scaler = MinMaxScaler(feature_range=(0,1))
    # scale_columns = list(df.columns[2:])
    # scaled = df.copy()
    # scaled[scale_columns] = scaler.fit_transform(df[scale_columns])
    # print("Data was multiplied by {:.6f} and added {:.4f}".format(scaler.scale_[0], scaler.min_[0]))


    print('Splitting train & test frames..')
    # Split
    train, test,val = housing_df.split_frame(ratios=ratios, seed=1234)

    return train, test, val

In [3]:
# Init h2o
h2o.init(max_mem_size="4G")

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,1 hour 11 mins
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.3
H2O cluster version age:,11 days
H2O cluster name:,H2O_from_python_unknownUser_5650ob
H2O cluster total nodes:,1
H2O cluster free memory:,3.999 Gb
H2O cluster total cores:,2
H2O cluster allowed cores:,2


In [0]:
train, test, val = process_california_data()

In [7]:
train.head(10)

longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_person
-114.31,34.19,15,5612,1283,1015,472,1.4936,66.9,5.52906
-114.47,34.4,19,7650,1901,1129,463,1.82,80.1,6.77591
-114.56,33.69,17,720,174,333,117,1.6509,85.7,2.16216
-114.57,33.57,20,1454,326,624,262,1.925,65.5,2.33013
-114.58,33.63,29,1387,236,671,239,3.3438,74.0,2.06706
-114.58,33.61,25,2907,680,1841,633,2.6768,82.4,1.57903
-114.59,34.83,41,812,168,375,158,1.7083,48.5,2.16533
-114.59,33.61,34,4789,1175,3134,1056,2.1782,58.4,1.52808
-114.6,33.62,16,3741,801,2434,824,2.6797,86.5,1.53698
-114.6,33.6,21,1988,483,1182,437,1.625,62.0,1.6819




In [8]:
train.describe()

Rows:13675
Cols:10




Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_person
type,real,real,int,int,int,int,int,real,real,real
mins,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14.999,0.06160540478644223
mean,-119.56514076782449,35.62287531992688,28.58925045703837,2640.7539305301634,538.915831809872,1429.366435100548,500.90552102376597,3.885577447897624,207.52250522851926,1.9727519004634642
maxs,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500.001,55.22222222222222
sigma,2.005470484006647,2.134008320634275,12.540342446116682,2184.3680094648566,422.2657120666202,1156.9706725351596,384.4741601721413,1.910248644531606,116.19922215068665,1.0811994347126375
zeros,0,0,0,0,0,0,0,0,0,0
missing,0,0,0,0,0,0,0,0,0,0
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66.9,5.529064039408867
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80.1,6.775907883082374
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85.7,2.1621621621621623


##  Linear Regression with One Variable

In [0]:
feature_columns = ["median_income"]
label = "median_house_value"

In [0]:
# Model parameters
model = H2OGeneralizedLinearEstimator(model_id="glm_v1", family="gaussian", solver="AUTO")

In [0]:
model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [0]:
model

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  glm_v1


ModelMetricsRegressionGLM: glm
** Reported on train data. **

MSE: 7018.543292972757
RMSE: 83.77674673185129
MAE: 62.99113694302462
RMSLE: 0.42698214193133016
R^2: 0.4801568863004617
Mean Residual Deviance: 7018.543292972757
Null degrees of freedom: 13674
Residual degrees of freedom: 13673
Null deviance: 184629895.06267542
Residual deviance: 95978579.53140245
AIC: 159924.02136184403

ModelMetricsRegressionGLM: glm
** Reported on validation data. **

MSE: 7163.14474386002
RMSE: 84.63536343550501
MAE: 64.18785312737468
RMSLE: 0.44132681824748626
R^2: 0.4665983600889778
Mean Residual Deviance: 7163.14474386002
Null degrees of freedom: 1687
Residual degrees of freedom: 1686
Null deviance: 22668706.748815075
Residual deviance: 12091388.327635713
AIC: 19780.213470679246
Scoring History: 


0,1,2,3,4,5
,timestamp,duration,iterations,negative_log_likelihood,objective
,2019-02-05 08:27:03,0.000 sec,0,184629892.6894789,13501.2718603




In [0]:
%memit model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%
peak memory: 152.80 MiB, increment: 0.01 MiB


In [0]:
%time model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%
CPU times: user 17.7 ms, sys: 4.34 ms, total: 22 ms
Wall time: 258 ms


In [0]:
%prun model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%
 

**Results**


**%memit**  
peak memory: 152.80 MiB, increment: 0.01 MiB

  
**%time**  
CPU times: user 17.7 ms, sys: 4.34 ms, total: 22 ms
Wall time: 258 ms

**%prun**  
18131 function calls (17752 primitive calls) in 0.266 seconds

## Linear Regression with Multiple Variables

In [0]:
feature_columns = ["median_income", "rooms_per_person", "total_rooms", "housing_median_age"]
label = "median_house_value"

In [0]:
# Model parameters
model = H2OGeneralizedLinearEstimator(model_id="glm_v2", family="gaussian", solver="AUTO")

In [0]:
model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [0]:
model

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  glm_v2


ModelMetricsRegressionGLM: glm
** Reported on train data. **

MSE: 6447.897148355625
RMSE: 80.29879917131778
MAE: 60.33878073578241
RMSLE: 0.4178979782230026
R^2: 0.5224229885749068
Mean Residual Deviance: 6447.897148355625
Null degrees of freedom: 13674
Residual degrees of freedom: 13670
Null deviance: 184629895.06267542
Residual deviance: 88174993.50376317
AIC: 158770.359013589

ModelMetricsRegressionGLM: glm
** Reported on validation data. **

MSE: 6551.366488861443
RMSE: 80.94051203730702
MAE: 61.52839840428638
RMSLE: 0.4320290912054075
R^2: 0.5121542627192934
Mean Residual Deviance: 6551.366488861443
Null degrees of freedom: 1687
Residual degrees of freedom: 1683
Null deviance: 22668706.748815075
Residual deviance: 11058706.633198116
AIC: 19635.516523836184
Scoring History: 


0,1,2,3,4,5
,timestamp,duration,iterations,negative_log_likelihood,objective
,2019-02-05 08:59:01,0.000 sec,0,184629892.6894789,13501.2718603




In [0]:
%memit model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%
peak memory: 153.76 MiB, increment: 0.00 MiB


In [0]:
%time model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%
CPU times: user 19.9 ms, sys: 3.58 ms, total: 23.5 ms
Wall time: 263 ms


In [0]:
%prun model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%
 

**Results**




**%memit**  
peak memory: 153.76 MiB, increment: 0.00 MiB

  
**%time**  
CPU times: user 19.9 ms, sys: 3.58 ms, total: 23.5 ms
Wall time: 263 ms

**%prun**  
18098 function calls (17698 primitive calls) in 0.281 seconds



## Deep Learning

In [0]:
feature_columns = ["median_income", "rooms_per_person", "total_rooms", "housing_median_age"]
label = "median_house_value"

In [0]:
# Model parameters
model = H2ODeepLearningEstimator(model_id="nn_v1", distribution="AUTO", epochs=5, hidden=[50,100,50])

In [11]:
model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [12]:
model

Model Details
H2ODeepLearningEstimator :  Deep Learning
Model Key:  nn_v1


ModelMetricsRegression: deeplearning
** Reported on train data. **

MSE: 5940.423273628144
RMSE: 77.0741414070124
MAE: 54.49455090761387
RMSLE: 0.38416147656709776
Mean Residual Deviance: 5940.423273628144

ModelMetricsRegression: deeplearning
** Reported on validation data. **

MSE: 6192.620504177109
RMSE: 78.6932049428482
MAE: 56.685540028625425
RMSLE: 0.40430707785302045
Mean Residual Deviance: 6192.620504177109
Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_deviance,training_mae,training_r2,validation_rmse,validation_deviance,validation_mae,validation_r2
,2019-02-06 13:05:17,0.000 sec,,0.0,0,0.0,,,,,,,,
,2019-02-06 13:05:18,2.089 sec,4819 obs/sec,0.4983547,1,6815.0,93.2830368,8701.7249572,66.2950229,0.3523488,94.6029820,8949.7242076,67.6131872,0.3335612
,2019-02-06 13:05:23,7.055 sec,10106 obs/sec,4.4801463,9,61266.0,78.6967766,6193.1826401,54.5404137,0.5390543,80.2845035,6445.6014969,57.0535658,0.5200300
,2019-02-06 13:05:25,8.204 sec,10778 obs/sec,5.4772943,11,74902.0,77.0741414,5940.4232736,54.4945509,0.5578667,78.6932049,6192.6205042,56.6855400,0.5388682


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
median_income,1.0,1.0,0.3384266
rooms_per_person,0.7546721,0.7546721,0.2554012
total_rooms,0.6356176,0.6356176,0.2151099
housing_median_age,0.5645604,0.5645604,0.1910623




In [13]:
%memit model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

UsageError: Line magic function `%memit` not found.


In [14]:
%time model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

deeplearning Model Build progress: |██████████████████████████████████████| 100%
CPU times: user 219 ms, sys: 23 ms, total: 242 ms
Wall time: 7.95 s


In [15]:
%prun model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

deeplearning Model Build progress: |██████████████████████████████████████| 100%
 

**Results**

**%memit**  
peak memory: 153.82 MiB, increment: 0.00 MiB

**%time**  
CPU times: user 219 ms, sys: 23 ms, total: 242 ms
Wall time: 7.95 s  

**%prun**  
122046 function calls (119252 primitive calls) in 8.299 seconds

