In [1]:
from google.colab import drive

# Mount Google Drive under /content/gdrive; the dataset read later and the
# saved models/logs all live under this mount point.
# force_remount=True requests a fresh mount even if one already exists.
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [2]:
!pip install h2o

Collecting h2o
[?25l  Downloading https://files.pythonhosted.org/packages/bb/55/494a42e7509d0874aa444ae4cad0bc7439c936b40e563550fbfabb540275/h2o-3.30.1.3.tar.gz (129.4MB)
[K     |████████████████████████████████| 129.4MB 87kB/s 
Collecting colorama>=0.3.8
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.30.1.3-py2.py3-none-any.whl size=129446676 sha256=5bd6394fe2976cfd918d38c125cda206d7c69f4a6bb4aa2c5b95c5ca225c07a0
  Stored in directory: /root/.cache/pip/wheels/3d/e6/07/53ce9be9cb61b33a79cb3ed1fc39f2dae84f6ee6fe1e373e5e
Successfully built h2o
Installing collected packages: colorama, h2o
Successfully installed colorama-0.4.4 h2o-3.30.1.3


In [3]:
import h2o
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from h2o.automl import H2OAutoML
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Single seed reused for NumPy, the DataFrame shuffle and H2O AutoML.
global_seed = 42
# Call the seeding function. Assigning to it (np.random.seed = ...) would
# replace the function with an int and leave NumPy's global RNG unseeded.
np.random.seed(global_seed)

In [5]:
# Connect to an H2O instance, starting a local server if none is running
# (see the startup log printed below this cell).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.9" 2020-10-20; OpenJDK Runtime Environment (build 11.0.9+11-Ubuntu-0ubuntu1.18.04.1); OpenJDK 64-Bit Server VM (build 11.0.9+11-Ubuntu-0ubuntu1.18.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.6/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpouqrtrg6
  JVM stdout: /tmp/tmpouqrtrg6/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpouqrtrg6/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,1 month and 5 days
H2O_cluster_name:,H2O_from_python_unknownUser_dwcby9
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.180 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [6]:
def featureSelect(depth, mode=0, check=False):
    """
    Choose the predictor columns and the target column for one experiment.

    Parameters
    ----------
    depth : int
        Target depth in cm. Must be 2, 10 or 50. Depths 2 and 10 use the
        "L1" soil-property columns; depth 50 uses the "L3" columns.
    mode : int, default 0
        Index into the predefined list of 12 feature combinations (0-11).
    check : bool, default False
        When True, print the target label followed by the selected features.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is a list of column names
        and ``target`` is the string ``f"{depth} cm"``.

    Raises
    ------
    ValueError
        If ``depth`` is not one of 2, 10 or 50.
    IndexError
        If ``mode`` is not a valid index into the combination list.
    """
    if depth in (2, 10):
        alt = "L1"
    elif depth == 50:
        alt = "L3"
    else:
        # Without this branch `alt` would stay unbound and the f-strings
        # below would fail with an unrelated-looking UnboundLocalError.
        raise ValueError(f"depth must be 2, 10 or 50, got {depth!r}")

    ntr = ["NTR"]
    ndvi = ["NDVI"]
    psd = [f"Clay_{alt}", f"Sand_{alt}", f"Silt_{alt}"]
    bd = [f"BD_{alt}_2018"]
    om = [f"OM_{alt}_2018"]
    thetaFC = [f"θFC_{alt}"]
    thetaPWP = [f"θPWP_{alt}"]
    ksat = [f"Ksat_{alt}"]
    porosity = [f"Porosity_{alt}_2018"]

    # Position in this list is the `mode` value callers pass in.
    feature_combo = [
        ntr,  # 0
        ntr + ndvi,  # 1
        ntr + psd,  # 2
        ntr + ndvi + psd,  # 3
        ntr + ndvi + psd + bd,  # 4
        ntr + ndvi + psd + bd + om,  # 5
        ntr + ndvi + psd + bd + thetaFC,  # 6
        ntr + ndvi + psd + bd + thetaFC + thetaPWP,  # 7
        ntr + ndvi + thetaFC + thetaPWP,  # 8
        ntr + ndvi + psd + bd + om + thetaFC + thetaPWP,  # 9
        ntr + ndvi + psd + bd + om + thetaFC + thetaPWP + ksat,  # 10
        ntr + ndvi + psd + om + porosity,  # 11
    ]

    target = f"{depth} cm"
    selected = feature_combo[mode]

    if check:
        print([target] + selected)

    return selected, target

In [7]:
# Load the dataset from the mounted Drive folder into a pandas DataFrame.
df = pd.read_excel("/content/gdrive/My Drive/Linux Shared Folder/Soil moisture/Data_NEW_data_v2.xlsx")

In [8]:
# Shuffle all rows with a fixed seed and renumber the index, so the positional
# train/valid/test split that follows is a reproducible random split.
# NOTE(review): this rebinds `df`, so re-running the cell without reloading
# the data shuffles the already-shuffled frame and yields a different order.
df = df.sample(frac=1, random_state=global_seed).reset_index(drop=True)

In [9]:
def data_split(data, train_split, valid_split):
    """
    Cut a DataFrame into consecutive train / validation / test slices and
    convert each slice to an H2OFrame.

    Rows are taken in their current order: the first ``train_split`` fraction
    is the training set, the next ``valid_split`` fraction is the validation
    set, and everything left over is the test set.

    Returns the frames in the order ``(train, test, valid)``.
    """
    total_rows = data.shape[0]
    train_end = int(total_rows * train_split)
    valid_end = int(total_rows * (train_split + valid_split))

    train_part = data.iloc[:train_end]
    valid_part = data.iloc[train_end:valid_end]
    test_part = data.iloc[valid_end:total_rows]

    # Upload order (train, test, valid) matches the return order.
    train_frame = h2o.H2OFrame(train_part)
    test_frame = h2o.H2OFrame(test_part)
    valid_frame = h2o.H2OFrame(valid_part)

    return train_frame, test_frame, valid_frame

In [10]:
# 50% of the rows for training, 20% for validation, the remainder for testing.
# data_split returns the frames in the order (train, test, valid).
train, test, valid = data_split(data=df, train_split=0.5, valid_split=0.2)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [11]:
# Leader model of every AutoML run, in the order the runs are executed.
models = []

# One row per run: target depth, feature-combination index, test metrics,
# and the prediction / ground-truth frames themselves.
log = pd.DataFrame(columns=["depth", "mode", "RMSE", "R2", "y_pred", "y_test"])

In [12]:
# Running pass counter incremented by the training loop; re-run this cell
# before re-running the loop to restart numbering at 1.
count = 0

In [13]:
# Train one AutoML run per (target depth, feature combination), score the
# leader on the held-out test frame, save it, and record the metrics.
for d in [50]:  # target depths in cm; only 50 cm is modelled in this run
    for m in range(12):  # indices 0-11 of featureSelect's feature combinations
        count += 1
        print(f"Pass {count}")

        # x: predictor column names, y: target column name (e.g. "50 cm").
        x, y = featureSelect(d, m)
        # The 2 cm and 10 cm columns are added as predictors to every combination.
        x = ["2 cm", "10 cm"] + x

        print(x)

        # AutoML restricted to DeepLearning, at most 10 models, ranked by RMSE.
        # NOTE(review): the captured log reports that cross-validation stays
        # enabled, so validation_frame only yields informative metrics.
        aml = H2OAutoML(max_models=10, seed=global_seed, include_algos = ["DeepLearning"], sort_metric="RMSE", verbosity="info")
        aml.train(x=x, y=y, training_frame=train, validation_frame=valid)

        # Score the best model on the held-out test frame.
        preds = aml.leader.predict(test)

        # Bring predictions and ground truth back as pandas DataFrames for sklearn.
        y_pred = preds.as_data_frame()
        y_test = test[y].as_data_frame()

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        r2 = r2_score(y_test, y_pred)

        # Keep the leader in memory and persist it under a per-(depth, mode) folder.
        models.append(aml.leader)
        h2o.save_model(aml.leader, path=f"/content/gdrive/My Drive/Linux Shared Folder/Soil moisture/models/run_5_with_BOTH/data_{d}_{m}/")

        # NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed
        # in 2.0; this line needs replacing (e.g. with pd.concat) before upgrading.
        # NOTE(review): y_pred / y_test are whole DataFrames stored in single
        # cells — confirm how they are rendered in the Excel export below.
        log = log.append(
            {
                "depth": d,
                "mode": m,
                "RMSE": rmse,
                "R2": r2,
                "y_pred": y_pred,
                "y_test": y_test,
            },
            ignore_index=True,
        )

        # The full log is rewritten after every pass (presumably so partial
        # results survive an interrupted session).
        log.to_excel("/content/gdrive/My Drive/Linux Shared Folder/Soil moisture/models/run_5_with_BOTH/log_h2o_run_5_with_BOTH.xlsx")

Pass 1
['2 cm', '10 cm', 'NTR']
AutoML progress: |
01:09:32.903: Project: AutoML_20201103_10932891
01:09:32.906: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
01:09:32.906: Setting stopping tolerance adaptively based on the training frame: 0.05
01:09:32.906: Build control seed: 42
01:09:32.908: training frame: Frame key: automl_training_Key_Frame__upload_81af2b247db8be27e5b98a5c14084b45.hex    cols: 26    rows: 15  chunks: 1    size: 5041  checksum: -3098367329305361756
01:09:32.908: validation frame: Frame key: Key_Frame__upload_badff9fede1f66157cdbda1c2b0431f.hex    cols: 26    rows: 6  chunks: 1    size: 3502  checksum: 977281445782166532
01:09:32.909: leaderboard frame: NULL
01:09:32.909: blending frame: NULL
01:09:32.909: response column: 50 cm
01:09:32.909: fold column: n

In [14]:
# Visual marker that the training loop above has finished.
print("\n-----------------------------------------------End of process-----------------------------------------------")


-----------------------------------------------End of process-----------------------------------------------
