In [None]:
!pip install mlflow --quiet
!pip install pyngrok --quiet
!pip install boto3
!pip install -q shap
!pip install pyyaml

[K     |████████████████████████████████| 17.9 MB 17.3 MB/s 
[K     |████████████████████████████████| 209 kB 63.4 MB/s 
[K     |████████████████████████████████| 181 kB 40.3 MB/s 
[K     |████████████████████████████████| 81 kB 7.0 MB/s 
[K     |████████████████████████████████| 79 kB 5.5 MB/s 
[K     |████████████████████████████████| 146 kB 20.9 MB/s 
[K     |████████████████████████████████| 596 kB 42.6 MB/s 
[K     |████████████████████████████████| 54 kB 1.4 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[K     |████████████████████████████████| 78 kB 7.5 MB/s 
[?25h  Building wheel for databricks-cli (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 745 kB 12.4 MB/s 
[?25h  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boto3
  Downloading boto3-1.24.48-py3-none-any.whl (132 kB)
[K     |██████████████████

In [None]:
import logging
import os
import sys
import warnings
from urllib.parse import urlparse

import boto3
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import tensorflow as tf
import xgboost
import yaml
from keras.layers import Dropout
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, GRU, RNN
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from tensorflow.keras.models import Sequential
from xgboost import XGBRegressor



In [None]:
# Notebook-wide logger; the root logger is configured to emit WARNING and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)

In [None]:
# Import Colab's Google Drive helper.
from google.colab import drive

# Mount Google Drive under /content/drive/ so that the config file read in a
# later cell ('/content/drive/My Drive/conf/mlconfig.yml') is reachable.
# This will prompt for authorization.
drive.mount('/content/drive/')

In [None]:
# Make the Drive root the working directory.
# Original author's note: the YAML file lives on this upper level to protect
# the rest of the data (presumably to keep the config apart from the datasets
# -- TODO confirm).
os.chdir('/content/drive/My Drive/')

In [None]:
# Load the run configuration into the module-level dict `data`; mlflow_start()
# and main() read their tracking URI, credentials, dataset location and
# experiment name from it.
# The file is opened explicitly as UTF-8 so parsing does not depend on the
# kernel's locale default.
# NOTE(review): FullLoader resolves more than plain YAML; if the file only
# holds scalars and mappings, yaml.safe_load(f) would be the stricter choice.
with open('/content/drive/My Drive/conf/mlconfig.yml', encoding='utf-8') as f:
    data = yaml.load(f, Loader=yaml.FullLoader)

In [None]:
def preprocessing(data, test_size=0.2, random_state=None):
  """Clean a raw dataset frame and split it into train and test sets.

  The caller's frame is not modified: all column drops and dtype conversions
  happen on a copy, so the function can be called repeatedly on the same
  frame with the same result.

  Args:
    data: pandas DataFrame holding the columns 'End-Date', 'BEDAT',
      'Price_Change_rel', the feature columns listed in ``float_columns``
      below, and a trailing column that is always discarded.
    test_size: forwarded to ``train_test_split``; the default 0.2 matches the
      previously hard-coded value.
    random_state: forwarded to ``train_test_split``; the default ``None``
      keeps the previous behaviour (no explicit seed passed to the splitter).

  Returns:
    The ``train_test_split`` result ``X_train, X_test, y_train, y_test``.
    X is built from ``.values`` (NumPy array) of every remaining column
    except 'Price_Change_rel'; y is the 'Price_Change_rel' Series.

  Raises:
    KeyError: if one of the expected columns is missing.
    IndexError: if ``data`` has no columns.
  """
  float_columns = [
      "Price_Change_rel",
      "Schädling-TotalFrequency",
      "Schädling-MeanSentiment",
      "GeneralWoodDamage-TotalFrequency",
      "GeneralWoodDamage-MeanSentiment",
      "Wetter-TotalFrequency",
      "Wetter-MeanSentiment",
      "Folgeindustrie-TotalFrequency",
      "Folgeindustrie-MeanSentiment",
      "rolling-Schädling-MeanSentiment",
      "rolling-GeneralWoodDamage-MeanSentiment",
      "rolling-Wetter-MeanSentiment",
      "rolling-Folgeindustrie-MeanSentiment",
  ]

  # The last column tends to be all NaN in the source CSVs, so it is always
  # removed (unconditionally, exactly as before). drop() returns a new frame,
  # which keeps the caller's DataFrame intact.
  frame = data.drop(columns=data.columns[-1])
  # The two date columns are not used as model features.
  frame = frame.drop(columns=['End-Date', 'BEDAT'])

  # Coerce target and feature columns to the smallest float dtype that fits.
  for column in float_columns:
    frame[column] = pd.to_numeric(frame[column], downcast="float")

  X = frame.drop(columns=['Price_Change_rel']).values
  y = frame['Price_Change_rel']
  return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [None]:
def eval_metrics(actual, pred):
  """Score predictions against ground truth.

  Args:
    actual: true target values.
    pred: predicted values, passed to the sklearn metric functions together
      with ``actual``.

  Returns:
    Tuple ``(rmse, mae, r2)``: root of the mean squared error, mean absolute
    error and R2 score.
  """
  mse = mean_squared_error(actual, pred)
  return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

In [None]:
def get_model(params, input_shape):
  """Build and compile a two-layer GRU regression network.

  Architecture: GRU (returns sequences) -> Dropout -> GRU (last state only)
  -> Dropout -> Dense(1).

  Args:
    params: dict providing "gru_units", "dropout", "loss" and "optimizer".
    input_shape: length of the first input axis; the first GRU layer is
      declared with ``input_shape=(input_shape, 1)``.

  Returns:
    The compiled ``Sequential`` model, tracking RootMeanSquaredError and
    MeanAbsoluteError as metrics.
  """
  gru_units = params["gru_units"]
  dropout_rate = params["dropout"]

  model = Sequential()
  stack = (
      GRU(units=gru_units, return_sequences=True, input_shape=(input_shape, 1)),
      Dropout(rate=dropout_rate),
      GRU(units=gru_units, return_sequences=False),
      Dropout(rate=dropout_rate),
      Dense(1),
  )
  for layer in stack:
    model.add(layer)

  tracked_metrics = [RootMeanSquaredError(), MeanAbsoluteError()]
  model.compile(loss=params["loss"],
                optimizer=params["optimizer"],
                metrics=tracked_metrics)
  return model

In [None]:
def mlflow_start():
  """Point MLflow at the remote tracking server and export its credentials.

  Reads everything from the module-level config dict ``data``: sets the MLflow
  tracking URI, then writes the tracking username/password, the S3 endpoint
  URL and the AWS key pair into ``os.environ``.

  Raises:
    KeyError: if ``data`` lacks one of the expected keys.
  """
  tracking_uri, tracking_user, tracking_password = (
      data['remote_server_uri'], data['username'], data['password'])

  mlflow.set_tracking_uri(tracking_uri)
  os.environ['MLFLOW_TRACKING_USERNAME'] = tracking_user
  os.environ['MLFLOW_TRACKING_PASSWORD'] = tracking_password

  # Artifact-store settings: (environment variable, key in the config dict).
  artifact_env = (
      ('MLFLOW_S3_ENDPOINT_URL', 's3_enpoint_url'),
      ('AWS_ACCESS_KEY_ID', 'aws_access_key_id'),
      ('AWS_SECRET_ACCESS_KEY', 'aws_secret_access_key'),
  )
  for env_name, config_key in artifact_env:
    os.environ[env_name] = data[config_key]

mlflow_start()

In [None]:
# Load the Drive helper and mount
#from google.colab import drive

# This will prompt for authorization.
#drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
def main():
  """Train the GRU model on one dataset and record the run in MLflow.

  Steps:
    1. Read ``dataset_drive`` and ``experiment_name`` from the module-level
       config dict ``data`` and load ``<dataset_drive><ds>.csv``.
    2. Split the frame with ``preprocessing`` and fit the model returned by
       ``get_model`` (the test split doubles as validation data).
    3. Print RMSE / MAE / R2 on the test split and log parameters and
       metrics to the MLflow experiment.

  Errors are logged through ``logger`` and are not propagated to the caller.
  """
  warnings.filterwarnings("ignore")
  np.random.seed(40)

  #ds = 'softwoodshort'
  #ds = 'hardwoodshort'
  #ds = 'nadelholz'
  ds = 'rohpapier'

  # NOTE(review): these three values are not used by the GRU model; they are
  # only printed and logged below (presumably carried over from a tree-model
  # variant of this notebook). Kept so existing MLflow runs stay comparable.
  n_estimators = 100 #int(sys.argv[1]) if len(sys.argv) > 1 else 1000
  max_features = 9 #int(sys.argv[2]) if len(sys.argv) > 2 else 9
  max_depth = 5 #int(sys.argv[3]) if len(sys.argv) > 3 else 5

  # --- Stage 1: load the dataset -------------------------------------------
  # The DataFrame gets its own name: assigning to `data` inside this function
  # would make `data` local and turn the config lookups below into an
  # UnboundLocalError.
  try:
    dataset_drive = data['dataset_drive']
    experiment_name = data['experiment_name']
    dataset = dataset_drive + ds + ".csv"
    # Only the read needs the file handle; it is closed before training starts.
    with open(dataset) as f:
      dataset_frame = pd.read_csv(f, delimiter=',')
    print(dataset_frame.head())
  except Exception as e:
    logger.exception(
        "Unable to download training & test CSV, check your internet connection. Error: %s", e
    )
    return

  # --- Stage 2: train, evaluate, log ---------------------------------------
  try:
    X_train, X_test, y_train, y_test = preprocessing(dataset_frame)

    # Close any run left open by a previous execution of this cell.
    mlflow.end_run()
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name='gru2_d'):
      np.seterr(invalid='ignore')

      params = {
          "loss": "mean_squared_error",
          "optimizer": "adam",
          "dropout": 0.5,
          "gru_units": 90,
          "epochs": 50,
          "batch_size": 4,
          "es_patience": 10
      }

      model = get_model(params=params, input_shape=X_train.shape[1])

      # NOTE(review): the early-stopping callback is built but not passed to
      # fit() (the callbacks argument is commented out below), so training
      # always runs for the full number of epochs. Confirm whether that is
      # intended before wiring it in.
      es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_root_mean_squared_error',
                                                     mode='min',
                                                     patience=params["es_patience"])

      model.fit(
          X_train,
          y_train,
          validation_data=(X_test, y_test),
          epochs=params["epochs"],
          batch_size=params["batch_size"],
          verbose=1
          #callbacks=[neptune_callback, es_callback]
      )

      y_pred = model.predict(X_test)

      (rmse, mae, r2) = eval_metrics(y_test, y_pred)
      print("GRU model (n_estimators=%d, max_features=%d, max_depth=%d):" % (
      n_estimators, max_features, max_depth))
      print("  RMSE: %s" % rmse)
      print("  MAE: %s" % mae)
      print("  R2: %s" % r2)

      #TODO mlflow flavor
      mlflow.log_param("ds", ds)
      mlflow.log_param("n_estimators", n_estimators)
      mlflow.log_param("max_features", max_features)
      mlflow.log_param("max_depth", max_depth)
      # Record the hyperparameters that actually shaped this model.
      mlflow.log_params(params)
      metrics = {"rmse": float(rmse), "r2": float(r2), "mae": float(mae)}
      mlflow.log_metrics(metrics)

      tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

      if tracking_url_type_store != "file":
        # Remote tracking server: only the metrics dict is stored as an artifact.
        #mlflow.sklearn.log_model(model, "model", registered_model_name="GRU")
        mlflow.log_dict(metrics, "dir/data.json")
      else:
        # NOTE(review): `model` is a Keras model but is logged with the
        # sklearn flavor (see the TODO above) -- confirm this works as intended.
        mlflow.sklearn.log_model(model, "model")

  except Exception as e:
    logger.exception("Training or MLflow logging failed. Error: %s", e)


main()

        BEDAT  Price_Change_rel  End-Date  Schädling-TotalFrequency  \
0  2019-01-01          0.000428  1/1/2019                       0.0   
1  2019-01-02          0.000428  1/2/2019                       1.0   
2  2019-01-03          0.000428  1/3/2019                       3.0   
3  2019-01-04          0.000428  1/4/2019                       3.0   
4  2019-01-05          0.000428  1/5/2019                       0.0   

   Schädling-MeanSentiment  GeneralWoodDamage-TotalFrequency  \
0                     0.00                               1.0   
1                    -0.06                               1.0   
2                    -0.97                               0.0   
3                     0.01                               1.0   
4                     0.00                               0.0   

   GeneralWoodDamage-MeanSentiment  Wetter-TotalFrequency  \
0                             0.00                    0.0   
1                            -0.41                    1.0   
2    