In [0]:
# Install the `mlflow` package from PyPI through the Databricks library utility.
# No version is pinned, so the release that gets installed can differ between runs.
# NOTE(review): `dbutils.library.installPyPI` is reported as deprecated on newer
# Databricks runtimes in favour of `%pip install` — confirm against the cluster runtime.
dbutils.library.installPyPI("mlflow")
# The Python restart after installation is intentionally left disabled here.
#dbutils.library.restartPython()

In [0]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import mlflow
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
from sklearn.ensemble import IsolationForest

In [0]:
%sql
-- Builds (or replaces) the view `events_nc` from the table `better`.
-- Only rows with event.audit.operation = "contact_read" are kept, and rows whose
-- event.actor.id is NULL or the literal "N/A" are dropped.
-- Output grain: one row per (timestamp truncated to the day, event.actor.id), with
--   nr_ip_addresses  = number of distinct event.actor.ip_addresses values,
--   nr_user_agents   = number of distinct event.actor.user_agent values,
--   nr_contact_reads = number of matching rows in the group.
-- NOTE(review): event.actor.id is selected without an alias; presumably it surfaces
-- as column `id` in the view — confirm before referencing it downstream.
-- NOTE(review): the `timestamp` alias reuses the source column name; the GROUP BY
-- repeats the DATE_TRUNC expression explicitly rather than relying on the alias.
CREATE OR REPLACE VIEW events_nc AS 
SELECT DATE_TRUNC('day', timestamp) as timestamp,
       event.actor.id,
       count(distinct event.actor.ip_addresses) as nr_ip_addresses,
       count(distinct event.actor.user_agent) as nr_user_agents,
       count(*) as nr_contact_reads
FROM better
WHERE event.audit.operation = "contact_read"
AND event.actor.id is not null
AND event.actor.id != "N/A"
GROUP BY DATE_TRUNC('day', timestamp), event.actor.id


In [0]:
class IForest:
    """Small wrapper that builds a scikit-learn ``IsolationForest`` and fits it on construction.

    The fitted estimator is exposed as ``self.model``.
    """

    def __init__(self, scaled_user_ts, contamination=0.01, random_state=0, verbose=0, max_features=1.0, n_estimators=100):
        """Create the isolation forest and fit it immediately.

        Args:
            scaled_user_ts: Training data handed straight to ``IsolationForest.fit``.
                Despite the name, no scaling is applied in this class; the data
                is used exactly as supplied.
            contamination: Forwarded to ``IsolationForest(contamination=...)``.
            random_state: Forwarded to ``IsolationForest(random_state=...)``.
            verbose: Forwarded to ``IsolationForest(verbose=...)``.
            max_features: Forwarded to ``IsolationForest(max_features=...)``.
            n_estimators: Forwarded to ``IsolationForest(n_estimators=...)``.
                Defaults to 100, the value that used to be hard-coded.
        """
        self.model = IsolationForest(
            n_estimators=n_estimators,
            contamination=contamination,
            random_state=random_state,
            verbose=verbose,
            max_features=max_features,
        )
        # Training happens in the constructor, so an instance is always fitted.
        self.model.fit(scaled_user_ts)

    def predict(self, ts):
        """Return ``self.model.predict(ts)`` unchanged.

        Per the scikit-learn documentation, ``IsolationForest.predict`` yields
        +1 for inliers and -1 for outliers.
        """
        return self.model.predict(ts)

In [0]:
# Feature columns of the `events_nc` view that the anomaly model is trained on.
FEATURE_COLUMNS = ['nr_ip_addresses', 'nr_user_agents', 'nr_contact_reads']

# Project to the feature columns in Spark *before* collecting, so the view's
# other columns are never transferred to the driver only to be discarded.
data = spark.table('events_nc').select(*FEATURE_COLUMNS).toPandas()

In [0]:
# Value passed as the isolation forest's `contamination` setting; also logged
# as a run parameter below.
contamination = 0.001

# NOTE(review): despite its name, this holds whatever `set_experiment` returns
# (an Experiment object in recent MLflow releases, not a bare id) — confirm
# before relying on it elsewhere.
experiment_id = mlflow.set_experiment('/nc-multivariate')
with mlflow.start_run():

  # Fit the isolation forest on the feature frame loaded earlier.
  iforest = IForest(data, contamination=contamination, random_state=42, verbose=0)

  mlflow.log_param("contamination", contamination)

  # Log and register the fitted scikit-learn estimator itself rather than the
  # notebook-local IForest wrapper: the sklearn flavor expects a scikit-learn
  # model, and a pickled wrapper ties every consumer to this notebook's class.
  mlflow.sklearn.log_model(
    sk_model=iforest.model,
    artifact_path="sklearn-model",
    registered_model_name="nc_multivariate"
  )
  # No explicit mlflow.end_run() here: leaving the `with` block ends the run.
  
#  client = mlflow.tracking.MlflowClient()
#  client.transition_model_version_stage(
#    name='nc_multivariate',
#    version='',
#    stage='Production'
#    )