# Notebook for generating the audit, model and task JSON files

### *Use only when the model works perfectly well as a 'code.py' script*

# Initialize the functions below

In [20]:
def parse_np_matrix_to_json(a):
    """Turn a (labels, counts) pair into a JSON-serializable dict.

    `a` is indexed positionally: `a[0]` holds the labels and `a[1]` the
    matching counts (the layout of `np.unique(..., return_counts=True)`).
    Every label is stringified to act as a key and every count is cast to
    a built-in `int`.
    """
    return {
        str(label): int(a[1][position])
        for position, label in enumerate(a[0])
    }

def summarize_categorical_variable(values, name, model):
    """Record the summary of a categorical column in `model`.

    The summary is stored under ``model['preprocessing'][name]``. Missing
    entries are excluded (via `dropna`) before the unique values and their
    frequencies are computed. All numerical statistics are set to None.
    """
    labels_and_counts = np.unique(values.dropna(), return_counts=True)
    summary = {
        "name": name,
        "type": "categorical",
        "number_of_unique_values": len(labels_and_counts[0]),
        "number_of_missing_values": int(values.isna().sum()),
        "cat_frequencies": parse_np_matrix_to_json(labels_and_counts),
    }
    # Numerical statistics do not apply to a categorical column.
    for statistic in ("num_minimum", "num_1qu", "num_median",
                      "num_mean", "num_3qu", "num_maximum"):
        summary[statistic] = None
    model['preprocessing'][name] = summary
    
def summarize_numerical_variable(values, name, model):
    """Record the summary of a numerical column in `model`.

    The summary is stored under ``model['preprocessing'][name]`` and uses
    the same set of keys as `summarize_categorical_variable`, so every
    entry of the preprocessing section shares one schema.

    Missing entries are excluded (via `dropna`) before any statistic is
    computed. If no observed value remains, all numerical statistics are
    None instead of raising on the empty input.

    Parameters:
        values: column to summarize; must provide `dropna()` and `isna()`
            (it is called with a pandas Series in this notebook).
        name: column name, used both as the key and as the "name" field.
        model: dict with a 'preprocessing' dict that receives the summary.
    """
    observed = values.dropna()

    minimum = first_quartile = median = mean = third_quartile = maximum = None
    if len(observed) > 0:
        minimum = float(np.min(observed))
        first_quartile = float(np.percentile(observed, 25))
        median = float(np.percentile(observed, 50))
        mean = float(np.mean(observed))
        third_quartile = float(np.percentile(observed, 75))
        maximum = float(np.max(observed))

    model['preprocessing'][name] = {
        "name": name,
        "type": "numerical",
        "number_of_unique_values": len(np.unique(observed)),
        "number_of_missing_values": int(values.isna().sum()),
        "cat_frequencies": None,
        "num_minimum": minimum,
        "num_1qu": first_quartile,
        "num_median": median,
        # "num_mean" was absent and the third quartile was stored as
        # "num_3q"; both now match the keys of the categorical summary.
        "num_mean": mean,
        "num_3qu": third_quartile,
        "num_maximum": maximum
    }

# Dependencies

In [21]:
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_curve, classification_report, confusion_matrix, roc_auc_score
from matplotlib import pyplot as plt

import openml
import pandas as pd
import numpy as np
import hashlib
import pkg_resources
import datetime
import json

# Metadata

In [23]:
# Author and creation date that are stamped on the generated JSON files.
added_by = "Siemashko"
date = datetime.datetime.now().strftime("%d-%m-%Y")

# Dataset and task identifiers; the task id joins the task type and target.
dataset_id = "openml_boston"
task_type = "classification"
task_target = "binaryClass"
task_id = f'{task_type}_{task_target}'

task = {
    "id": task_id,
    "added_by": added_by,
    "date": date,
    "dataset_id": dataset_id,
    "type": task_type,
    "target": task_target,
}

# The task description is written as a one-element JSON list.
with open('task.json', 'w') as task_file:
    task_file.write(json.dumps([task], indent=4))

# Load the data for your model

In [24]:
# OpenML identifier of the dataset to download.
datasetOpenmlId = 853

dataset = openml.datasets.get_dataset(datasetOpenmlId)

# Single source of truth for the target column name. The column is added to
# the frame under this name, so it must also be dropped/selected by this name
# (a hard-coded 'binaryClass' would break for any other dataset).
target_name = dataset.default_target_attribute

# NOTE(review): `return_categorical_indicator`, `return_attribute_names` and
# `include_ignore_attributes` look like keyword arguments of an older
# openml-python release - confirm against the installed version.
(X, y, categorical, names) = dataset.get_data(
    target=target_name,
    return_categorical_indicator=True,
    return_attribute_names=True,
    include_ignore_attributes=True
)

# Build a frame with one column per attribute, then append the target.
vals = {name: X[:, i] for i, name in enumerate(names)}
vals[target_name] = y
df = pd.DataFrame(vals)

X = df.drop(target_name, axis=1)
y = df.loc[:, target_name]
# Hold out 15% of the rows for the audit; stratify to keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

# Model and md5 hash - remember to keep the 'categorical' indicator in the same order as the column names

In [25]:
# Fit a logistic regression with its default parameters on the training split.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# The model id is the MD5 digest of the estimator's string representation.
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()

model = {
    "id": md5,
    "added_by": added_by,
    "date": date,
    "task_id": task_id,
    "dataset_id": dataset_id,
    "parameters": classifier.get_params(),
    "preprocessing": {},
}

# `categorical` is read position by position alongside `names`: the flag at
# a given index decides which summary is stored for the column at that index.
for position, column in enumerate(names):
    if categorical[position]:
        summarize = summarize_categorical_variable
    else:
        summarize = summarize_numerical_variable
    summarize(X_train.loc[:, column], column, model)

# The model description is written as a one-element JSON list.
with open('model.json', 'w') as model_file:
    model_file.write(json.dumps([model], indent=4))

# Audit

In [27]:
y_pred = classifier.predict(X_test)
# Second column of predict_proba - presumably the positive class; verify
# against `classifier.classes_`.
y_pred_proba = classifier.predict_proba(X_test)[:, 1]

# Reuse the metadata variables (as the model description does) instead of
# repeating their literal values, so task, model and audit JSONs cannot
# drift apart when the metadata is edited.
audit = {"id": f'audit_{md5}',
         "date": datetime.datetime.now().strftime("%d-%m-%Y"),
         "added_by": added_by,
         "model_id": md5,
         "task_id": task_id,
         "dataset_id": dataset_id}

# Performance is measured on the held-out test split.
audit['performance'] = {
             "ACC": classifier.score(X_test, y_test),
             "AUC": roc_auc_score(y_test, y_pred_proba)
         }

# The audit is written as a one-element JSON list.
with open('audit.json', 'w') as fp:
    json.dump([audit], fp, indent=4)