# Notebook for generating audit, model, task jsons, and code.py + sessionInfo.txt - basically everything

# Define the helper functions below

In [106]:
def parse_np_matrix_to_json(a):
    """Turn a ``(labels, counts)`` pair into a JSON-serializable dict.

    ``a[0]`` is the sequence of labels and ``a[1]`` the sequence of matching
    counts (this is how the result of ``np.unique(..., return_counts=True)``
    is passed in by the caller in this file). Every label becomes a string
    key and every count is cast to a built-in ``int``.
    """
    return {str(label): int(a[1][pos]) for pos, label in enumerate(a[0])}

def summarize_categorical_variable(values, name, model):
    """Record a categorical column summary under ``model['preprocessing'][name]``.

    ``values`` must support ``dropna()`` and ``isna()`` (the caller in this
    file passes a DataFrame column). Missing entries are counted separately
    and excluded from the unique-value count and the frequency table. All
    numeric statistics are stored as ``None``.
    """
    present = values.dropna()
    levels, counts = np.unique(present, return_counts=True)

    summary = {"name": name, "type": "categorical"}
    summary["number_of_unique_values"] = len(levels)
    summary["number_of_missing_values"] = int(values.isna().sum())
    summary["cat_frequencies"] = parse_np_matrix_to_json((levels, counts))
    # Numeric statistics do not apply to categorical columns.
    for stat in ("num_minimum", "num_1qu", "num_median",
                 "num_mean", "num_3qu", "num_maximum"):
        summary[stat] = None

    model['preprocessing'][name] = summary
    
def summarize_numerical_variable(values, name, model):
    """Record a numerical column summary under ``model['preprocessing'][name]``.

    ``values`` must support ``dropna()`` and ``isna()`` (the caller in this
    file passes a DataFrame column). Missing entries are counted separately
    and excluded from every statistic.

    The stored dict has the same keys, in the same order, as the one written
    by ``summarize_categorical_variable``; in particular it includes
    ``num_mean``. When the column has no non-missing values, all ``num_*``
    statistics are ``None`` instead of raising.
    """
    observed = values.dropna()

    def _describe(reducer, *args):
        # np.min / np.max / np.percentile raise on empty input, so an
        # all-missing column reports None rather than aborting the run.
        if observed.empty:
            return None
        return float(reducer(observed, *args))

    model['preprocessing'][name] = {
        "name": name,
        "type": "numerical",
        "number_of_unique_values": len(np.unique(observed)),
        "number_of_missing_values": int(values.isna().sum()),
        "cat_frequencies": None,
        "num_minimum": _describe(np.min),
        "num_1qu": _describe(np.percentile, 25),
        "num_median": _describe(np.percentile, 50),
        "num_mean": _describe(np.mean),
        "num_3qu": _describe(np.percentile, 75),
        "num_maximum": _describe(np.max)
    }
    
def generate_task_json(dataset_id, task_type, task_target, added_by):
    """Write ``task.json`` in the current working directory.

    Also publishes two module-level globals for later use: ``date`` (today's
    date formatted as ``DD-MM-YYYY``) and ``task_id`` (``<task_type>_<task_target>``).
    The file contains a one-element JSON list holding the task description.
    """
    global task_id, date
    date = datetime.datetime.now().strftime("%d-%m-%Y")
    task_id = '{}_{}'.format(task_type, task_target)

    # Field order is the order in which the keys appear in the file.
    fields = [
        ("id", task_id),
        ("added_by", added_by),
        ("date", date),
        ("dataset_id", dataset_id),
        ("type", task_type),
        ("target", task_target),
    ]
    serialized = json.dumps([dict(fields)], indent=4)

    with open('task.json', 'w') as out:
        out.write(serialized)
        
def load_data(dataset_openml_id, task_traget):
    """Download an OpenML dataset and return a train/test split.

    Parameters:
        dataset_openml_id: identifier passed to ``openml.datasets.get_dataset``.
        task_traget: name of the column to predict. The misspelled parameter
            name is kept so existing callers are unaffected.

    Returns:
        ``X_train, X_test, y_train, y_test`` from ``train_test_split`` with
        ``test_size=0.2`` and ``random_state=42``.

    Side effects:
        Seeds NumPy's global RNG with 42 and publishes the globals ``names``
        (attribute names) and ``categorical`` (per-attribute categorical
        flags), which ``generate_model_audit_json_and_code_py_for_classifier``
        reads.
    """
    # `categorical` must be global too: the model generator indexes it, and
    # leaving it local here made that lookup depend on stale notebook state.
    global names, categorical
    # Use the argument rather than silently reading a global `task_target`.
    task_target = task_traget
    np.random.seed(42)

    dataset = openml.datasets.get_dataset(dataset_openml_id)
    (X, y, categorical, names) = dataset.get_data(
        target=dataset.default_target_attribute,
        return_categorical_indicator=True,
        return_attribute_names=True,
        include_ignore_attributes=True
    )

    # Rebuild a single frame (features + default target) so that any column
    # can be selected as the prediction target.
    vals = {name: X[:, i] for i, name in enumerate(names)}
    vals[dataset.default_target_attribute] = y
    df = pd.DataFrame(vals)
    X = df.drop(task_target, axis=1)
    y = df.loc[:, task_target]
    return train_test_split(X, y, test_size=0.2, random_state=42)

def generate_model_audit_json_and_code_py_for_classifier(classifier_class, params, repository_absolute_path):
    """Fit one classifier and write its model.json, audit.json, sessionInfo.txt and code.py.

    Parameters:
        classifier_class: estimator class; instantiated as ``classifier_class(**params)``.
        params: keyword arguments for the estimator; also embedded verbatim
            into the generated ``code.py``.
        repository_absolute_path: root directory; files are written to
            ``<root>/models/<dataset_id>/<task_id>/<md5>/``.

    Reads these module-level globals: ``X_train``, ``y_train``, ``X_test``,
    ``y_test``, ``transform_pipeline``, ``names``, ``categorical``,
    ``added_by``, ``date``, ``task_id``, ``dataset_id``,
    ``dataset_openml_id`` and ``task_target``.

    The model id is the MD5 of ``str(classifier)``. The audit assumes a
    binary problem: the confusion matrix is unpacked into exactly four
    values and column 1 of ``predict_proba`` is used for the AUC.
    """
    classifier = classifier_class(**params)

    classifier.fit(X_train, y_train)

    md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
    model_dir = f'{repository_absolute_path}/models/{dataset_id}/{task_id}/{md5}'

    model = {
        "id": md5,
        "added_by": added_by,
        "date": date,
        "library": "scikit",
        "model_name": classifier_class.__name__,
        "task_id": task_id,
        "dataset_id": dataset_id,
        "parameters": classifier.get_params(),
        "preprocessing": {}
    }

    print(f'md5 hash: {md5}')

    # Fill model['preprocessing'] with one summary per input column.
    for column, is_categorical in zip(names, categorical):
        if is_categorical:
            summarize_categorical_variable(X_train.loc[:, column], column, model)
        else:
            summarize_numerical_variable(X_train.loc[:, column], column, model)

    # Create the whole directory chain; only an existing directory is
    # tolerated, any other OS error (permissions, bad path) propagates.
    if os.path.isdir(model_dir):
        print(f'Directory {md5} already exists')
    os.makedirs(model_dir, exist_ok=True)

    with open(f'{model_dir}/model.json', 'w') as out:
        json.dump([model], out, indent=4)

    # X_test is passed through the fitted preprocessing pipeline here.
    y_pred = classifier.predict(transform_pipeline.transform(X_test))
    y_pred_proba = classifier.predict_proba(transform_pipeline.transform(X_test))[:,1]

    audit = {"id": f'audit_{md5}',
             "date": datetime.datetime.now().strftime("%d-%m-%Y"),
             "added_by": added_by,
             "model_id": md5,
             "task_id": task_id,
             "dataset_id": dataset_id}

    # fp is the false-positive count here, hence file handles are named `out`.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    audit['performance'] = {
                 "acc": accuracy_score(y_test, y_pred),
                 "auc": roc_auc_score(y_test, y_pred_proba),
                 "precision": precision_score(y_test, y_pred),
                 "recall": recall_score(y_test, y_pred),
                 "specificity": tn/(tn+fp),
                 "f1": f1_score(y_test, y_pred)
             }

    with open(f'{model_dir}/audit.json', 'w') as out:
        json.dump([audit], out, indent=4)

    sessionInfo = {
        "python_version": python_version(),
        "library_versions":[str(d) for d in pkg_resources.working_set]
    }
    with open(f'{model_dir}/sessionInfo.txt', 'w') as out:
        json.dump(sessionInfo, out, indent=4)

    # Stand-alone script that reproduces this model; doubled braces are
    # literal braces in the generated file.
    code_py = f"""
#:# libraries
from {classifier_class.__module__} import {classifier_class.__name__}
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
from matplotlib import pyplot as plt
from platform import python_version

import openml
import pandas as pd
import numpy as np
import hashlib
import pkg_resources
import datetime
import json
import os

#:# config

np.random.seed(42)

#:# data

datasetId = {dataset_openml_id}
task_target = '{task_target}'

dataset = openml.datasets.get_dataset(datasetId)
(X, y, categorical, names) = dataset.get_data(
    target=dataset.default_target_attribute,
    return_categorical_indicator=True,
    return_attribute_names=True,
    include_ignore_attributes=True
)

vals = {{}}
for i, name in enumerate(names):
    vals[name] = X[:, i]
vals[dataset.default_target_attribute] = y
df = pd.DataFrame(vals)

X = df.drop(task_target, axis=1)
y = df.loc[:, task_target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#:# preprocessing

transform_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

#:# model

params = {params}

classifier = {classifier_class.__name__}(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# {md5}
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {{md5}}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(transform_pipeline.transform(X_test))[:,1]

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {{accuracy_score(y_test, y_pred)}}')
print(f'auc: {{roc_auc_score(y_test, y_pred_proba)}}')
print(f'precision: {{precision_score(y_test, y_pred)}}')
print(f'recall: {{recall_score(y_test, y_pred)}}')
print(f'specificity: {{tn/(tn+fp)}}')
print(f'f1: {{f1_score(y_test, y_pred)}}')

#:# session info

# Dodaj wersję pythona w session info

sessionInfo = {{
    "python_version": python_version(),
    "library_versions":[str(d) for d in pkg_resources.working_set]
}}
with open('sessionInfo.txt', 'w') as f:
    json.dump(sessionInfo, f, indent=4)
    """
    # The template contains a non-ASCII character, so pin the encoding
    # instead of relying on the platform default.
    with open(f'{model_dir}/code.py', 'w', encoding='utf-8') as out:
        out.write(code_py)

# Dependencies

In [107]:
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, ParameterGrid
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm.classes import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
from matplotlib import pyplot as plt
from platform import python_version

import openml
import pandas as pd
import numpy as np
import hashlib
import pkg_resources
import datetime
import json
import os
import re

import warnings
warnings.filterwarnings('ignore')

# Specify metadata

In [108]:
# Run-wide metadata. The functions defined above read these names as
# module-level globals, so this cell must run before any generation cell.
added_by = "Siemashko"  # written into the task, model and audit JSON files
dataset_id = "openml_boston"  # used in JSON records and in the output directory path
task_type = "classification"  # first half of the generated task id
task_target = "binaryClass"  # column to predict; second half of the task id
dataset_openml_id = 853  # id passed to openml.datasets.get_dataset
# Root under which models/<dataset_id>/<task_id>/<md5>/ is written.
repository_absolute_path = "/home/siemashko/Desktop/2019L-WarsztatyBadawcze"
# Writes task.json to the current working directory and sets the globals
# `task_id` and `date`.
generate_task_json(dataset_id, task_type, task_target, added_by)

# Preprocess the data

In [109]:
# Download the dataset and split it 80/20 (see load_data).
X_train, X_test, y_train, y_test = load_data(dataset_openml_id, task_target)
# Preprocessing consists of a single standardization step.
transform_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Fit the scaler on the training split only and keep the column names.
# X_test stays untransformed here; the generator function applies
# transform_pipeline.transform(X_test) when it predicts.
X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

# Set models to create and specify the param grids

In [110]:
# Maps each classifier class to the expanded list of parameter sets to train.
# probability=True is set for SVC because the audit calls predict_proba.
models_to_create = {
    SVC: list(ParameterGrid([
        {'kernel': ['linear'],
         "probability": [True],
         "C": [1,2,3,5,10]},
        {'kernel': ['rbf', 'sigmoid'],
         'gamma': [1, 3, 5, 7, 10],
         "probability": [True]},
    ])),
    RandomForestClassifier: list(ParameterGrid([
        {"n_estimators":[25,75,150,300,600],
         "max_depth":[3,5,7]},
    ])),
    AdaBoostClassifier: list(ParameterGrid([
        {"n_estimators":[25,75,150,300,600],
         "learning_rate": [0.5,0.75,1.0]},
    ])),
    LogisticRegression: list(ParameterGrid([
        {"solver": ['liblinear', 'sag', 'saga'],
         "C": [0.2, 0.4, 0.6, 0.8, 1.0]},
    ])),
}

# Let's generate everything!

In [111]:
# Train one model per (classifier class, parameter set) pair and write its
# model.json, audit.json, sessionInfo.txt and code.py under the repository.
# Each call prints the md5 hash that names the model's output directory.
for classifier_class, param_grid in models_to_create.items():
    for param_set in param_grid:
        generate_model_audit_json_and_code_py_for_classifier(classifier_class, param_set, repository_absolute_path)

md5 hash: 43d227cc7efa0a4cee5561ca8fa67cd6
md5 hash: 5d0ef2943274babaa8ffdc44806969fa
md5 hash: b76f91e625461db4ff79add1a1f6b2fd
md5 hash: 1b2ae3896802721475b2b96e5717707a
md5 hash: 6652b5806fde3819cc18722add2b9de2
md5 hash: de79316bb99fb627637bec6d2d8ebf9a
md5 hash: 5450f634b38d0176cd0fee5c58d119ae
md5 hash: fb5079f665ad626649b88a2f9184c9b1
md5 hash: df94da09f57f342815273eff788904e3
md5 hash: 6232a6d03c170d5991895983968835ac
md5 hash: aad366f6d5961bc98783c2ad9fb3918d
md5 hash: a1037b17a14bb76f44b4341d88d32d1c
md5 hash: 0570deacf01bb82e54171b62cbf5e909
md5 hash: 450a83cf002478d586f76a1a2e2acc9a
md5 hash: fd39940d73b3520bc363cc9825c33449
md5 hash: e352321c86c86dd2c42f72e19c802015
md5 hash: 5475503c9e4b64dc0dcc4960399cf72c
md5 hash: e763412d8e1070cf29ed7ed5f8d67a8e
md5 hash: bf7d60a8ac3d7cac141c27c44bce5ddd
md5 hash: 1fa5683f41d41a252c84a84d0ac110e2
md5 hash: 15a5d9326e73c72861477f8b912aaa4a
md5 hash: 8187bc79526114bd041f226851977941
md5 hash: e315788413ea98a4d62051e59f55a3f7
md5 hash: c