##1.2 Install and load libraries

In [None]:
!pip install wandb

In [None]:
import logging
import wandb
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import fbeta_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Login to Weights & Biases
!wandb login --relogin

##1.3 Holdout Configuration


In [None]:
# global variables

# fraction of train.csv held out as the validation set
val_size = 0.30

# seed used for reproducibility of the split
seed = 41

# reference (column) to stratify the data; it is also the target column
stratify = "Outcome"

# name of the input artifact
artifact_input_name = "diabetes_nn/train.csv:latest"

# type of the artifact
artifact_type = "Train"

In [None]:
# configure the root logger: timestamped, INFO level and above
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt='%d-%m-%Y %H:%M:%S',
)
logger = logging.getLogger()

# open the W&B run that tracks this session
run = wandb.init(project="diabetes_nn", job_type="train")

# fetch the train artifact and load it as a DataFrame
logger.info("Downloading and reading train artifact")
train_artifact = run.use_artifact(artifact_input_name)
local_path = train_artifact.file()
df_train = pd.read_csv(local_path)

# shuffled hold-out split of train.csv into train and validation parts,
# stratified on the target column so both parts keep its class balance
logger.info("Spliting data into train/val")
features = df_train.drop(columns=stratify)
target = df_train[stratify]
x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=val_size,
    random_state=seed,
    shuffle=True,
    stratify=target,
)

In [None]:
logger.info("x train: {}".format(x_train.shape))
logger.info("y train: {}".format(y_train.shape))
logger.info("x val: {}".format(x_val.shape))
logger.info("y val: {}".format(y_val.shape))

22-07-2022 03:31:14 x train: (375, 8)
22-07-2022 03:31:14 y train: (375,)
22-07-2022 03:31:14 x val: (162, 8)
22-07-2022 03:31:14 y val: (162,)


##1.4 Data preparation

Data preparation may be the most important part of a machine learning project.

It is the most time-consuming part, although it seems to be the least discussed topic.

Data preparation, sometimes referred to as data preprocessing, is the act of transforming raw data into a form that is appropriate for modeling.

Machine learning algorithms require input data to be numeric, and most algorithm implementations maintain this expectation. As such, if your data contains data types and values that are not numbers, such as labels, you will need to change the data into numbers. Further, specific machine learning algorithms have expectations regarding the data types, scale, probability distribution, and relationships between input variables, and you may need to change the data to meet these expectations.

The philosophy of data preparation is to discover how to best expose the unknown underlying structure of the problem to the learning algorithms. This often requires an iterative path of experimentation through a suite of different data preparation techniques in order to discover what works well or best. The vast majority of the machine learning algorithms you may use on a project are years to decades old. The implementation and application of the algorithms are well understood. So much so that they are routine, with amazing, fully-featured open-source machine learning libraries like scikit-learn in Python.

The thing that is different from project to project is the data.
You may be the first person (ever!) to use a specific dataset as the basis for a predictive modeling project. As such, the preparation of the data in order to best present it to the problem of the learning algorithms is the primary task of any modern machine learning project.

The challenge of data preparation is that each dataset is unique and different.
Datasets differ in the number of variables (tens, hundreds, thousands, or more), the types of the variables (numeric, nominal, ordinal, boolean), the scale of the variables, the drift in the values over time, and more. As such, this makes discussing data preparation a challenge. Either specific case studies are used, or focus is put on the general methods that can be used across projects. The result is that neither approach is explored.

###1.4.1 Outlier removal

In [None]:
logger.info("Outlier Removal")
# temporary frame for LOF: integer columns followed by float columns
int_part = x_train.select_dtypes("int64").copy()
float_part = x_train.select_dtypes("float64").copy()
x = int_part.join(float_part)

# rows labelled -1 by LOF are treated as outliers; mask is True for rows to keep
lof = LocalOutlierFactor()
outlier = lof.fit_predict(x)
mask = ~(outlier == -1)

22-07-2022 03:07:51 Outlier Removal


In [None]:
logger.info("x_train shape [original]: {}".format(x_train.shape))
logger.info("x_train shape [outlier removal]: {}".format(x_train.loc[mask,:].shape))

22-07-2022 03:07:51 x_train shape [original]: (375, 8)
22-07-2022 03:07:51 x_train shape [outlier removal]: (346, 8)


In [None]:
# To avoid data leakage, outlier removal is applied to the training split only;
# it must not be part of the preprocessing stage that is also applied to new data.
# Note that we did not perform this procedure on the validation set.
x_train = x_train.loc[mask,:].copy()
y_train = y_train[mask].copy()

###1.4.2 Encoding Target Variable (not necessary for diabetes dataset)

In [None]:
y_train.head(10)

In [None]:
logger.info("Encoding Target Variable")
# learn the label -> integer mapping from the training target only
le = LabelEncoder().fit(y_train)

# apply that same mapping to both splits; the validation target is never
# used for fitting (avoiding data leakage)
y_train = le.transform(y_train)
y_val = le.transform(y_val)

decoded_classes = le.inverse_transform([0, 1])
logger.info("Classes [0, 1]: {}".format(decoded_classes))

In [None]:
y_train

In [None]:
y_val

###1.4.3 Encoding independent variables (not necessary for diabetes dataset)

In [None]:
x_train.head()

In [None]:
x_val.head()

### Don't execute

In [None]:
# just an experimentation

# drop=first erase redundant column
onehot = OneHotEncoder(sparse=False,drop="first")

# fit using x_train
onehot.fit(x_train["New_BMI_Range"].values.reshape(-1,1))

# make a copy
x_train_aux = x_train.copy()

# transform train 
x_train_aux[onehot.get_feature_names_out()] = onehot.transform(x_train_aux["New_BMI_Range"].values.reshape(-1,1))
x_train_aux.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,New_BMI_Range,New_Glucose_Class,New_BloodPressure,x0_Obese,x0_Overweight,x0_Underweight
284,0,147,85,54,0,42.8,0.375,24,Obese,Prediabetes,HS1,1.0,0.0,0.0
412,8,107,80,0,0,24.6,0.856,34,Healty,Normal,HS1,0.0,0.0,0.0
266,1,88,62,24,44,29.9,0.422,23,Obese,Normal,Normal,1.0,0.0,0.0
258,3,148,66,25,0,32.5,0.256,22,Obese,Prediabetes,Normal,1.0,0.0,0.0
109,10,162,84,0,0,27.7,0.182,54,Overweight,Prediabetes,HS1,0.0,1.0,0.0


In [None]:
# Inverse transformation
onehot.inverse_transform(np.array([0,1]).reshape(-1,1))

In [None]:
# Inverse transformation (other way)
onehot.inverse_transform([[0],[1]])

In [None]:
# Return the name of the new feature
onehot.get_feature_names_out()

array(['x0_Obese', 'x0_Overweight', 'x0_Underweight'], dtype=object)

###1.4.4 Encoding independent variable (not necessary for diabetes dataset)

In [None]:
x_train.dtypes

In [None]:
# Show the shape of x train before transformation
x_train.shape

In [None]:
# Lets review what are categorical columns
x_train.select_dtypes("object").columns.to_list()


In [None]:
x_train.head()

In [None]:
x_val.head()

### Don't execute

In [None]:
# one-hot encode every "object" column; each encoder is fitted on x_train only
# and then applied to both splits
object_columns = x_train.select_dtypes("object").columns.to_list()
for name in object_columns:
    onehot = OneHotEncoder(sparse=False,drop="first")

    train_values = x_train[name].values.reshape(-1,1)
    val_values = x_val[name].values.reshape(-1,1)
    onehot.fit(train_values)

    # new indicator columns are named by the encoder
    encoded_names = onehot.get_feature_names_out()
    x_train[encoded_names] = onehot.transform(train_values)
    x_val[encoded_names] = onehot.transform(val_values)

In [None]:
x_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,New_BMI_Range,New_Glucose_Class,New_BloodPressure,x0_Obese,x0_Overweight,x0_Underweight,x0_Prediabetes,x0_HS2,x0_Normal
284,0,147,85,54,0,42.8,0.375,24,Obese,Prediabetes,HS1,1.0,0.0,0.0,1.0,0.0,0.0
412,8,107,80,0,0,24.6,0.856,34,Healty,Normal,HS1,0.0,0.0,0.0,0.0,0.0,0.0
266,1,88,62,24,44,29.9,0.422,23,Obese,Normal,Normal,1.0,0.0,0.0,0.0,0.0,1.0
258,3,148,66,25,0,32.5,0.256,22,Obese,Prediabetes,Normal,1.0,0.0,0.0,1.0,0.0,1.0
109,10,162,84,0,0,27.7,0.182,54,Overweight,Prediabetes,HS1,0.0,1.0,0.0,1.0,0.0,0.0


In [None]:
x_val.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,New_BMI_Range,New_Glucose_Class,New_BloodPressure,x0_Obese,x0_Overweight,x0_Underweight,x0_Prediabetes,x0_HS2,x0_Normal
300,10,115,0,0,0,0.0,0.261,30,Underweight,Normal,Normal,0.0,0.0,1.0,0.0,0.0,1.0
158,6,91,0,0,0,29.8,0.501,31,Overweight,Normal,Normal,0.0,1.0,0.0,0.0,0.0,1.0
307,4,116,72,12,87,22.1,0.463,37,Healty,Normal,Normal,0.0,0.0,0.0,0.0,0.0,1.0
32,5,115,98,0,0,52.9,0.209,28,Obese,Normal,HS2,1.0,0.0,0.0,0.0,1.0,0.0
116,2,114,68,22,0,28.7,0.092,25,Overweight,Normal,Normal,0.0,1.0,0.0,0.0,0.0,1.0


In [None]:
cols=['New_BMI_Range','New_Glucose_Class','New_BloodPressure']

In [None]:
x_train.drop(labels=cols,axis=1,inplace=True)
x_val.drop(labels=cols,axis=1,inplace=True)

In [None]:
x_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,x0_Obese,x0_Overweight,x0_Underweight,x0_Prediabetes,x0_HS2,x0_Normal
284,0,147,85,54,0,42.8,0.375,24,1.0,0.0,0.0,1.0,0.0,0.0
412,8,107,80,0,0,24.6,0.856,34,0.0,0.0,0.0,0.0,0.0,0.0
266,1,88,62,24,44,29.9,0.422,23,1.0,0.0,0.0,0.0,0.0,1.0
258,3,148,66,25,0,32.5,0.256,22,1.0,0.0,0.0,1.0,0.0,1.0
109,10,162,84,0,0,27.7,0.182,54,0.0,1.0,0.0,1.0,0.0,0.0


In [None]:
x_val.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,x0_Obese,x0_Overweight,x0_Underweight,x0_Prediabetes,x0_HS2,x0_Normal
300,10,115,0,0,0,0.0,0.261,30,0.0,0.0,1.0,0.0,0.0,1.0
158,6,91,0,0,0,29.8,0.501,31,0.0,1.0,0.0,0.0,0.0,1.0
307,4,116,72,12,87,22.1,0.463,37,0.0,0.0,0.0,0.0,0.0,1.0
32,5,115,98,0,0,52.9,0.209,28,1.0,0.0,0.0,0.0,1.0,0.0
116,2,114,68,22,0,28.7,0.092,25,0.0,1.0,0.0,0.0,0.0,1.0


### run finish

In [None]:
# close the current run before to execute the next section
run.finish()

###1.4.5 Using full-pipeline

#### running 1.3 again

In [None]:
# global variables

# fraction of train.csv held out as the validation set
val_size = 0.30

# seed used for reproducibility of the split
seed = 41

# reference (column) to stratify the data; it is also the target column
stratify = "Outcome"

# name of the input artifact
artifact_input_name = "diabetes_nn/train.csv:latest"

# type of the artifact
artifact_type = "Train"

In [None]:
# configure the root logger: timestamped, INFO level and above
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt='%d-%m-%Y %H:%M:%S',
)
logger = logging.getLogger()

# open the W&B run that tracks this session
run = wandb.init(project="diabetes_nn", job_type="train")

# fetch the train artifact and load it as a DataFrame
logger.info("Downloading and reading train artifact")
train_artifact = run.use_artifact(artifact_input_name)
local_path = train_artifact.file()
df_train = pd.read_csv(local_path)

# shuffled hold-out split of train.csv into train and validation parts,
# stratified on the target column so both parts keep its class balance
logger.info("Spliting data into train/val")
features = df_train.drop(columns=stratify)
target = df_train[stratify]
x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=val_size,
    random_state=seed,
    shuffle=True,
    stratify=target,
)

In [None]:
logger.info("x train: {}".format(x_train.shape))
logger.info("y train: {}".format(y_train.shape))
logger.info("x val: {}".format(x_val.shape))
logger.info("y val: {}".format(y_val.shape))

#### running 1.4.1 again

In [None]:
logger.info("Outlier Removal")
# temporary variable: integer AND float columns, exactly as in section 1.4.1.
# Using only the int64 columns here would leave BMI and
# DiabetesPedigreeFunction out of the outlier detection and flag a different
# set of rows than the first pass did.
x = x_train.select_dtypes("int64").copy()
x = x.join(x_train.select_dtypes("float64").copy())

# identify outliers in the dataset; rows labelled -1 are dropped later via mask
lof = LocalOutlierFactor()
outlier = lof.fit_predict(x)
mask = outlier != -1

In [None]:
logger.info("x_train shape [original]: {}".format(x_train.shape))
logger.info("x_train shape [outlier removal]: {}".format(x_train.loc[mask,:].shape))

In [None]:
# To avoid data leakage, outlier removal is applied to the training split only;
# it must not be part of the preprocessing stage that is also applied to new data.
# Note that we did not perform this procedure on the validation set.
x_train = x_train.loc[mask,:].copy()
y_train = y_train[mask].copy()

#### running 1.4.2 again

In [None]:
y_train.head(10)

In [None]:
logger.info("Encoding Target Variable")
# define a categorical encoding for target variable
le = LabelEncoder()

In [None]:
logger.info("Encoding Target Variable")
# learn the label -> integer mapping from the training target only
le = LabelEncoder().fit(y_train)

# apply that same mapping to both splits; the validation target is never
# used for fitting (avoiding data leakage)
y_train = le.transform(y_train)
y_val = le.transform(y_val)

decoded_classes = le.inverse_transform([0, 1])
logger.info("Classes [0, 1]: {}".format(decoded_classes))

In [None]:
y_train

In [None]:
y_val

####1.4.5.1 Feature Extractor

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the columns named in ``feature_names``."""

    def __init__(self, feature_names):
        # labels handed to ``X[...]`` in transform
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the estimator itself."""
        return self

    def transform(self, X, y=None):
        """Return the ``feature_names`` columns of ``X``."""
        selected = X[self.feature_names]
        return selected

In [None]:
x_val.dtypes

In [None]:
# for validation purposes
fs = FeatureSelector(x_train.select_dtypes("object").columns.to_list())
df = fs.fit_transform(x_train)
df.head()

In [None]:
# for validation purposes
fs = FeatureSelector(x_train.select_dtypes("int64").columns.to_list())
df = fs.fit_transform(x_train)
df.head()

In [None]:
# for validation purposes
fs = FeatureSelector(x_train.select_dtypes("float").columns.to_list())
df = fs.fit_transform(x_train)
df.head()

####1.4.5.2 Handling Categorical Features (no categorical features)

In [None]:
x_val

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
300,10,115,0,0,0,0.0,0.261,30
158,6,91,0,0,0,29.8,0.501,31
307,4,116,72,12,87,22.1,0.463,37
32,5,115,98,0,0,52.9,0.209,28
116,2,114,68,22,0,28.7,0.092,25
...,...,...,...,...,...,...,...,...
151,5,147,75,0,0,29.9,0.434,28
193,9,124,70,33,402,35.4,0.282,34
291,7,150,78,29,126,35.2,0.692,54
139,1,117,60,23,106,33.8,0.466,27


In [None]:
# Handling categorical features
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """Strip surrounding whitespace from the string columns of a frame.

    Parameters
    ----------
    new_features : bool, default=True
        When True, ``transform`` refreshes ``colnames`` from the columns of
        the frame it produced.
    colnames : sequence of str or None, default=None
        Column labels used to build the working DataFrame.
    """

    def __init__(self, new_features=True, colnames=None):
        self.new_features = new_features
        self.colnames = colnames
        # debug trace kept from the original notebook (it shows up in the
        # recorded cell output)
        print(self.colnames)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the estimator itself."""
        return self

    def get_feature_names_out(self, input_features=None):
        """Return the current column names as a plain list.

        ``colnames`` may be a list (what the notebook passes in), a pandas
        Index (what ``transform`` stores) or None, so it is normalised with
        ``list()`` instead of calling ``.tolist()``, which lists do not have.
        ``input_features`` is accepted for scikit-learn compatibility and
        ignored.
        """
        if self.colnames is None:
            return []
        return list(self.colnames)

    def transform(self, X, y=None):
        """Return a DataFrame copy of ``X`` with string columns stripped."""
        df = pd.DataFrame(X, columns=self.colnames)

        # Remove white space in categorical features. The ``.str`` accessor
        # raises AttributeError on numeric dtypes, so those columns are
        # passed through untouched.
        def _strip(col):
            is_text = (pd.api.types.is_object_dtype(col.dtype)
                       or pd.api.types.is_string_dtype(col.dtype))
            return col.str.strip() if is_text else col

        df = df.apply(_strip)

        # customize feature?
        # How can I identify what needs to be modified? EDA!!!!
        if self.new_features:
            # update column names
            self.colnames = df.columns

        return df

In [None]:
# for validation purposes
fs = FeatureSelector(x_train.select_dtypes("object").columns.to_list())
df = fs.fit_transform(x_train)
df.head()

284
412
266
258
109


In [None]:
df

284
412
266
258
109
...
448
65
106
269
439


In [None]:
# for validation purposes
ct = CategoricalTransformer(new_features=True,colnames=df.columns.tolist())
df_cat = ct.fit_transform(df)
df_cat.head()

[]


284
412
266
258
109


In [None]:
# check the cardinality before and after transformation
x_train.select_dtypes("object").apply(pd.Series.nunique)

Series([], dtype: float64)

In [None]:
# check the cardinality before and after transformation
df_cat.apply(pd.Series.nunique)

Series([], dtype: float64)

####1.4.5.3 Handling Numerical Features


In [None]:
# transform numerical features
class NumericalTransformer(BaseEstimator, TransformerMixin):
    """Replace zero placeholders with column medians, then optionally scale.

    Zeros in every column except those listed in ``ZERO_IS_VALID`` are
    replaced by the rounded median of that column's non-zero values
    (presumably zeros encode missing measurements in this dataset -- confirm
    against the EDA).

    The medians and the scaler are both learned in ``fit`` and only applied in
    ``transform``. Data passed to ``transform`` is therefore imputed and
    scaled with the statistics of the data seen in ``fit``, and the scaler is
    fitted on the same imputed values it later transforms.

    Parameters
    ----------
    model : int, default=0
        0 -> MinMaxScaler, 1 -> StandardScaler, any other value -> no scaling.
    colnames : list of str or None, default=None
        Column labels used to build the working DataFrame; refreshed in
        ``fit`` from the columns actually seen.
    """

    # columns in which 0 is kept as a real value and never imputed
    ZERO_IS_VALID = ("Pregnancies",)

    def __init__(self, model=0, colnames=None):
        self.model = model
        self.colnames = colnames
        self.scaler = None

    def _frame(self, X):
        """Build a private DataFrame copy so the caller's data is never mutated."""
        return pd.DataFrame(X, columns=self.colnames, copy=True)

    def _impute(self, df):
        """Replace zeros using the medians learned in ``fit``."""
        for col, value in self.medians_.items():
            if col in df.columns:
                # plain assignment: ``df[col].replace(..., inplace=True)`` is
                # chained assignment and does nothing under copy-on-write
                df[col] = df[col].replace(0, value)
        return df

    def fit(self, X, y=None):
        """Learn the per-column medians and fit the scaler on imputed data."""
        df = self._frame(X)

        self.medians_ = {}
        for col in df.columns:
            if col in self.ZERO_IS_VALID:
                continue
            median = df.loc[df[col] != 0, col].median()
            # a column with no non-zero value has no median; leave it as is
            if pd.notna(median):
                self.medians_[col] = round(median)

        df = self._impute(df)

        # update columns name
        self.colnames = df.columns.tolist()

        if self.model == 0:
            self.scaler = MinMaxScaler().fit(df)
        elif self.model == 1:
            self.scaler = StandardScaler().fit(df)
        else:
            self.scaler = None
        return self

    # return columns names after transformation
    def get_feature_names_out(self, input_features=None):
        """Return the column names; ``input_features`` is ignored."""
        return self.colnames

    def transform(self, X, y=None):
        """Impute and scale ``X`` with fitted statistics; returns an ndarray."""
        if getattr(self, "medians_", None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "NumericalTransformer must be fitted before calling transform")

        df = self._impute(self._frame(X))

        if self.model in (0, 1):
            return self.scaler.transform(df)
        return df.values

In [None]:
# for validation purposes
fs = FeatureSelector(x_train.select_dtypes("int64").columns.to_list())
df = fs.fit_transform(x_train)
df.head()

In [None]:
# for validation purposes
nt = NumericalTransformer(model=2)
df_num = nt.fit_transform(df)
df_num

In [None]:
# return columns name
nt.get_feature_names_out()

####handling float

In [None]:
# transform float features
class FloatTransformer(BaseEstimator, TransformerMixin):
    """Replace zero placeholders in float columns with medians, then scale.

    Zeros in every column are replaced by the median of that column's
    non-zero values, rounded to one decimal (presumably zeros encode missing
    measurements in this dataset -- confirm against the EDA).

    The medians and the scaler are both learned in ``fit`` and only applied in
    ``transform``. Data passed to ``transform`` is therefore imputed and
    scaled with the statistics of the data seen in ``fit``, and the scaler is
    fitted on the same imputed values it later transforms.

    Parameters
    ----------
    model : int, default=0
        0 -> MinMaxScaler, 1 -> StandardScaler, any other value -> no scaling.
    colnames : list of str or None, default=None
        Column labels used to build the working DataFrame; refreshed in
        ``fit`` from the columns actually seen.
    """

    def __init__(self, model=0, colnames=None):
        self.model = model
        self.colnames = colnames
        self.scaler = None

    def _frame(self, X):
        """Build a private DataFrame copy so the caller's data is never mutated."""
        return pd.DataFrame(X, columns=self.colnames, copy=True)

    def _impute(self, df):
        """Replace zeros using the medians learned in ``fit``."""
        for col, value in self.medians_.items():
            if col in df.columns:
                # plain assignment: ``df[col].replace(..., inplace=True)`` is
                # chained assignment and does nothing under copy-on-write
                df[col] = df[col].replace(0, value)
        return df

    def fit(self, X, y=None):
        """Learn the per-column medians and fit the scaler on imputed data."""
        df = self._frame(X)

        self.medians_ = {}
        for col in df.columns:
            median = df.loc[df[col] != 0, col].median()
            # a column with no non-zero value has no median; leave it as is
            if pd.notna(median):
                self.medians_[col] = round(median, 1)

        df = self._impute(df)

        # update columns name
        self.colnames = df.columns.tolist()

        if self.model == 0:
            self.scaler = MinMaxScaler().fit(df)
        elif self.model == 1:
            self.scaler = StandardScaler().fit(df)
        else:
            self.scaler = None
        return self

    # return columns names after transformation
    def get_feature_names_out(self, input_features=None):
        """Return the column names; ``input_features`` is ignored."""
        return self.colnames

    def transform(self, X, y=None):
        """Impute and scale ``X`` with fitted statistics; returns an ndarray."""
        if getattr(self, "medians_", None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "FloatTransformer must be fitted before calling transform")

        df = self._impute(self._frame(X))

        if self.model in (0, 1):
            return self.scaler.transform(df)
        return df.values

In [None]:
fs = FeatureSelector(x_train.select_dtypes("float64").columns.to_list())
df = fs.fit_transform(x_train)
df.head()

In [None]:
nt = FloatTransformer(model=2)
df_num = nt.fit_transform(df)
df_num

In [None]:
# return columns name
nt.get_feature_names_out()

####1.4.5.4 Data Preparation Pipeline

In [None]:
# scaler choice shared by both branches:
# 0 (min-max), 1 (z-score), 2 (without normalization)
numerical_model = 0

# columns routed to each branch, picked by dtype
numerical_features = x_train.select_dtypes("int64").columns.to_list()
float_features = x_train.select_dtypes("float").columns.to_list()

# integer branch: select the int64 columns, then impute/scale them
numerical_steps = [
    ('num_selector', FeatureSelector(numerical_features)),
    ('num_transformer', NumericalTransformer(numerical_model,
                                             colnames=numerical_features)),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# float branch: select the float columns, then impute/scale them
float_steps = [
    ('flt_selector', FeatureSelector(float_features)),
    ('flt_transformer', FloatTransformer(numerical_model,
                                         colnames=float_features)),
]
float_pipeline = Pipeline(steps=float_steps)

# Combine the float and integer pipelines horizontally into one preprocessing
# stage; output columns follow this order (float branch first)
full_pipeline_preprocessing = FeatureUnion(
    transformer_list=[
        ('flt_pipeline', float_pipeline),
        ('num_pipeline', numerical_pipeline),
    ]
)

In [None]:
# for validation purposes
new_data = full_pipeline_preprocessing.fit_transform(x_train)

union_params = full_pipeline_preprocessing.get_params()
# column names reported by the transformer step of each branch (both are lists)
num_names = union_params["num_pipeline"].named_steps["num_transformer"].get_feature_names_out()
flt_names = union_params["flt_pipeline"].named_steps["flt_transformer"].get_feature_names_out()

# the union lists the float branch before the integer branch, so the names
# are concatenated in that same order
df = pd.DataFrame(new_data, columns=flt_names + num_names)
df.head()

##1.6 Hyperparameter tuning

In [None]:
# global seed
seed = 41

In [None]:
# To see all parameters of a specific step of the pipeline
# pipe.named_steps['classifier'].get_params() #or
# pipe.named_steps['full_pipeline'].get_params()

In [None]:
# Search space for the W&B sweep; train() reads the sampled values from wandb.config.
sweep_config = {
    'method': 'random', # search strategy; alternatives: grid, random
    # NOTE(review): 'binary_accuracy' is presumably the training-set metric logged
    # by the Keras callback; 'val_binary_accuracy' may be the intended target -- confirm
    'metric': {
      'name': 'binary_accuracy',
      'goal': 'maximize'   
    },
    'parameters': {
        # width of the first hidden Dense layer
        'layer_1': {
            'max': 512,
            'min': 8,
            'distribution': 'int_uniform',
        },
        # width of the second hidden Dense layer
        'layer_2': {
            'max': 512,
            'min': 8,
            'distribution': 'int_uniform',
        },
        # NOTE(review): with 'log_uniform' W&B presumably treats min/max as natural-log
        # exponents (about 0.018 to 0.135 here), not 1e-4 to 1e-2 -- confirm in the sweep docs
        'learn_rate': {
            'min': -4,
            'max': -2,
            'distribution': 'log_uniform',  
        },
        # number of training epochs
        'epoch': {
            'values': [30,40,60]
        },
        # mini-batch size
        'batch_size': {
            'values': [10,20]
        }
    }
}
# register the sweep; the returned id is what wandb.agent() needs
sweep_id = wandb.sweep(sweep_config, entity="mgoldbarg", project="diabetes_nn")

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
from wandb.keras import WandbCallback
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
def train():
    """Train and evaluate one Keras model for a single W&B sweep trial.

    Reads the notebook globals ``full_pipeline_preprocessing``, ``x_train``,
    ``y_train``, ``x_val``, ``y_val`` and ``logger``. Hyperparameters come
    from ``wandb.config``; ``defaults`` are the fallback values for any key
    the sweep does not set. Accuracy, precision, recall and F1 on the
    validation split are written to the run summary. Returns nothing.
    """
    defaults = dict(layer_1=16,
                    layer_2=128,
                    learn_rate=0.01,
                    batch_size=10,
                    epoch=300)

    # A single init per trial: the defaults are passed here rather than
    # through a second, nested wandb.init() inside the already-open run.
    with wandb.init(project="diabetes_nn", config=defaults) as run:
        config = wandb.config

        # The full pipeline
        pipe = Pipeline(steps=[('full_pipeline', full_pipeline_preprocessing)])

        # training
        logger.info("Training")
        train_x = pipe.fit_transform(x_train, y_train)
        # transform only: the validation split must be prepared with the
        # statistics learned from the training split, not refitted on itself
        val_x = pipe.transform(x_val)

        l2 = tf.keras.regularizers.l2
        model = tf.keras.Sequential([
            # input width follows the preprocessing output
            keras.layers.InputLayer(input_shape=(train_x.shape[1],)),
            keras.layers.Dense(config.layer_1, name="dense1",
                               kernel_initializer='he_uniform',
                               kernel_regularizer=l2(0.001)),
            keras.layers.BatchNormalization(name="bn1"),
            keras.layers.ReLU(),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(config.layer_2, name="dense2",
                               kernel_initializer='he_uniform',
                               kernel_regularizer=l2(0.001)),
            keras.layers.BatchNormalization(name="bn2"),
            keras.layers.ReLU(),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(1, name="dense5", activation='sigmoid'),
        ])

        # binary cross-entropy on the sigmoid output
        loss = tf.keras.losses.BinaryCrossentropy()

        # Instantiate an accuracy metric.
        accuracy = tf.keras.metrics.BinaryAccuracy()

        # Instantiate an optimizer.
        optimizer = tf.keras.optimizers.SGD(learning_rate=config.learn_rate,
                                            momentum=0.9)

        # configure the optimizer, loss, and metrics to monitor.
        model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])

        # NOTE(review): patience=100 exceeds every epoch count in the sweep
        # (30/40/60), so early stopping can only trigger with the default
        # epoch=300 -- confirm this is intended.
        model.fit(train_x, y_train,
                  batch_size=config.batch_size,
                  epochs=config.epoch,
                  validation_data=(val_x, y_val),
                  callbacks=[WandbCallback(verbose=1,
                                           monitor="val_binary_accuracy",
                                           mode="max"),
                             EarlyStopping(patience=100)])

        # predict
        logger.info("Infering")
        p = model.predict(val_x)
        # Threshold the probabilities into flat 0/1 labels. `p > 0.5` matches
        # the previous rounding (0.5 itself rounds to 0) without relying on
        # np.round_, which NumPy 2.0 removed.
        predict = (p > 0.5).astype(int).ravel()

        # Evaluation Metrics
        logger.info("Evaluation metrics")
        fbeta = fbeta_score(y_val, predict, beta=1, zero_division=1)
        precision = precision_score(y_val, predict, zero_division=1)
        recall = recall_score(y_val, predict, zero_division=1)
        acc = accuracy_score(y_val, predict)

        logger.info("Accuracy: {}".format(acc))
        logger.info("Precision: {}".format(precision))
        logger.info("Recall: {}".format(recall))
        logger.info("F1: {}".format(fbeta))

        run.summary["Accuracy"] = acc
        run.summary["Precision"] = precision
        run.summary["Recall"] = recall
        run.summary["F1"] = fbeta

In [None]:
# To see all parameters of the preprocessing stage.
# The pipeline built inside train() is local to that function and has no
# 'classifier' step, so the shared preprocessing object is inspected instead.
full_pipeline_preprocessing.get_params()

In [None]:
# Initialize a new sweep
# Arguments:
#     – sweep_id: the sweep_id to run - this was returned above by wandb.sweep()
#     – function: function that defines your model architecture and trains it

wandb.agent(sweep_id = sweep_id, function=train,count=5)

In [None]:
run.finish()

##1.7 Restore a model

In [None]:
%%capture
!pip install wandb==0.10.17

In [None]:
!pip install wandb

In [None]:
 import wandb
 wandb.__version__

In [None]:
!wandb login

In [None]:
# restore the raw model file "model-best.h5" from a specific run by user "mgoldbarg"
# in project "diabetes_nn" from run "ky8ntlvi"
best_model = wandb.restore('model-best.h5', run_path="mgoldbarg/diabetes_nn/ky8ntlvi")

In [None]:
# restore the model for tf.keras
model = tf.keras.models.load_model(best_model.name)

In [None]:
model.summary()

In [None]:
y_val

In [None]:
pipe = Pipeline(steps = [('full_pipeline', full_pipeline_preprocessing)])

# learn the preprocessing statistics from the training split only, then apply
# them to the validation split (refitting on x_val would prepare the data
# differently from what the model saw during training)
pipe.fit(x_train, y_train)
val_x = pipe.transform(x_val)

# loss and accuracy of the restored model on the validation split
loss_, acc_ = model.evaluate(x=val_x,y=y_val)
print('Validation loss: %.3f - acc: %.3f' % (loss_, acc_))

##1.7 Configure and train the best model

In [None]:
# Note that a new run is yet synced to last sweep run
# Just to check
run = wandb.init()

In [None]:
# Preprocessing stage followed by a decision-tree classifier
pipe = Pipeline(
    steps=[
        ('full_pipeline', full_pipeline_preprocessing),
        ("classifier", DecisionTreeClassifier()),
    ]
)

# hyperparameters for this model, applied in a single call
pipe.set_params(**{
    "full_pipeline__num_pipeline__num_transformer__model": 0,
    "classifier__criterion": 'gini',
    "classifier__splitter": 'random',
    "classifier__max_depth": 4,
    "classifier__random_state": 41,
})

# fit on the training split
logger.info("Training")
pipe.fit(x_train, y_train)

# predict the validation split
logger.info("Infering")
predict = pipe.predict(x_val)

# score the predictions against the validation target
logger.info("Evaluation metrics")
fbeta = fbeta_score(y_val, predict, beta=1, zero_division=1)
precision = precision_score(y_val, predict, zero_division=1)
recall = recall_score(y_val, predict, zero_division=1)
acc = accuracy_score(y_val, predict)

logger.info("Accuracy: {}".format(acc))
logger.info("Precision: {}".format(precision))
logger.info("Recall: {}".format(recall))
logger.info("F1: {}".format(fbeta))

# store the scores on the run summary
for summary_key, score in (("Acc", acc),
                           ("Precision", precision),
                           ("Recall", recall),
                           ("F1", fbeta)):
    run.summary[summary_key] = score

In [None]:
# float_names is a list
flt_names = full_pipeline_preprocessing.get_params()["flt_pipeline"][1].get_feature_names_out()
flt_names

In [None]:
# Get numerical column names
num_names = pipe.named_steps['full_pipeline'].get_params()["num_pipeline"][1].get_feature_names_out()
num_names

In [None]:
# The preprocessing union only has the "flt_pipeline" and "num_pipeline"
# branches; looking up "cat_pipeline" raises KeyError. With no categorical
# branch there are no categorical feature names.
cat_names = []

In [None]:
# merge all column names together, in the order the preprocessing union emits
# its columns: the float branch is listed before the integer branch, so the
# float names must come first for names and columns to line up
all_names = flt_names + num_names
all_names

In [None]:
# Visualize all classifier plots
# For a complete documentation please see: https://docs.wandb.ai/guides/integrations/scikit
wandb.sklearn.plot_classifier(pipe.get_params()["classifier"],
                              full_pipeline_preprocessing.transform(x_train),
                              full_pipeline_preprocessing.transform(x_val),
                              y_train,
                              y_val,
                              predict,
                              pipe.predict_proba(x_val),
                              [0,1],
                              model_name='BestModel', feature_names=all_names)

##1.8 Export the best model

In [None]:
# types and names of the artifacts
artifact_type = "inference_artifact"
artifact_encoder = "target_encoder"
artifact_model = "model_export"

In [None]:
logger.info("Dumping the artifacts to disk")
# Save the model using joblib
joblib.dump(pipe, artifact_model)

# Save the target encoder using joblib; the target-encoder artifact cell
# below adds this file, so it has to exist on disk
joblib.dump(le, artifact_encoder)

In [None]:
# Model artifact: the fitted `pipe` dumped to disk under `artifact_model`.
# The description names the estimator actually exported (pipe ends in a
# DecisionTreeClassifier, not a neural network).
artifact = wandb.Artifact(artifact_model,
                          type=artifact_type,
                          description="A full pipeline composed of a Preprocessing Stage and a Decision Tree model"
                          )

logger.info("Logging model artifact")
artifact.add_file(artifact_model)
run.log_artifact(artifact)

In [None]:
# Target encoder artifact
artifact = wandb.Artifact(artifact_encoder,
                          type=artifact_type,
                          description="The encoder used to encode the target variable"
                          )

logger.info("Logging target enconder artifact")
artifact.add_file(artifact_encoder)
run.log_artifact(artifact)

In [None]:
run.finish()