# EDA & Modelling

## Package loading

In [None]:
from EZS_deps.EZS_func import *
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import re
import math
import keras
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.inspection import permutation_importance
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.gaussian_process.kernels import ConstantKernel
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn import set_config
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import GaussianNB
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping
from keras.layers import Dense
from keras.layers import BatchNormalization
from keras.layers import LayerNormalization
from keras.layers import Dropout
from keras.models import Sequential
from keras.utils import np_utils
from keras.utils.vis_utils import plot_model
from pandas.api.types import is_numeric_dtype
from itertools import product
from joblib import dump
from scipy import stats
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer


## Exploratory Data Analysis 

### File and parameters loading

In [None]:
# Kind of learning task; not referenced again in the cells of this notebook.
problem_type = 'classification'

In [None]:
# Dataset size label; not referenced again in the cells of this notebook.
data_size = 'small'

In [None]:
# Load the raw dataset into a DataFrame.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df = pd.read_csv('/home/philippe/development/python/EZStacking/dataset/iris.csv')

In [None]:
# Name of the label column: used to separate y from X and passed to
# encoding() and correlated_columns().
target_col = 'variety'

#### Thresholds

In [None]:
# Passed to drop_na(); presumably the NaN ratio above which a column is dropped — TODO confirm.
threshold_NaN = 0.5

In [None]:
# Passed to encoding(); presumably a cardinality limit for treating a column as categorical — TODO confirm.
threshold_cat = 5

In [None]:
# Passed to outliers(); presumably a Z-score cut-off — TODO confirm.
threshold_Z = 3.0

In [None]:
# Forwarded to split() as test_size (fraction of rows held out for testing, presumably).
test_size = 0.33

In [None]:
# Forwarded to split(); its exact role (class-balance check, presumably) is not visible here.
threshold_entropy = 0.75

In [None]:
# Forwarded to split(); switches undersampling on or off there.
undersampling = False

In [None]:
# Forwarded to split(); names the undersampling method (accepted values are defined by split()).
undersampler = 'Random'

In [None]:
# Correlation threshold: passed to correlated_columns() during EDA and to the
# Decorrelator step of both preprocessing pipelines.
threshold_corr = 0.95

In [None]:
# Consumed by model_filtering(); assigned again (same value) in the "Final Model" section.
threshold_model = 5

In [None]:
# Consumed by model_filtering(); assigned again (same value) in the "Final Model" section.
threshold_score = 0.7

In [None]:
# Consumed by feature_filtering() when separating best and worst features.
threshold_feature = 5

In [None]:
# Flag passed to plot_perm_importance() and plot_partial_dependence();
# presumably controls CPU parallelism there — TODO confirm.
CPU = False

In [None]:
# Tag passed to plot_model_importance(); presumably tells it how to read importances
# from the final estimator (a LogisticRegression in this notebook) — TODO confirm.
level_1_model = 'regression'

#### Drop user-specified columns:

In [None]:
# Columns the user wants removed before any analysis; empty means none.
user_drop_cols = []

Dataset before deletion

In [None]:
# Show the dataframe as loaded, before user-specified columns are removed.
display(df)

In [None]:
# Remove the user-specified columns (a no-op copy when the list is empty).
df = df.drop(columns=user_drop_cols)

Dataset after deletion

In [None]:
# Show the dataframe once the user-specified columns have been removed.
display(df)

#### Dataset copy

In [None]:
# Keep an independent backup of the dataframe before the cleaning steps modify df.
df_copy = df.copy(deep=True)

### Dataset Information

In [None]:
# (rows, columns) of the dataframe, shown as the cell output.
df.shape

#### Some records

In [None]:
# Render the dataframe so a sample of records can be inspected.
display(df)

#### Dataframe structure

In [None]:
# Helper not defined in this notebook (presumably from the EZS_func star import);
# visualises the structure of df.
plot_dataframe_structure(df)

#### Dataframe statistics

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
display(df.describe().transpose())

#### Which columns could be categorical?

In [None]:
# Helper not defined in this notebook; presumably plots per-column cardinality
# to help spot categorical candidates — TODO confirm.
plot_categorical(df)

### Dataset Cleaning

#### Duplicate rows:

In [None]:
# Helper not defined in this notebook; its return value is not captured, so df is
# only changed here if the helper mutates it in place — TODO confirm.
duplicates(df)

#### Drop NaN:

In [None]:
# drop_na() (not defined in this notebook) returns the updated dataframe and the
# columns it removed, using threshold_NaN as its criterion.
df, drop_cols = drop_na(df, threshold_NaN)

Set of dropped columns (NaN threshold and user-specified):

In [None]:
# Sorted, de-duplicated union of the columns removed by drop_na() and by the user.
dropped_cols = np.unique(np.concatenate((drop_cols, user_drop_cols)))

In [None]:
# Show the columns dropped so far.
display(dropped_cols)

#### Encoding data:

In [None]:
# encoding() (not defined in this notebook) returns the encoded dataframe and the
# columns it encoded; encoded_cols is not used again in the cells of this notebook.
df, encoded_cols = encoding(df, threshold_cat, target_col)

#### Imputing NaN using IterativeImputer

##### Imputation

In [None]:
# imputation() is not defined in this notebook; per the section heading it fills
# NaN with IterativeImputer — TODO confirm against the helper.
df = imputation(df)

#### Data compression:

In [None]:
# downcast_dtypes() is not defined in this notebook; presumably converts columns
# to smaller dtypes to reduce memory — TODO confirm.
df = downcast_dtypes(df)

##### Dataframe structure after compression

In [None]:
# Same structure plot as earlier, now on the dataframe returned by downcast_dtypes().
plot_dataframe_structure(df)

#### Outliers:

In [None]:
# outliers() is not defined in this notebook; it receives threshold_Z and returns
# the dataframe to keep (presumably with outlying rows handled) — TODO confirm.
df = outliers(df, threshold_Z)

#### Correlation

In [None]:
# Pairwise correlation matrix of the dataframe after the cleaning steps above.
corr = df.corr() 
# Last expression of the cell: a Styler that shades cells with the 'coolwarm' colormap.
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# correlated_columns() is not defined in this notebook; presumably it returns the
# features whose correlation exceeds threshold_corr (target excluded) — TODO confirm.
correlated_features = correlated_columns(df, threshold_corr, target_col)
# Extend the running list instead of rebuilding it from drop_cols, so the
# user-specified columns merged in earlier are not discarded.
dropped_cols = np.unique(np.concatenate((dropped_cols, correlated_features)))

#### Check columns that should be dropped

In [None]:
# Print the columns flagged for removal so far.
print(dropped_cols)

## Splitting

In [None]:
# Go back to the dataset saved before the cleaning steps (modelling preprocesses
# data inside its own pipelines). Take a copy rather than an alias so that later
# in-place changes to df cannot alter the df_copy backup.
df = df_copy.copy()

#### Splitting the dataframe into features and target

In [None]:
# Target: the label column as a Series.
y = df[target_col]

In [None]:
# Features: every column except the label column.
X = df.drop(columns=target_col)

##### Dimensions

In [None]:
# Number of feature columns.
nb_features = X.shape[1]

In [None]:
# Number of distinct label values (i.e. classes); sizes the softmax layer in K_Class.
nb_targets = len(y.unique())

In [None]:
# Hidden-layer width: used directly by the Keras Dense layer and multiplied by
# 10 / 20 for the two MLPClassifier variants.
layer_size = nb_features + nb_targets + 2

##### Splitting data into train and test sets

In [None]:
# Train/test partition delegated to split() (not defined in this notebook), which
# also receives the entropy threshold and the undersampling settings.
X_train, X_test, y_train, y_test = split(
    X,
    y,
    test_size=test_size,
    threshold_entropy=threshold_entropy,
    undersampling=undersampling,
    undersampler=undersampler,
)

##### Encoding

In [None]:
# Learn the label -> integer mapping from the training labels only, then encode
# both label sets; each result is wrapped in a new Series with a default index.
target_encoder = LabelEncoder().fit(y_train)
y_train = pd.Series(target_encoder.transform(y_train))
y_test = pd.Series(target_encoder.transform(y_test))

## Modelling

### Model building

#### Pipeline building

##### Select the categorical and numerical columns

In [None]:
# Column selector matching columns of object dtype (treated as categorical).
cat_selector = make_column_selector(dtype_include=object)

In [None]:
# Column selector matching columns with a numeric dtype.
num_selector = make_column_selector(dtype_include=np.number)

##### For tree-based models

In [None]:
# Categorical branch for tree models: fill gaps with the most frequent value,
# then ordinal-encode; categories unseen at fit time are encoded as -1.
cat_tree_processor = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OrdinalEncoder(
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    ),
)

In [None]:
# Numeric branch for tree models: iterative imputation only (no scaling step);
# add_indicator=True appends missing-value indicator columns.
num_tree_processor = make_pipeline(IterativeImputer(random_state=0, add_indicator=True))

In [None]:
# Full preprocessing for tree models: route numeric and categorical columns to
# their branches, then apply Decorrelator (not defined in this notebook) with
# the correlation threshold.
tree_preprocessor = make_pipeline(
    make_column_transformer(
        (num_tree_processor, num_selector),
        (cat_tree_processor, cat_selector),
    ),
    Decorrelator(threshold_corr),
)

##### For non-tree-based models

In [None]:
# Categorical branch for non-tree models: most-frequent imputation, then dense
# one-hot encoding; categories unseen at fit time are ignored.
# NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2 —
# confirm against the installed version.
cat_ntree_processor = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore', sparse=False))

In [None]:
# Numeric branch for non-tree models: iterative imputation (with missing-value
# indicator columns) followed by standardisation.
num_ntree_processor = make_pipeline(IterativeImputer(random_state=0, add_indicator=True), StandardScaler())

In [None]:
# Full preprocessing for non-tree models: route numeric and categorical columns
# to their branches, then apply Decorrelator (not defined in this notebook) with
# the correlation threshold.
ntree_preprocessor = make_pipeline(
    make_column_transformer(
        (num_ntree_processor, num_selector),
        (cat_ntree_processor, cat_selector),
    ),
    Decorrelator(threshold_corr),
)

##### Keras neural network definition

In [None]:
def K_Class():
    """Build and compile a fresh Keras classification network.

    The Keras backend session is cleared first. The network stacks
    BatchNormalization -> Dense(layer_size, selu) -> BatchNormalization ->
    Dropout(0.2) -> Dense(nb_targets, softmax) and is compiled with the
    categorical cross-entropy loss, the Adam optimizer and the accuracy
    metric. ``layer_size`` and ``nb_targets`` are read from the notebook's
    global namespace.

    Returns:
        The compiled ``Sequential`` model.
    """
    keras.backend.clear_session()
    # neural network architecture: start
    network = Sequential([
        BatchNormalization(),
        Dense(layer_size, activation='selu'),
        # A LayerNormalization() layer at this position is currently disabled.
        BatchNormalization(),
        Dropout(0.2),
        Dense(nb_targets, activation='softmax'),
    ])
    # neural network architecture: end
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network


##### Early Stopping

In [None]:
# Stop Keras training once val_loss has failed to improve for 20 epochs in a row.
es = EarlyStopping(monitor='val_loss', mode='auto', verbose=1, patience=20)

In [None]:
# scikit-learn wrapper around K_Class: batches of 64, at most 2000 epochs, with 10%
# of the training data held out to produce the val_loss watched by `es`.
K_C = KerasClassifier(K_Class, batch_size=64, epochs=2000, callbacks=[es], validation_split=0.1, verbose=1) 
# Set by hand; presumably so scikit-learn's stacking code recognises the wrapper
# as a classifier — TODO confirm.
K_C._estimator_type = 'classifier'

##### Level-0 models

In [None]:
# Level-0 (base) estimators of the stack as (name, pipeline) pairs. Each pipeline
# starts with the preprocessor suited to its model family: tree_preprocessor for
# tree-based models, ntree_preprocessor for everything else.
level_0 = [
    # Gaussian processes: dot-product, RBF and rational-quadratic kernels.
    ('GPCL', make_pipeline(
        ntree_preprocessor,
        GaussianProcessClassifier(kernel=ConstantKernel() * DotProduct() + ConstantKernel() + WhiteKernel()),
    )),
    ('GPCR', make_pipeline(
        ntree_preprocessor,
        GaussianProcessClassifier(kernel=ConstantKernel() * RBF() + ConstantKernel() + WhiteKernel()),
    )),
    ('GPCQ', make_pipeline(
        ntree_preprocessor,
        GaussianProcessClassifier(kernel=ConstantKernel() * RationalQuadratic() + ConstantKernel() + WhiteKernel()),
    )),
    # Single decision trees: Gini and entropy criteria.
    ('DTCG', make_pipeline(tree_preprocessor, DecisionTreeClassifier(criterion='gini'))),
    ('DTCE', make_pipeline(tree_preprocessor, DecisionTreeClassifier(criterion='entropy'))),
    # Random forests of 50 trees: Gini and entropy criteria.
    ('RFCG', make_pipeline(
        tree_preprocessor,
        RandomForestClassifier(criterion='gini', n_estimators=50),
    )),
    ('RFCE', make_pipeline(
        tree_preprocessor,
        RandomForestClassifier(criterion='entropy', n_estimators=50),
    )),
    # AdaBoost with default settings.
    ('ABC', make_pipeline(tree_preprocessor, AdaBoostClassifier())),
    # Logistic regressions: plain and cross-validated, lbfgs and saga solvers.
    ('LOGRL', make_pipeline(
        ntree_preprocessor,
        LogisticRegression(solver='lbfgs', penalty='l2'),
    )),
    ('LOGRS', make_pipeline(
        ntree_preprocessor,
        LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.15),
    )),
    ('LOGRLCV', make_pipeline(
        ntree_preprocessor,
        LogisticRegressionCV(cv=5, solver='lbfgs', penalty='l2'),
    )),
    ('LOGRSCV', make_pipeline(
        ntree_preprocessor,
        LogisticRegressionCV(cv=5, solver='saga', penalty='l2'),
    )),
    # Multi-layer perceptrons: two hidden layers of 10x / 20x layer_size units.
    ('MLPC10', make_pipeline(
        ntree_preprocessor,
        MLPClassifier(hidden_layer_sizes=(10 * layer_size,) * 2, max_iter=1000, early_stopping=True),
    )),
    ('MLPC20', make_pipeline(
        ntree_preprocessor,
        MLPClassifier(hidden_layer_sizes=(20 * layer_size,) * 2, max_iter=1000, early_stopping=True),
    )),
    # k-nearest neighbours with k = number of classes: uniform and distance weights.
    ('KNCU', make_pipeline(
        ntree_preprocessor,
        KNeighborsClassifier(weights='uniform', n_neighbors=len(y.unique())),
    )),
    ('KNCD', make_pipeline(
        ntree_preprocessor,
        KNeighborsClassifier(weights='distance', n_neighbors=len(y.unique())),
    )),
    # Gaussian naive Bayes.
    ('GNB', make_pipeline(ntree_preprocessor, GaussianNB())),
    # Keras network wrapped as a scikit-learn classifier.
    ('KERC', make_pipeline(ntree_preprocessor, K_C)),
]

##### Level-1 model

In [None]:
# Final (level-1) estimator of the stack: logistic regression with default settings.
level_1 = LogisticRegression()

##### Stacking for classification

In [None]:
# Stack all level-0 pipelines under the level-1 estimator.
model = StackingClassifier(estimators=level_0, final_estimator=level_1)

### Model fitting

In [None]:
%%time 
# Render estimators as diagrams, then fit the full stack on the training set;
# the fitted model is the cell's last expression.
set_config(display='diagram') 
model.fit(X_train, y_train)

### Model evaluation

#### Model scoring

In [None]:
# score_stacking() is not defined in this notebook. score_stack is passed to
# model_filtering() later; mod_imp_score is not used again in these cells.
score_stack, mod_imp_score = score_stacking(model, X_train, y_train, X_test, y_test)

#### Model importance

In [None]:
# Importance of each level-0 model as reported by plot_model_importance() (not
# defined in this notebook); the result is passed to model_filtering() later.
model_imp = plot_model_importance(model, level_1_model)

#### Feature permutation importance (a posteriori)

In [None]:
# Permutation importance of the input features for the full stack on the test set;
# the helper's return value is not captured here (it is for the final model).
plot_perm_importance(model, X_test, y_test, CPU)

#### Partial Dependence & Individual Conditional Expectation 

##### Features of interest

In [None]:
# Features to pass to plot_partial_dependence(); left empty here — how the helper
# treats an empty list is not visible in this notebook.
features_of_interest = []

In [None]:
# Partial dependence / ICE plots on the test set, delegated to a helper that is
# not defined in this notebook.
plot_partial_dependence(model, X_test, features_of_interest, CPU)

#### Classification report

In [None]:
# Classification report for the full stack; the helper (not defined in this
# notebook) receives both the train and the test sets.
K_classification_report(model, X_train, y_train, X_test, y_test)

#### Confusion matrix

In [None]:
# Confusion matrix for the full stack; the helper (not defined in this notebook)
# receives both the train and the test sets.
K_confusion_matrix(model, X_train, y_train, X_test, y_test)

### Final Model

In [None]:
# Same value as in the Thresholds section; repeated here, presumably so it can be
# adjusted just before model filtering.
threshold_model = 5

In [None]:
# Same value as in the Thresholds section; repeated here, presumably so it can be
# adjusted just before model filtering.
threshold_score = 0.7

##### Filtered Level-0 models

In [None]:
# Reduced list of level-0 models chosen by model_filtering() (not defined in this
# notebook) from the model importances, the scores and the two thresholds.
level_0_f = model_filtering(level_0, model_imp, threshold_model, score_stack, threshold_score)

##### Final Level-1 model

In [None]:
# New, unfitted final estimator for the filtered stack (rebinds level_1).
level_1 = LogisticRegression()

##### Build final model

In [None]:
# Rebuild the stack from the filtered level-0 models (rebinds model).
model = StackingClassifier(estimators=level_0_f, final_estimator=level_1)

### Final Model fitting

In [None]:
%%time 
# Render estimators as diagrams, then fit the filtered stack on the training set;
# the fitted model is the cell's last expression.
set_config(display='diagram') 
model.fit(X_train, y_train)

#### Final Model scoring

In [None]:
# Score the final stack; the returned values are shown as cell output, not stored.
score_stacking(model, X_train, y_train, X_test, y_test)

#### Final feature permutation importance

In [None]:
# Permutation importance of the input features for the final stack on the test
# set; the result feeds feature_filtering() below.
feature_importance = plot_perm_importance(model, X_test, y_test, CPU)

##### Final feature elimination

In [None]:
# Split features into best and worst using threshold_feature (helper not defined in
# this notebook). Only worst_feature is used afterwards.
best_feature, worst_feature = feature_filtering(feature_importance, threshold_feature)

In [None]:
# Add the worst features to the columns already flagged, de-duplicate and sort,
# then convert the array to a plain Python list.
dropped_cols = np.unique(np.concatenate((dropped_cols, worst_feature))).tolist()

#### Check these columns; they should be dropped

In [None]:
# Final list of columns suggested for removal.
print(dropped_cols)

#### Final Classification report

In [None]:
# Classification report for the final (filtered) stack.
K_classification_report(model, X_train, y_train, X_test, y_test)

#### Final Confusion matrix

In [None]:
# Confusion matrix for the final (filtered) stack.
K_confusion_matrix(model, X_train, y_train, X_test, y_test)

## Deployment

### Save your model

In [None]:
# Persist the fitted final stack with joblib; the path is relative to the
# notebook's working directory.
model_name = 'model.sav' 
dump(model, model_name)

### Generate your server

In [None]:
# Generate the API server for the saved model (helper not defined in this
# notebook). Pass model_name rather than repeating the literal so the server
# always points at the file written by dump() in the previous cell.
fastapi_server(model, model_name, X, y)

### [Test your API](./server.ipynb)