# EDA & Modelling

## Package loading

In [None]:
from EZS_func import *
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import re
import math
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.inspection import permutation_importance
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.gaussian_process.kernels import ConstantKernel
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from pandas.api.types import is_numeric_dtype
from itertools import product
from joblib import dump
from scipy import stats
from sklearn import set_config
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from yellowbrick.model_selection import learning_curve
from yellowbrick.model_selection import feature_importances
from yellowbrick.features import rank1d
from yellowbrick.features import rank2d
from yellowbrick.contrib.missing import MissingValuesBar
from yellowbrick.contrib.missing import MissingValuesDispersion
from yellowbrick.target.feature_correlation import feature_correlation
from yellowbrick.regressor import prediction_error
from yellowbrick.regressor import residuals_plot


## Exploratory Data Analysis 

### File and parameters loading

In [None]:
# Kind of learning task; every model built further down is a regressor.
# Not read again in the visible cells.
problem_type = 'regression'

In [None]:
# Stacking switch, stored as the string 'True' rather than a bool.
# NOTE(review): not read again in the visible cells; if a helper tests it for
# truthiness, the string 'False' would be truthy too — confirm how it is used.
stacking = 'True'

In [None]:
# Dataset size label; not read again in the visible cells
# (presumably it guided which level-0 models were generated — TODO confirm).
data_size = 'small'

In [None]:
# Load the raw dataset into a DataFrame.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df = pd.read_csv('/home/philippe/Documents/python project/EZStacking-v0.13/dataset/concrete_data.csv')

In [None]:
# Name of the column to predict; used below to split df into X (features) and y (target).
target_col = 'Strength'

#### Thresholds

In [None]:
# Passed to drop_na() during cleaning; presumably the tolerated share of NaN
# per column — confirm in the helper.
threshold_NaN = 0.5

In [None]:
# Passed to encoding(); presumably the cardinality limit below which a column
# is treated as categorical — confirm in the helper.
threshold_cat = 5

In [None]:
# Passed to outliers(); presumably a Z-score cutoff — confirm in the helper.
threshold_Z = 3.0

In [None]:
# Passed to every Decorrelator step of the preprocessing pipelines;
# presumably an absolute-correlation cutoff — confirm in the helper.
threshold_corr = 0.95

In [None]:
# Passed to model_filtering() when the final stack is built; it is assigned
# again (same value) just before that call.
threshold_model = 5

In [None]:
# Passed to model_filtering() when the final stack is built; it is assigned
# again (same value) just before that call.
threshold_score = 0.5

#### Drop user-specified columns:

In [None]:
# Columns the user wants removed up front; an empty list keeps every column.
user_drop_cols = []

Dataset before deletion

In [None]:
# Show the dataframe as loaded, before the user-selected columns are dropped.
display(df)

In [None]:
# Remove the user-selected columns; drop() returns a new frame, so df is rebound.
df = df.drop(columns=user_drop_cols)

Dataset after deletion

In [None]:
# Show the dataframe again, now without the user-selected columns.
display(df)

#### Dataset copy

In [None]:
# Keep an untouched snapshot: the cleaning cells below reassign df, and df is
# restored from this copy before the modelling split.
df_copy = df.copy()

### Dataset Information

In [None]:
# (rows, columns) of the dataframe.
df.shape

#### Some records

In [None]:
# Render the dataframe so its records can be inspected.
display(df)

#### Dataframe structure

In [None]:
# Helper not defined in this file (presumably from EZS_func); judging by its
# name it plots the dataframe's column structure — confirm.
plot_dataframe_structure(df)

#### Dataframe statistics

In [None]:
# Summary statistics, transposed so that each dataframe column becomes a row.
display(df.describe().transpose())

#### Which columns could be categorical?

In [None]:
# Helper not defined in this file (presumably from EZS_func); per the heading
# it helps spot columns that could be categorical — confirm.
plot_categorical(df)

### Dataset Cleaning

#### Duplicate rows:

In [None]:
# Helper not defined in this file. Its return value is not captured here, so
# whether df is modified in place is not visible from this cell — confirm.
duplicates(df)

#### Drop NaN:

In [None]:
# drop_na() returns the cleaned dataframe plus a second value kept as
# drop_cols (presumably the names of the columns it removed — confirm).
# threshold_NaN controls the removal; the exact rule lives in the helper.
df, drop_cols = drop_na(df, threshold_NaN)

Final set of dropped columns

In [None]:
# Merge the columns reported by drop_na() with the user-supplied ones into a
# sorted array without duplicates.
dropped_cols = np.unique(np.concatenate((drop_cols, user_drop_cols)))

In [None]:
# Show the combined set of dropped columns.
display(dropped_cols)

#### Encoding data:

In [None]:
# encoding() returns the transformed dataframe plus a second value kept as
# encoded_cols (not used again in the visible cells). Helper not defined here.
df, encoded_cols = encoding(df, threshold_cat, target_col)

#### Imputing NaN using IterativeImputer

In [None]:
# Yellowbrick bar chart of missing values, restricted to the numeric columns.
numeric_df = df.select_dtypes(include=np.number)
visualizer = MissingValuesBar(features=numeric_df.columns.tolist())
visualizer.fit(numeric_df)
visualizer.show();

##### Imputation

In [None]:
# Replace df with the output of imputation(); per the section heading this
# relies on IterativeImputer — the helper itself is not shown in this file.
df = imputation(df)

#### Data compression:

In [None]:
# Replace df with the output of downcast_dtypes(); presumably casts columns
# to smaller dtypes to save memory — confirm in the helper.
df = downcast_dtypes(df)

##### Dataframe structure after compression

In [None]:
# Same structure plot as earlier, now on the dataframe returned by downcast_dtypes().
plot_dataframe_structure(df)

#### Outliers:

In [None]:
# Replace df with the output of outliers(); threshold_Z is presumably a
# Z-score cutoff. Whether rows are removed or altered is defined in the helper.
df = outliers(df, threshold_Z)

#### Splitting the dataframe into features and target

In [None]:
# Target series from the cleaned dataframe; only the plots below use it,
# since X and y are rebuilt from the raw snapshot before modelling.
y = df[target_col]

In [None]:
# Feature matrix for the plots: every column of the cleaned dataframe except the target.
X = df.drop(columns=target_col)

### Plots

#### Ranking 

##### Ranking 1D 

In [None]:
# Yellowbrick one-dimensional feature ranking with its default algorithm, run
# on the cleaned dataframe (target column included). The trailing ';' keeps
# the notebook from echoing the return value.
rank1d(df);

##### Ranking 2D 

###### Ranking 2D according to Pearson

In [None]:
# Yellowbrick pairwise feature ranking using Pearson correlation.
rank2d(df, algorithm='pearson');

###### Ranking 2D based on covariance

In [None]:
# Yellowbrick pairwise feature ranking using covariance.
rank2d(df, algorithm='covariance');

###### Ranking 2D according to Spearman

In [None]:
# Yellowbrick pairwise feature ranking using Spearman rank correlation.
rank2d(df, algorithm='spearman');

###### Ranking 2D according to Kendall's tau

In [None]:
# Yellowbrick pairwise feature ranking using Kendall's tau.
rank2d(df, algorithm='kendalltau');

#### Correlation

##### Correlation with Yellowbrick

In [None]:
# Yellowbrick feature/target correlation plot with the default method.
feature_correlation(X, y);

In [None]:
# Same plot, scored with mutual information for a continuous target.
feature_correlation(X, y, method='mutual_info-regression');

#### Feature importance (a priori)

##### According to decision tree

In [None]:
# Feature importances from an untuned decision tree fitted on the cleaned data.
# No random_state is set, so repeated runs may differ slightly.
feature_importances(DecisionTreeRegressor(), X, y);

##### According to elasticnet regression

In [None]:
# Feature importances from an ElasticNet (alpha=0.01, equal L1/L2 mix)
# fitted on the cleaned data.
feature_importances(ElasticNet(alpha=0.01, l1_ratio=0.5), X, y);

#### Check columns that should be dropped

In [None]:
# Reminder of the columns flagged for removal during cleaning.
# NOTE(review): the raw snapshot used for modelling was taken before drop_na(),
# so these columns are still in it; presumably the user is expected to add
# them to user_drop_cols and rerun — confirm.
print(dropped_cols)

## Splitting

In [None]:
# Go back to the raw snapshot taken before cleaning: from here on imputation,
# encoding and scaling happen inside the model pipelines.
# This only rebinds the name, so df and df_copy now refer to the same object.
df = df_copy

#### Splitting the dataframe into features and target

In [None]:
# Target series taken from the restored (uncleaned) dataframe.
y = df[target_col]

In [None]:
# Feature matrix for modelling: every column of the restored dataframe except the target.
X = df.drop(columns=target_col)

##### Dimensions

In [None]:
# Number of feature columns in X.
nb_features = X.shape[1]

In [None]:
# A single target column is predicted.
nb_targets = 1

In [None]:
# Width of each of the two hidden layers of the MLPRegressor used as a level-0 model.
layer_size = nb_features + nb_targets + 2

##### Splitting data into train and test sets

In [None]:
# Hold out 33% of the rows for testing via the split() helper (not defined in
# this file); whether the split is shuffled or seeded is decided there.
X_train, X_test, y_train, y_test = split(X, y, test_size=0.33)

## Modelling

### Model building

#### Pipeline building

##### Select the categorical and numerical columns

In [None]:
# Selects the columns whose dtype is object; they feed the categorical branches.
cat_selector = make_column_selector(dtype_include=object)

In [None]:
# Selects the numeric columns; they feed the numerical branches.
num_selector = make_column_selector(dtype_include=np.number)

##### For tree-based models

In [None]:
# Categorical branch for the tree-based models: fill gaps with the most
# frequent value, map categories to integers (unseen categories become -1),
# then apply Decorrelator with threshold_corr (class not defined in this file).
cat_tree_processor = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    Decorrelator(threshold_corr),
)

In [None]:
# Numerical branch for the tree-based models: mean imputation with extra
# missing-indicator columns, then Decorrelator with threshold_corr
# (class not defined in this file). No scaling is applied.
num_tree_processor = make_pipeline(
    SimpleImputer(strategy='mean', add_indicator=True),
    Decorrelator(threshold_corr),
)

In [None]:
# Route numeric columns and object columns to their tree-model branches.
tree_preprocessor = make_column_transformer(
    (num_tree_processor, num_selector),
    (cat_tree_processor, cat_selector),
)

##### For non-tree-based models

In [None]:
# Categorical branch for the non-tree models: fill gaps with the most frequent
# value, one-hot encode into a dense array (unknown categories are ignored),
# then apply Decorrelator with threshold_corr (class not defined in this file).
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and 1.4 removed the old name, so the new spelling is tried
# first and the old one is kept as a fallback for earlier releases.
try:
    _dense_one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    _dense_one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)

cat_ntree_processor = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    _dense_one_hot,
    Decorrelator(threshold_corr),
)

In [None]:
# Numerical branch for the non-tree models: mean imputation with extra
# missing-indicator columns, standardisation, then Decorrelator with
# threshold_corr (class not defined in this file).
num_ntree_processor = make_pipeline(
    SimpleImputer(strategy='mean', add_indicator=True),
    StandardScaler(),
    Decorrelator(threshold_corr),
)

In [None]:
# Route numeric columns and object columns to their non-tree-model branches.
ntree_preprocessor = make_column_transformer(
    (num_ntree_processor, num_selector),
    (cat_ntree_processor, cat_selector),
)

##### Level-0 models

In [None]:
# Base learners of the stack as (name, pipeline) pairs. Each pipeline couples
# a preprocessor with a regressor: tree_preprocessor for the tree-based
# models, ntree_preprocessor for all the others.
level_0 = [
    # Gaussian processes: a linear (DotProduct) and an RBF kernel, each with a
    # constant offset and a white-noise term.
    ('GPRL', make_pipeline(ntree_preprocessor, GaussianProcessRegressor(kernel=ConstantKernel() * DotProduct() + ConstantKernel() + WhiteKernel()))),
    ('GPRR', make_pipeline(ntree_preprocessor, GaussianProcessRegressor(kernel=ConstantKernel() * RBF() + ConstantKernel() + WhiteKernel()))),
    # Tree-based models.
    ('DTR', make_pipeline(tree_preprocessor, DecisionTreeRegressor(max_depth=5))),
    # max_features=1.0 (all features) is what 'auto' meant for regressors;
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    ('RFR', make_pipeline(tree_preprocessor, RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1.0))),
    ('ABR', make_pipeline(tree_preprocessor, AdaBoostRegressor())),
    ('HGBR', make_pipeline(tree_preprocessor, HistGradientBoostingRegressor(early_stopping=True))),
    # Linear models.
    ('ELNE', make_pipeline(ntree_preprocessor, ElasticNet(alpha=0.01, l1_ratio=0.5))),
    ('ELNECV', make_pipeline(ntree_preprocessor, ElasticNetCV(cv=5))),
    ('LINR', make_pipeline(ntree_preprocessor, LinearRegression())),
    # Neural network with two hidden layers of layer_size units each.
    ('MLPR', make_pipeline(ntree_preprocessor, MLPRegressor(hidden_layer_sizes=(layer_size, layer_size,), max_iter=1000, early_stopping=True))),
    # Nearest neighbours with default settings.
    ('KNR', make_pipeline(ntree_preprocessor, KNeighborsRegressor())),
]

##### Level-1 model

In [None]:
# Meta-learner of the stack: ElasticNet whose regularisation is chosen by
# 5-fold cross-validation.
level_1 = ElasticNetCV(cv=5)

##### Stacking for regression

In [None]:
# Stack all level-0 pipelines under the level-1 meta-learner; n_jobs=-1 lets
# scikit-learn fit the base learners on all available cores.
model = StackingRegressor(
    estimators=level_0,
    final_estimator=level_1,
    n_jobs=-1,
)

### Model fitting

In [None]:
%%time 
# The %%time cell magic must stay on the first line of the cell.
# Show estimators as HTML diagrams, then fit the full stack on the training set.
set_config(display='diagram') 
model.fit(X_train, y_train)

### Model evaluation

#### Model scoring

In [None]:
# score_stacking() (helper not defined in this file) returns two values.
# score_stack is later passed to model_filtering(); mod_imp_score is not used
# again in the visible cells.
score_stack, mod_imp_score = score_stacking(model, X_train, y_train, X_test, y_test)

#### Model importance

In [None]:
# Helper not defined in this file. Its return value is kept because
# model_filtering() consumes model_imp when the final stack is built.
model_imp = plot_model_importance(model)

#### Feature permutation importance (a posteriori)

In [None]:
# Helper not defined in this file; it is given the fitted stack and the
# held-out test set.
plot_perm_importance(model, X_test, y_test)

#### Partial Dependence & Individual Conditional Expectation 

##### Features of interest

In [None]:
# Passed to plot_partial_dependence() in the next cell; empty by default
# (presumably meant to hold feature names chosen by the user — confirm).
features_of_interest = []

In [None]:
# Helper not defined in this file; note that it is given the training set.
plot_partial_dependence(model, X_train, features_of_interest)

#### Residuals plot

In [None]:
# Yellowbrick residuals plot for the stack on the train and test sets.
residuals_plot(model, X_train, y_train, X_test, y_test);

#### Prediction error

In [None]:
# Yellowbrick prediction-error plot for the stack, given the train and test sets.
prediction_error(model, X_train, y_train, X_test, y_test);

### Final Model

In [None]:
# Same value as assigned with the other thresholds earlier; presumably
# repeated here so it can be tuned right before filtering — confirm.
threshold_model = 5

In [None]:
# Same value as assigned with the other thresholds earlier; presumably
# repeated here so it can be tuned right before filtering — confirm.
threshold_score = 0.5

##### Filtered Level-0 models

In [None]:
# model_filtering() (helper not defined in this file) receives the level-0
# list, the model importances, the scores and both thresholds. Its result is
# used as the estimator list of the final stack; the selection rule lives in
# the helper.
level_0_f = model_filtering(level_0, model_imp, threshold_model, score_stack, threshold_score)

##### Final Level-1 model

In [None]:
# Fresh, unfitted meta-learner for the final stack (ElasticNet with 5-fold CV).
level_1 = ElasticNetCV(cv=5)

##### Build final model

In [None]:
# Final stack: only the filtered level-0 pipelines, same kind of meta-learner.
# Rebinding `model` replaces the first stack.
model = StackingRegressor(
    estimators=level_0_f,
    final_estimator=level_1,
    n_jobs=-1,
)

### Final Model fitting

In [None]:
%%time 
# The %%time cell magic must stay on the first line of the cell.
# Show estimators as HTML diagrams, then fit the final stack on the training set.
set_config(display='diagram') 
model.fit(X_train, y_train)

#### Final Model scoring

In [None]:
# Score the final stack with the same helper as before; the return value is
# not stored this time.
score_stacking(model, X_train, y_train, X_test, y_test)

#### Final feature permutation importance

In [None]:
# Same helper as for the first stack, now given the final stack and the test set.
plot_perm_importance(model, X_test, y_test)

#### Final Residuals plot

In [None]:
# Yellowbrick residuals plot for the final stack on the train and test sets.
residuals_plot(model, X_train, y_train, X_test, y_test);

#### Final Prediction error

In [None]:
# Yellowbrick prediction-error plot for the final stack, given the train and test sets.
prediction_error(model, X_train, y_train, X_test, y_test);

## Deployment

### Save your model

In [None]:
# Persist the fitted final stack with joblib under model_name
# (relative path, so the file lands in the current working directory).
model_name = 'model.sav' 
dump(model, model_name)

### Generate your server

In [None]:
# Generate the API server with the fastapi_server() helper (not defined in
# this file). Reuse model_name so the server always refers to the file that
# was just written by dump(), instead of repeating the literal file name.
fastapi_server(model, model_name, X, y)

### Start your server

In [None]:
run server.py

### Test your API at http://127.0.0.1:8000/docs