# EDA & Modelling

## Package loading

In [None]:
from EZS_func import *
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import re
import math
from pandas.api.types import is_numeric_dtype
from itertools import product
from scipy import stats
from sklearn import set_config
from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.inspection import permutation_importance
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from yellowbrick.model_selection import learning_curve
from yellowbrick.model_selection import feature_importances
from yellowbrick.features import rank1d
from yellowbrick.features import rank2d
from yellowbrick.regressor import prediction_error
from yellowbrick.regressor import residuals_plot
from yellowbrick.contrib.missing import MissingValuesBar
from yellowbrick.contrib.missing import MissingValuesDispersion
from yellowbrick.target.feature_correlation import feature_correlation
from polylearn import PolynomialNetworkRegressor
from polylearn import FactorizationMachineRegressor


## Exploratory Data Analysis 

### File and parameters loading

In [None]:
problem_type = 'regression'  # not read by any cell shown below; presumably set by the notebook generator — TODO confirm

In [None]:
stacking = 'True'  # NOTE(review): this is the string 'True', not the bool True; not read by any cell shown below

In [None]:
data_size = 'small'  # not read by any cell shown below; presumably a generator option — TODO confirm

In [None]:
# Load the raw dataset into the working dataframe used by every later cell.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere without editing it.
df = pd.read_csv('/home/philippe/Documents/python project/EZStacking-v0.6/dataset/fmri.csv')

In [None]:
target_col = 'signal'  # column of df used as the target y and removed from the feature matrix X

In [None]:
user_drop_cols = []  # column labels the user wants removed from df before cleaning; empty means none

In [None]:
threshold_NaN = 0.5  # passed to drop_na(); presumably the NaN fraction above which a column is dropped — TODO confirm

In [None]:
threshold_cat = 5  # passed to encoding(); presumably the distinct-value count deciding what is categorical — TODO confirm

In [None]:
threshold_Z = 3.0  # passed to outliers(); presumably an absolute Z-score cut-off — TODO confirm

### Dataset Information

In [None]:
df.shape  # (number of rows, number of columns) of the freshly loaded dataframe

#### Some records

In [None]:
display(df)  # the whole frame is passed; the notebook's pandas display options decide how many rows are rendered

#### Dataframe structure

In [None]:
plot_dataframe_structure(df)  # helper not defined in this notebook (presumably from EZS_func) — behaviour not visible here

#### Dataframe statistics

In [None]:
# Summary statistics, transposed so that each dataframe column becomes one row.
display(df.describe().transpose())

#### Which columns could be categorical?

In [None]:
plot_categorical(df)  # helper not defined in this notebook (presumably from EZS_func) — behaviour not visible here

### Dataset Cleaning

#### Duplicate rows:

In [None]:
# NOTE(review): the result is not assigned back to df, so duplicate rows are only
# removed if duplicates() mutates df in place — confirm against EZS_func.
duplicates(df)

#### Drop user-specified columns:

In [None]:
# Echo the user's drop list; print() adds its own separator after the label's trailing space.
print('user_drop_cols = ', user_drop_cols)

In [None]:
# Remove the user-selected columns; an empty list leaves the frame unchanged.
df = df.drop(columns=user_drop_cols)

#### Drop NaN:

In [None]:
# drop_na() is not defined in this notebook (presumably from EZS_func). It returns the
# updated dataframe and a collection of column names that is merged into dropped_cols below.
df, drop_cols = drop_na(df, threshold_NaN)

Final set of dropped columns

In [None]:
# Merge the NaN-based and user-requested drop lists into one sorted, de-duplicated array.
dropped_cols = np.unique(np.concatenate((drop_cols, user_drop_cols)))

In [None]:
display(dropped_cols)  # show every column removed so far

#### Encoding data:

In [None]:
# encoding() is not defined in this notebook (presumably from EZS_func).
# encoded_cols is not read by any later cell shown here.
df, encoded_cols = encoding(df, threshold_cat)

#### Imputing NaN using IterativeImputer

In [None]:
# Bar chart of missing-value counts, restricted to the numeric columns of df.
numeric_df = df.select_dtypes(include=np.number)
visualizer = MissingValuesBar(features=numeric_df.columns.tolist())
visualizer.fit(numeric_df)
visualizer.show();

##### Imputation

In [None]:
df = imputation(df)  # helper not defined in this notebook (presumably from EZS_func); df is replaced by its result

#### Data compression:

In [None]:
df = downcast_dtypes(df)  # helper not defined in this notebook (presumably from EZS_func); df is replaced by its result

##### Dataframe structure after compression

In [None]:
plot_dataframe_structure(df)  # same helper as before compression, now applied to the downcast frame

#### Outliers:

In [None]:
# outliers() is not defined in this notebook (presumably from EZS_func); df is replaced by its result.
# NOTE(review): runs before the X/y split, so the target column is part of the input — confirm this is intended.
df = outliers(df, threshold_Z)

#### Splitting the dataframe into features and target

In [None]:
y = df[target_col]  # target: the single column named by target_col

In [None]:
# Feature matrix: every remaining column except the target.
X = df.drop(columns=target_col)

### Plots

#### Ranking 

##### Ranking 1D 

In [None]:
# Yellowbrick quick method with its default ranking algorithm.
# Note: df (target column included) is ranked here, not X.
rank1d(df);

##### Ranking 2D 

###### Ranking 2D according to Pearson

In [None]:
rank2d(df, algorithm='pearson');  # pairwise ranking over all columns of df, target included

###### Ranking 2D based on covariance

In [None]:
rank2d(df, algorithm='covariance');  # pairwise ranking over all columns of df, target included

###### Ranking 2D according to Spearman

In [None]:
rank2d(df, algorithm='spearman');  # pairwise ranking over all columns of df, target included

###### Ranking 2D according to Kendall's tau

In [None]:
rank2d(df, algorithm='kendalltau');  # pairwise ranking over all columns of df, target included

#### Correlation

##### Correlation with Yellowbrick

In [None]:
feature_correlation(X, y);  # correlation of each feature with the target, using Yellowbrick's default method

In [None]:
feature_correlation(X, y, method='mutual_info-regression');  # same plot, scored by mutual information for a continuous target

#### Feature importance (a priori)

##### According to decision tree

In [None]:
# A fresh, unseeded DecisionTreeRegressor is passed in, so the importances may vary between runs.
feature_importances(DecisionTreeRegressor(), X, y);

##### According to ElasticNet regression

In [None]:
# Same ElasticNet hyper-parameters as the 'ELNE' level-0 model defined later in the notebook.
feature_importances(ElasticNet(alpha=0.01, l1_ratio=0.5), X, y);

## Splitting data into train and test sets

In [None]:
# split() is not defined in this notebook (presumably from EZS_func; it is not sklearn's train_test_split).
# test_size=0.33 presumably holds out about a third of the rows — TODO confirm.
X_train, X_test, y_train, y_test = split(X, y, test_size=0.33)

## Modelling

### Model building

##### Level-0 models

In [None]:
level_0 = [
          # Named base learners; this list is handed to StackingRegressor as its estimators.
          ('GPR', GaussianProcessRegressor()),
          ('DTR', DecisionTreeRegressor(max_depth=5)),
          # max_features=1.0 (consider every feature at each split) is what 'auto' meant
          # for forest regressors; 'auto' was deprecated in scikit-learn 1.1 and removed
          # in 1.3, where it raises a parameter-validation error.
          ('RFR', RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1.0)),
          ('ABR', AdaBoostRegressor()),
          ('ELNE', ElasticNet(alpha=0.01, l1_ratio=0.5)),
          ('LINR', LinearRegression()),
          ('MLPR', MLPRegressor(alpha=1, max_iter=1000, early_stopping=True)),
          ('KNR', KNeighborsRegressor()),
          ('PNR', PolynomialNetworkRegressor()),
          ('FMR', FactorizationMachineRegressor()),
          ]

##### Level-1 model

In [None]:
# Final (meta) estimator of the stack, created with the library's default hyper-parameters.
# NOTE(review): unlike the level-0 'MLPR', max_iter is left at its default, and any
# convergence warning is hidden by the notebook-wide warnings filter — confirm this is intended.
level_1 = MLPRegressor()

##### Stacking for regression

In [None]:
# Stack the level-0 learners under the level-1 final estimator.
model = StackingRegressor(estimators=level_0, final_estimator=level_1)

### Model fitting

In [None]:
%%time
# Ask scikit-learn to render estimators as diagrams when they are displayed in the notebook.
set_config(display='diagram') 
# Fit the stacked model on the training split; as the cell's last expression, the fitted model is displayed.
model.fit(X_train, y_train)

### Model evaluation

#### Model scoring

In [None]:
# score_stacking_r() is not defined in this notebook (presumably from EZS_func); it receives both splits.
score_stacking_r(model, X_train, y_train, X_test, y_test)

#### Feature permutation importance (a posteriori)

In [None]:
# plot_perm_imp() is not defined in this notebook (presumably from EZS_func).
# NOTE(review): the full X, y (training rows included) is passed rather than the held-out
# X_test, y_test — confirm this is intended for an a posteriori importance estimate.
plot_perm_imp(model, X, y, scoring = 'r2');

#### Residuals plot

In [None]:
residuals_plot(model, X_train, y_train, X_test, y_test);  # Yellowbrick quick method; training split first, then the held-out split

#### Prediction error

In [None]:
prediction_error(model, X_train, y_train, X_test, y_test);  # Yellowbrick quick method; training split first, then the held-out split