# EDA & modeling

## Loading main packages 

In [None]:
from EZS_func import *
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import re
import math
from pandas.api.types import is_numeric_dtype
from itertools import product
from scipy import stats
from sklearn import set_config
from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.inspection import permutation_importance
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from polylearn import PolynomialNetworkClassifier
from polylearn import FactorizationMachineClassifier


## Exploratory Data Analysis 

### File and parameters loading

In [None]:
# Task type selected for this run; no cell visible in this notebook reads it again.
problem_type = 'classification'

In [None]:
# NOTE(review): this is the string 'False', not the boolean False, so a plain
# `if stacking:` would evaluate as true. No visible cell reads it — confirm how
# consumers compare it before changing the type.
stacking = 'False'

In [None]:
# Dataset-size setting; no cell visible in this notebook reads it again.
data_size = 'small'

In [None]:
# NOTE(review): absolute, machine-specific path — the notebook cannot run on
# another machine without editing this line.
df = pd.read_csv('/home/philippe/Documents/python project/EZStacking-v0.5/dataset/iris.csv')

In [None]:
# Name of the column that is later split off as the prediction target y.
target_col = 'variety'

In [None]:
# Columns the user wants removed up front; empty here, so the later drop removes nothing.
user_drop_cols = []

In [None]:
# Passed to drop_na(); presumably the NaN fraction above which a column is
# dropped — TODO confirm in EZS_func.
threshold_NaN = 0.5

In [None]:
# Passed to encoding(); presumably a cardinality cutoff for treating a column
# as categorical — TODO confirm in EZS_func.
threshold_cat = 5

In [None]:
# Passed to outliers(); presumably a Z-score cutoff — TODO confirm in EZS_func.
threshold_Z = 3.0

### Dataset Information

In [None]:
# (rows, columns) of the freshly loaded dataframe.
df.shape

Some records

In [None]:
# Show the dataframe in the notebook output.
display(df)

### Dataframe structure

In [None]:
# Helper presumably provided by the EZS_func star import; what it plots is not
# visible from this notebook.
plot_dataframe_structure(df)

### Dataframe statistics

In [None]:
# Summary statistics, transposed so that each dataframe column becomes one row.
display(df.describe().transpose())

Which columns could be categorical?

In [None]:
# Helper presumably provided by the EZS_func star import; per the question
# above, intended to show which columns could be categorical.
plot_categorical(df)

### Dataset Cleaning

#### Duplicate rows:

In [None]:
# NOTE(review): the return value is discarded. If duplicates() returns a
# de-duplicated frame instead of reporting or mutating df in place, that result
# is lost — TODO confirm against EZS_func.
duplicates(df)

#### Drop user-specified columns:

In [None]:
# Echo the user-supplied drop list before it is applied.
print('user_drop_cols = ', user_drop_cols)

In [None]:
# Remove the user-selected columns (an empty list leaves every column in place).
df = df.drop(columns=user_drop_cols)

#### Drop NaN:

In [None]:
# drop_na (presumably from EZS_func) returns the filtered frame and, judging by
# how drop_cols is used afterwards, the columns it dropped. How threshold_NaN is
# applied is not visible here.
df, drop_cols = drop_na(df, threshold_NaN)

Final set of dropped columns

In [None]:
# Merge the helper-dropped and user-dropped column names; np.unique returns
# them de-duplicated and sorted.
dropped_cols = np.unique(np.concatenate((drop_cols, user_drop_cols)))

In [None]:
# Show the combined set of dropped columns.
display(dropped_cols)

#### Encoding data:

In [None]:
# encoding (presumably from EZS_func) returns the transformed frame plus a
# second value bound to encoded_cols; encoded_cols is not used by any later
# visible cell.
df, encoded_cols = encoding(df, threshold_cat)

#### Imputing NaN using IterativeImputer

##### Imputation

In [None]:
# imputation (presumably from EZS_func); the heading says it relies on
# IterativeImputer, which cannot be verified from this notebook.
df = imputation(df)

#### Data compression:

In [None]:
# downcast_dtypes (presumably from EZS_func); per the heading it is meant to
# reduce memory use — TODO confirm which dtypes it changes.
df = downcast_dtypes(df)

Dataframe structure after compression

In [None]:
# Same plot as before the cleaning steps, repeated so the two can be compared.
plot_dataframe_structure(df)

#### Outliers:

In [None]:
# outliers (presumably from EZS_func) returns a new frame; whether rows are
# removed or values are altered is not visible here. Note that target_col is
# still part of df at this point.
df = outliers(df, threshold_Z)

#### Splitting the dataframe into features and target

In [None]:
# Target vector: the single column named by target_col.
y = df[target_col]

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns=target_col)

## Splitting data into train and test sets

In [None]:
# split is not sklearn's train_test_split (imported above but unused in the
# visible cells); presumably an EZS_func wrapper. test_size=0.33 presumably
# holds out about a third of the rows — TODO confirm stratification and seeding.
X_train, X_test, y_train, y_test = split(X, y, test_size=0.33)

## Modeling

### Building the model

Simple model

In [None]:
# Baseline classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

Other possible models:

In [None]:
# Alternative estimators; to try one, assign it to `model` in place of
# LogisticRegression().
# GaussianProcessClassifier()
# DecisionTreeClassifier(max_depth=5)
# max_features='auto' was removed in scikit-learn 1.3; for classifiers it meant 'sqrt'.
# RandomForestClassifier(max_depth=5, n_estimators=10, max_features='sqrt')
# AdaBoostClassifier()
# LogisticRegression()
# MLPClassifier(alpha=1, max_iter=1000, early_stopping=True)
# KNeighborsClassifier(n_neighbors=len(y.unique()))
# GaussianNB()


### Model fitting

In [None]:
%%time
# Global scikit-learn setting: render estimators as diagrams in notebook output.
set_config(display='diagram') 
# Fit on the training split only; the test split stays unseen until scoring.
model.fit(X_train, y_train)

### Model evaluation

#### Model scoring

In [None]:
# For a scikit-learn classifier score() is mean accuracy; here it is measured
# on the same rows the model was fitted on.
print('Score on train set:', model.score(X_train, y_train))

In [None]:
# Mean accuracy on the held-out split.
print('Score on test set:', model.score(X_test, y_test))

#### Feature permutation importance (a posteriori)

In [None]:
# NOTE(review): importance is computed on the full X, y (training and test rows
# together), not on the held-out split — confirm this is intended.
# The trailing ';' keeps the notebook from echoing the call's return value.
plot_perm_imp(model, X, y, scoring = 'accuracy');