# Modelling: Creating a Pipeline

In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# Columns whose categories have a natural order (see the "Ordinal Encoding"
# notes in this notebook); every other column of X is treated as nominal.
ordinal_categorical_features = ['WDK_ID', 'MAXSNELHD', 'ZAD_ID', 'LGD_ID']
nominal_categorical_features = [feature for feature in X.columns if feature not in ordinal_categorical_features]


# Nominal features: one binary column per category. Categories that were not
# seen during fit are encoded as all zeros instead of raising.
# NOTE(review): no imputation step is applied inside the pipeline; missing
# values are assumed to be handled before X reaches this cell - TODO confirm.
nominal_categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Ordinal features: one integer code per category.
# Unseen categories at transform time are mapped to -1 instead of raising, so
# that predicting on the test split behaves consistently with the one-hot
# branch above (which already ignores unknown categories).
# NOTE(review): without an explicit `categories=` argument the integer codes
# follow the sorted order of the observed values, which is not necessarily the
# order documented for these features - confirm and pass `categories=` if needed.
ordinal_categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

# Route each group of columns to its own encoder.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_ordinal', ordinal_categorical_transformer, ordinal_categorical_features),
        ('cat_nominal', nominal_categorical_transformer, nominal_categorical_features)
    ])

# Encode target values.
# The encoded labels are kept in `y_encoded` instead of being thrown away, so
# they are available to estimators that need integer class labels.
# NOTE(review): the random forest below is still trained on the original `y`,
# exactly as before; switch the split to `y_encoded` if integer labels are wanted.
target_enc = LabelEncoder()
y_encoded = target_enc.fit_transform(y)

# Stratified split keeps the class distribution of y equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)


# Full model: preprocessing and classifier in one estimator, so the encoders
# are fitted on the training split only.
model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, verbose=1))
])

model.fit(X_train, y_train)

# Data Visualisatie

#### Frequentieplots per klasse

In [None]:
# def count_plot(df, columns, target):   
#     fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(30,50))
#     for col, ax in zip(columns, axs.flatten()):
#         g = sb.countplot(data=df, x = df[col], hue= df[target], ax=ax)
#         g.set_yscale("log")
#         # Put a legend to the right side
#         # g.legend(loc='center left', bbox_to_anchor=(1, 0.5))
#         ax.tick_params(axis='x', rotation=55)
#     fig.tight_layout()
# #     margin = m/fig.gcf().get_size_inches()[0]
# #     fig.gcf().subplots_adjust(left=margin, right=1.-margin)
        
# selectie = ['JAAR_VKL', 'ANTL_PTJ', 'AOL_ID', 'BEBKOM', 'OTE_ID',
#              'IND_ALC', 'MAXSNELHD', 'WDK_ID', 'WGD_CODE_1', 'WGD_CODE_2',
#              'WSE_ID', 'WVG_ID', 'WVL_ID', 'LGD_ID', 'ZAD_ID',
#              'BZD_ID_IF1','BZD_ID_IF2', 'BZD_ID_TA1', 'BZD_ID_TA2', 'BZD_ID_VM1',
#              'BZD_ID_VM2','BZD_ID_VM3', 'NIVEAUKOP', 'GME_NAAM', 'PVE_NAAM']

In [None]:
# count_plot(df_TOTAAL, selectie, 'AP3_CODE')

### Impute Missing Data
>``We markeren de ontbrekende waarden als een losstaande categorie, hierdoor kunnen we een groot deel van de data nog steeds gebruiken``

In [None]:
# Replace every missing value with the constant label 'afwezig' so that
# "missing" becomes a category of its own, then rebuild a DataFrame that keeps
# the row index and column names of X.
imputer = SimpleImputer(strategy='constant', fill_value='afwezig')
imputer.fit(X)
features = pd.DataFrame(
    data=imputer.transform(X),
    index=X.index,
    columns=X.columns,
)

# Print a summary of the imputed frame.
features.info()

### Feature Encoding

>#### Ordinal Encoding
> Bij ordinale encoding hebben de categorieën een natuurlijke volgorde. Dit geldt voor de volgende features:
>> **WDK_ID**:
>> - droog
>> - nat
>> - sneeuw/ijzel
>
>> **MAXSNELHD**:
>> - 50 km/u
>> - 60 km/u
>> - 70 km/u
>> - 80 km/u
>> - 90 km/u
>> - 100 km/u
>> - 120 km/u
>> - 130 km/u
> 
>> **ZAD_ID**:
>> - Onbeperkt (meer dan 200m)
>> - Beperkt (tot 200m)
>> - Ernstig beperkt (tot 50m)

>> **LGD_ID**:
>> - Daglicht
>> - Schemer
>> - Duisternis

> #### One-Hot Encoding
>> **De Rest**
>> - ``De rest van de categorische waarden is nominaal; dit betekent dat er geen natuurlijke volgorde is tussen de categorieën``
>> - ``Een One-Hot Encoding transformeert alle nominale categorische features naar binaire kolommen; hierdoor krijgen we een heleboel nieuwe kolommen.``
