## https://github.com/BCG-Gamma/facet
- sklearndf
# conda install -c bcg_gamma sklearndf
# conda install sklearndf -c bcg_gamma -c conda-forge
- gamma-facet
# conda install gamma-facet -c bcg_gamma -c conda-forge



In [2]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# relevant sklearndf imports
from sklearndf.transformation import (
    ColumnTransformerDF,
    OneHotEncoderDF,
    SimpleImputerDF,
)
from sklearndf.pipeline import (
    PipelineDF,
    ClassifierPipelineDF
)
from sklearndf.classification import RandomForestClassifierDF

# Fetch the Titanic dataset from OpenML as pandas objects (as_frame=True),
# already split into features and target (return_X_y=True).
titanic_X, titanic_y = fetch_openml(
    "titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

# Columns used by the model, grouped by the preprocessing they receive.
categorical_features = ["embarked", "sex", "pclass"]
numerical_features = ["age", "fare"]

# Categorical columns: fill missing entries with the constant "Unknown",
# then one-hot encode the result.
# NOTE(review): scikit-learn >= 1.2 renames OneHotEncoder's `sparse` argument
# to `sparse_output` -- confirm which spelling the installed
# scikit-learn/sklearndf versions expect.
preprocessing_categorical_df = PipelineDF(
    steps=[
        ("imputer", SimpleImputerDF(strategy="constant", fill_value="Unknown")),
        ("one-hot", OneHotEncoderDF(sparse=False, handle_unknown="ignore")),
    ]
)

# Numeric columns: impute missing values using the "median" strategy.
preprocessing_numeric_df = SimpleImputerDF(strategy="median")

# Route each column group through its own preprocessing step.
preprocessing_df = ColumnTransformerDF(
    transformers=[
        ("categorical", preprocessing_categorical_df, categorical_features),
        ("numeric", preprocessing_numeric_df, numerical_features),
    ]
)

# Fit the preprocessing on the full dataset and preview the first rows of the
# transformed data frame.
transformed_df = preprocessing_df.fit_transform(X=titanic_X, y=titanic_y)
transformed_df.head()

feature_out,embarked_C,embarked_Q,embarked_S,embarked_Unknown,sex_female,sex_male,pclass_1.0,pclass_2.0,pclass_3.0,age,fare
0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,29.0,211.3375
1,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.9167,151.55
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,151.55
3,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,30.0,151.55
4,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,25.0,151.55


In [3]:
# Flag every output column whose original (pre-transformation) feature name is
# "embarked". The result is used below as a column selector.
# NOTE(review): presumably `feature_names_original_` is a pandas Series mapping
# each output column to its source feature -- confirm in the sklearndf docs.
embarked_type_derivatives = preprocessing_df.feature_names_original_ == "embarked"
# Show only the flagged columns; in the output recorded for this cell these are
# the embarked_* one-hot columns.
transformed_df.loc[:, embarked_type_derivatives].head()

feature_out,embarked_C,embarked_Q,embarked_S,embarked_Unknown
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0


In [4]:
# Build the full model: the existing `preprocessing_df` transformer feeds a
# random forest classifier.
pipeline_df = ClassifierPipelineDF(
    preprocessing=preprocessing_df,
    classifier=RandomForestClassifierDF(
        n_estimators=1000,
        max_features=2/3,
        max_depth=7,
        random_state=42,  # fixed seed so the fitted forest is reproducible
        n_jobs=-3,  # scikit-learn/joblib convention: all CPUs but two
    ),
)

# Split into train and test sets (fixed seed for a reproducible split), fit
# the pipeline on the training part, then score it on the held-out part.
df_train, df_test, y_train, y_test = train_test_split(titanic_X, titanic_y, random_state=42)
pipeline_df.fit(df_train, y_train)

# Round with the built-in round() instead of the `.round()` method: the method
# exists only on NumPy scalars, so it raises AttributeError if `score` returns
# a plain Python float. round() handles both and prints the same text.
print(f"model score: {round(pipeline_df.score(df_test, y_test), 2)}")

model score: 0.79
