In [None]:
import mlflow.sklearn
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

from pandas.api.types import CategoricalDtype

# Enable MLflow autologging for scikit-learn up front, before any estimator in
# this notebook is fitted.
mlflow.sklearn.autolog()

Some helper functions for processing the features from the tables.

In [None]:
def encode(df):
    """Cast the categorical columns of a loan table to pandas categoricals.

    The nominal columns ('State', 'Loan Purpose', 'Due Settlement',
    'Payment Plan') become unordered categoricals whose levels are inferred
    from the data. The ordinal columns ('Home Ownership',
    'Income Verification Status') become ordered categoricals using the fixed
    level order listed in ``ordered_categories`` below.

    The input frame is not modified: a converted copy is returned, so the
    function can be re-run on the same frame without side effects.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing all six columns named above. Any other columns are
        passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order, with the six
        columns converted to categorical dtypes.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``df``.
    """
    ordered_categories = {
        'Home Ownership': ['MORTGAGE', 'RENT', 'OWN'],
        'Income Verification Status': ['Not Verified', 'Partially Verified', 'Verified']
    }
    unordered_categories = ['State', 'Loan Purpose', 'Due Settlement', 'Payment Plan']

    # Nominal categories: let pandas infer the levels from the data.
    dtypes = {name: "category" for name in unordered_categories}
    # Ordinal categories: pin the level order explicitly.
    # NOTE: pandas turns any value that is not one of the listed levels into
    # NaN when casting to a fixed CategoricalDtype.
    dtypes.update(
        (name, CategoricalDtype(levels, ordered=True))
        for name, levels in ordered_categories.items()
    )

    # astype() with a column -> dtype mapping returns a new frame, so the
    # caller's DataFrame is left untouched.
    return df.astype(dtypes)


def read_table_and_encode(table_name):
    """Load a Spark table into pandas and encode its categorical columns.

    The table is read through the ``spark`` session object, which this
    notebook never defines (presumably injected by the hosting runtime;
    confirm). It is converted to a pandas DataFrame with ``toPandas()`` and
    then passed through ``encode``.

    Parameters
    ----------
    table_name : str
        Name of the table to read via ``spark.read.table``.

    Returns
    -------
    The result of ``encode`` applied to the pandas version of the table.
    """
    pandas_df = spark.read.table(table_name).toPandas()
    return encode(pandas_df)

Loading the data and creating the pipeline for preprocessing.

In [None]:
# Read both splits (test first, then train) as encoded pandas frames.
encoded_test = read_table_and_encode("Test")
encoded_train = read_table_and_encode("Train")

# pop() removes the target column from the frame in place; the remaining
# columns, minus the 'id' key, are used as features.
y_test = encoded_test.pop('Approve Loan')
X_test = encoded_test.drop(columns='id')

y_train = encoded_train.pop('Approve Loan')
X_train = encoded_train.drop(columns='id')

# Partition the feature names by dtype: categorical columns vs. everything else.
categorical_vars = X_train.select_dtypes(include='category').columns.to_list()
numerical_vars = X_train.select_dtypes(exclude='category').columns.to_list()

# Non-categorical features: standardise with StandardScaler.
numeric_preprocessor = Pipeline([("scaler", StandardScaler())])

# Categorical features: one-hot encode. With handle_unknown="ignore",
# scikit-learn encodes categories unseen during fit as all-zero rows instead
# of raising at transform time.
categorical_preprocessor = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group to its own sub-pipeline; the categorical block
# comes first in the transformed output.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, categorical_vars),
        ("numerical", numeric_preprocessor, numerical_vars),
    ]
)

Create the combined preprocessing and training pipeline, fit it on the training set, and score it on the test set.

In [None]:
# Chain the preprocessing step with a logistic-regression classifier.
pipe = make_pipeline(preprocessor, LogisticRegression())

# Fit and evaluate inside an explicit MLflow run.
with mlflow.start_run():
    pipe.fit(X_train, y_train)
    # Keep the result: an expression nested inside a `with` block is not
    # echoed by the notebook, so the score would otherwise be discarded.
    test_score = pipe.score(X_test, y_test)

# Last top-level expression of the cell, so the notebook displays it.
test_score

                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['State', 'Income Verification Status',
                                  'Home Ownership', 'Loan Purpose',
                                  'Due Settlement', 'Payment Plan']),
                                ('numerical',
                ...`
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['State', 'Income Verification Status',
                                  'Home Ownership', 'Loan Purpose',
                                  'Due Settlement', 'Payment Plan']),
                                ('numerical',
                                 Pipeli...`
