In [1]:
import os

import polars as pl
import mlflow
import pandas as pd
import logging
import shap
import numpy as np
import random
import pathlib
import lightgbm as lgb
import plotly.graph_objects as go
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler
from sklearn.compose import make_column_transformer
from typing import Callable

from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.utils import compute_sample_weight
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from fraud_detection.preprocessing.training import preprocess_data_for_training

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display options: print polars floats in full and pandas floats with three decimals.
pl.Config(set_fmt_float="full")
pd.options.display.float_format = '{:.3f}'.format
# pd.options.plotting.backend = "matplotlib"

# The project root used to be a hard-coded absolute path; it can now be overridden
# through the FRAUD_DETECTION_ROOT environment variable. The default is unchanged,
# so the notebook behaves exactly as before when the variable is not set.
PROJECT_ROOT = pathlib.Path(os.environ.get("FRAUD_DETECTION_ROOT", "/home/paolo/git/fraud-detection"))
os.chdir(PROJECT_ROOT)

# Relative tracking URI: resolved against the working directory set just above.
mlflow.set_tracking_uri("./mlflow_runs")

In [3]:
## Disable logging warnings
# Resolve every logger name currently registered with the logging manager,
# then switch each logger off and stop it from propagating to its parents.
loggers = list(map(logging.getLogger, logging.root.manager.loggerDict))
for logger in loggers:
    logger.propagate = False
    logger.disabled = True

In [4]:
# Random seed shared by set_seed(), the train/validation splits and the estimators defined below.
SEED = 42
# NOTE(review): not referenced anywhere in the visible part of the notebook —
# presumably toggles logging of fitted models to MLflow; TODO confirm.
LOG_MODEL = False

In [5]:
from sklearn.base import BaseEstimator
from sklearn.metrics import average_precision_score, roc_auc_score, ConfusionMatrixDisplay, PrecisionRecallDisplay, \
    RocCurveDisplay, confusion_matrix, brier_score_loss
from sklearn.calibration import CalibrationDisplay, CalibratedClassifierCV


def set_seed(seed: int) -> None:
    """Seed NumPy's global RNG and Python's `random` module with the same value.

    Args:
        seed: Integer seed applied to both generators.
    """
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)

def retrieve_categorical_columns_to_encode(dataframe: pd.DataFrame, max_one_hot_cardinality: int = 5) -> tuple[list[str], list[str]]:
    """Split the categorical columns of `dataframe` by cardinality.

    Args:
        dataframe: Frame whose `category`-dtype columns are inspected.
        max_one_hot_cardinality: Largest number of distinct (non-null) values a
            column may have to be one-hot encoded. Defaults to 5, the cutoff that
            was previously hard-coded.

    Returns:
        A pair `(one_hot_encoded_columns, ordinal_encoded_columns)`, each in the
        column order of `dataframe`. Columns with more than
        `max_one_hot_cardinality` distinct values go to the ordinal list.
    """
    one_hot_encoded_columns: list[str] = []
    ordinal_encoded_columns: list[str] = []

    for col in dataframe.select_dtypes("category").columns:
        is_low_cardinality = dataframe[col].nunique() <= max_one_hot_cardinality
        target = one_hot_encoded_columns if is_low_cardinality else ordinal_encoded_columns
        target.append(col)

    return one_hot_encoded_columns, ordinal_encoded_columns

def store_metrics(y_valid: pd.Series, y_pred: pd.Series, y_pred_proba: pd.Series, valid_sample_weight: np.ndarray) -> dict[str, float]:
    """Log diagnostic plots and classification metrics to the active MLflow run.

    Args:
        y_valid: Ground-truth labels (the confusion matrix is computed for labels 0 and 1).
        y_pred: Hard predictions, used for the confusion matrix and the metrics derived from it.
        y_pred_proba: Positive-class scores, used for the curves, AUC, average precision and Brier loss.
        valid_sample_weight: Per-sample weights applied to everything except the calibration curve.

    Returns:
        The metrics that were logged, each rounded to three decimals.
    """
    weights = valid_sample_weight

    # Each display is built and logged before the next one is created.
    plot_builders = (
        ("confusion_matrix.png", lambda: ConfusionMatrixDisplay.from_predictions(y_true=y_valid, y_pred=y_pred, normalize="all", sample_weight=weights)),
        ("precision_recall_curve.png", lambda: PrecisionRecallDisplay.from_predictions(y_true=y_valid, y_pred=y_pred_proba, sample_weight=weights)),
        ("roc_curve.png", lambda: RocCurveDisplay.from_predictions(y_true=y_valid, y_pred=y_pred_proba, sample_weight=weights)),
        ("calibration_curve.png", lambda: CalibrationDisplay.from_predictions(y_true=y_valid, y_prob=y_pred_proba, n_bins=10)),
    )
    for artifact_name, build_display in plot_builders:
        mlflow.log_figure(build_display().figure_, artifact_file=artifact_name)

    # Weighted confusion-matrix cells, flattened in the order tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_pred=y_pred, y_true=y_valid, labels=[0, 1], sample_weight=weights).ravel()

    # Small constant that keeps every denominator away from zero.
    epsilon = 1e-15

    predicted_positive = tp + fp
    actual_positive = tp + fn
    actual_negative = tn + fp
    predicted_negative = tn + fn

    precision = tp / (predicted_positive + epsilon)
    recall = tp / (actual_positive + epsilon)
    specificity = tn / (actual_negative + epsilon)

    raw_metrics = {
        "accuracy": (tp + tn) / (tp + tn + fp + fn + epsilon),
        "balanced_accuracy": (recall + specificity) / 2,
        "precision": precision,
        "recall": recall,
        "specificity": specificity,
        "f1": (2 * precision * recall) / (precision + recall + epsilon),
        "false_positive_rate": fp / (actual_negative + epsilon),
        "false_negative_rate": fn / (fn + tp + epsilon),
    }

    mcc = (tp * tn - fp * fn) / ((predicted_positive * actual_positive * actual_negative * predicted_negative) ** 0.5 + epsilon)
    fowlkes_mallows_index = tp / ((predicted_positive * actual_positive) ** 0.5 + epsilon)
    roc_auc = roc_auc_score(y_true=y_valid, y_score=y_pred_proba, sample_weight=weights, average="weighted")
    average_precision = average_precision_score(y_true=y_valid, y_score=y_pred_proba, sample_weight=weights, average="weighted")
    brier_loss = brier_score_loss(y_true=y_valid, y_prob=y_pred_proba, sample_weight=weights)

    raw_metrics.update({
        "average_precision": average_precision,
        "mcc": mcc,
        "fowlkes_mallows_index": fowlkes_mallows_index,
        "auc": roc_auc,
        "brier_loss": brier_loss,
    })

    metrics = {name: round(value, 3) for name, value in raw_metrics.items()}
    mlflow.log_metrics(metrics)

    return metrics

def compute_feature_importances(pipeline, X_train, X_valid) -> pd.DataFrame:
    """Compute SHAP-based global feature importances for the pipeline's linear model.

    The importance of a feature is the mean *absolute* SHAP value over `X_valid`.
    Averaging the signed values (as was done before) lets positive and negative
    contributions cancel out and gives features that consistently push the
    prediction down a negative score, so a `<= threshold` pruning rule would
    wrongly discard them.

    Args:
        pipeline: Fitted pipeline exposing the linear estimator under the "model" step.
        X_train: Background data for the explainer, already transformed into model features.
        X_valid: Data to explain, already transformed into model features.

    Returns:
        A frame with columns "column" and "importance", most important feature
        first (the same ordering as the other importance tables in this notebook).
        The table is also logged to MLflow as "feature_importances.json".
    """
    explainer = shap.LinearExplainer(pipeline["model"], X_train, nsamples=100_000, seed=SEED)
    shap_values = explainer.shap_values(X_valid)

    # Magnitude of the contribution, regardless of its direction.
    mean_abs_shap = np.mean(np.abs(shap_values), axis=0)

    feature_importances: pd.DataFrame = pd.DataFrame(
        data={"column": list(X_valid.columns), "importance": mean_abs_shap}
    ).sort_values(by="importance", ascending=False)
    mlflow.log_table(feature_importances, "feature_importances.json")

    # fig = px.histogram(feature_importances, x="importance", y="column", orientation="h", width=1500, height=1500)
    # mlflow.log_figure(fig, artifact_file="feature_importances.png")

    return feature_importances

def compute_threshold(X: pd.DataFrame, y: np.ndarray, model: BaseEstimator, sample_weight: np.ndarray) -> tuple[float, float]:
    """Pick the decision threshold that maximises the weighted F1 score.

    Candidate thresholds run from 0.05 in steps of 0.05 (stop value 1.05, exclusive).

    Args:
        X: Features passed to `model.predict_proba`.
        y: Ground-truth labels.
        model: Fitted estimator exposing `predict_proba`; column 1 is used as the positive-class score.
        sample_weight: Per-sample weights forwarded to `f1_score`.

    Returns:
        `(best_threshold, best_f1)`; on ties the lowest threshold wins.
    """
    thresholds: np.ndarray = np.arange(start=0.05, stop=1.05, step=0.05)

    # The scores do not depend on the threshold: predict once instead of once per candidate.
    positive_proba = model.predict_proba(X)[:, 1]

    scores: np.ndarray = np.array([
        f1_score(y_true=y, y_pred=positive_proba > threshold, sample_weight=sample_weight)
        for threshold in thresholds
    ])
    best_index: int = int(np.argmax(scores))

    return float(thresholds[best_index]), float(scores[best_index])

def train_valid_test_split(X: pd.DataFrame, y: pd.Series | np.ndarray, valid_size: float = 0.1, test_size: float = 0.1, seed: int = 42):
    if test_size + valid_size > 1.:
        raise ValueError(f"validation and test size must be less than 1.0")

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=valid_size, random_state=seed, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=test_size/X_train.shape[0], random_state=seed, stratify=y_train)
    return X_train, X_valid, X_test, y_train, y_valid, y_test

def get_learning_curves(model: lgb.LGBMModel, metric: str = "binary_logloss") -> go.Figure:
    """Plot the recorded training and validation curves of a fitted LightGBM model.

    The evaluation sets are located by name: the last key of `model.evals_result_`
    containing "train" is used as the training curve and the last key containing
    "valid" (and not "train") as the validation curve.

    Args:
        model: Fitted model exposing `evals_result_`.
        metric: Name of the recorded metric to plot.

    Returns:
        A Plotly figure with one line per curve, iterations numbered from 1.
    """
    history = model.evals_result_

    train_key = valid_key = ""
    for name in history:
        if "train" in name:
            train_key = name
            continue
        if "valid" in name:
            valid_key = name

    curves = pd.DataFrame({"train": history[train_key][metric], "valid": history[valid_key][metric]})
    iterations = curves.index + 1

    fig = go.Figure()
    for label, values in curves.items():
        fig.add_trace(go.Scatter(x=iterations, y=values, mode='lines', name=f"{label}"))

    layout = dict(
        title=f'Learning curves {metric}',
        xaxis_title='Iteration',
        yaxis_title=metric,
        margin=dict(l=0, r=0, b=0, t=30),
    )
    fig.update_layout(**layout)
    return fig

In [6]:
def define_pipeline(model: BaseEstimator, X: pd.DataFrame, scale_features: bool = True, use_one_hot_encoding: bool = True, use_ordinal_encoding: bool = True) -> Pipeline:
    """Build a two-step pipeline: column preprocessing followed by `model`.

    Args:
        model: Estimator placed in the final "model" step.
        X: Frame used only to decide which columns each transformer receives.
        scale_features: Apply `RobustScaler` to the numeric columns.
        use_one_hot_encoding: One-hot encode the categorical columns selected for it
            by `retrieve_categorical_columns_to_encode`.
        use_ordinal_encoding: Ordinal-encode the remaining categorical columns
            (unknown categories map to -1).

    Returns:
        An unfitted `Pipeline` with steps "column_transformer" and "model". Columns
        not claimed by an enabled transformer pass through unchanged, and the
        transformer emits pandas frames.
    """
    numeric_columns: list[str] = X.select_dtypes(include="number").columns.tolist()
    low_cardinality_columns, high_cardinality_columns = retrieve_categorical_columns_to_encode(X)

    # (enabled?, transformer factory, columns) — order matters for the output column order.
    candidates = (
        (
            use_one_hot_encoding,
            lambda: OneHotEncoder(dtype=np.float32, sparse_output=False, drop="first", handle_unknown="ignore"),
            low_cardinality_columns,
        ),
        (
            use_ordinal_encoding,
            lambda: OrdinalEncoder(dtype=np.float32, unknown_value=-1, handle_unknown="use_encoded_value"),
            high_cardinality_columns,
        ),
        (
            scale_features,
            lambda: RobustScaler(),
            numeric_columns,
        ),
    )
    selected = [(make_transformer(), columns) for enabled, make_transformer, columns in candidates if enabled]

    preprocessing = make_column_transformer(
        *selected,
        remainder="passthrough",
        n_jobs=-1,
        verbose=1,
        verbose_feature_names_out=False
    )
    preprocessing = preprocessing.set_output(transform="pandas")

    return Pipeline(steps=[("column_transformer", preprocessing), ("model", model)])

def predict_and_compute_threshold(model: Pipeline, X: pd.DataFrame, y: pd.Series | np.ndarray, sample_weight: np.ndarray) -> tuple[np.ndarray, np.ndarray, float]:
    """Score `X`, choose a decision threshold with `compute_threshold` and binarise the scores.

    The selected threshold is logged to MLflow under "threshold".

    Returns:
        `(positive-class scores, boolean predictions, threshold)`.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    best_threshold = compute_threshold(X=X, y=y, model=model, sample_weight=sample_weight)[0]
    hard_predictions = positive_proba > best_threshold
    mlflow.log_metric("threshold", best_threshold)
    return positive_proba, hard_predictions, best_threshold

def calibrate_model(model: Pipeline, X: pd.DataFrame, y: pd.Series | np.ndarray, sample_weight: np.ndarray) -> CalibratedClassifierCV:
    """Calibrate an already fitted model with isotonic regression.

    Before calibrating, the uncalibrated calibration curve is logged as
    "original_calibration_curve.png" and the uncalibrated best threshold as
    "threshold_uncalibrated".

    Args:
        model: Fitted model, wrapped with `cv="prefit"`.
        X: Features used both for the diagnostics and to fit the calibrator.
        y: Labels matching `X`.
        sample_weight: Per-sample weights for the threshold search and the calibrator fit.

    Returns:
        The fitted `CalibratedClassifierCV`.
    """
    uncalibrated_proba = model.predict_proba(X)[:, 1]
    curve = CalibrationDisplay.from_predictions(y_true=y, y_prob=uncalibrated_proba, n_bins=10, name="original calibration curve")
    mlflow.log_figure(curve.figure_, artifact_file="original_calibration_curve.png")

    uncalibrated_threshold = compute_threshold(X=X, y=y, model=model, sample_weight=sample_weight)[0]
    mlflow.log_metric("threshold_uncalibrated", uncalibrated_threshold)

    calibrator: CalibratedClassifierCV = CalibratedClassifierCV(model, cv="prefit", method="isotonic", n_jobs=-1)
    calibrator.fit(X, y, sample_weight=sample_weight)
    return calibrator

def train_logistic_regression_model(X: pd.DataFrame, y: pd.DataFrame) -> tuple[BaseEstimator, pd.DataFrame, dict[str, float]]:
    """Train and evaluate a logistic-regression pipeline inside a nested MLflow run.

    The data is split 80/20 (stratified); both partitions get "balanced" sample
    weights. Features are one-hot/ordinal encoded and scaled.

    Returns:
        `(fitted pipeline, SHAP feature importances, validation metrics)`.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
    weights_train = compute_sample_weight(class_weight="balanced", y=y_train)
    weights_valid = compute_sample_weight(class_weight="balanced", y=y_valid)

    estimator = LogisticRegression(random_state=SEED, verbose=0, n_jobs=-1, warm_start=False)
    pipeline = define_pipeline(
        model=estimator,
        X=X,
        scale_features=True,
        use_one_hot_encoding=True,
        use_ordinal_encoding=True,
    )

    with mlflow.start_run(nested=True, run_name="Full model", log_system_metrics=True):
        mlflow.log_param("columns", list(X.columns))
        pipeline.fit(X_train, y_train, model__sample_weight=weights_train)

        y_pred_proba, y_pred, _ = predict_and_compute_threshold(
            model=pipeline, X=X_valid, y=y_valid, sample_weight=weights_valid
        )

        # The explainer works on model inputs, i.e. the transformer's output.
        preprocessing = pipeline["column_transformer"]
        feature_importances: pd.DataFrame = compute_feature_importances(
            pipeline=pipeline,
            X_train=preprocessing.transform(X_train),
            X_valid=preprocessing.transform(X_valid),
        )

        metrics: dict[str, float] = store_metrics(
            y_valid=y_valid, y_pred=y_pred, y_pred_proba=y_pred_proba, valid_sample_weight=weights_valid
        )

    return pipeline, feature_importances, metrics

def train_random_random_forest(X: pd.DataFrame, y: pd.DataFrame) -> tuple[BaseEstimator, pd.DataFrame, dict[str, float]]:
    """Train, calibrate and evaluate a random-forest pipeline inside a nested MLflow run.

    The data is split 80/20 (stratified); both partitions get "balanced" sample
    weights. Features are one-hot/ordinal encoded without scaling. The fitted
    pipeline is calibrated on the validation split, and the calibrated model is
    the one used for threshold selection and metrics.

    Returns:
        `(calibrated classifier, forest feature importances, validation metrics)`.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
    weights_train = compute_sample_weight(class_weight="balanced", y=y_train)
    weights_valid = compute_sample_weight(class_weight="balanced", y=y_valid)

    forest = RandomForestClassifier(random_state=SEED, n_jobs=-1, verbose=0, warm_start=False)
    pipeline = define_pipeline(
        model=forest,
        X=X,
        scale_features=False,
        use_one_hot_encoding=True,
        use_ordinal_encoding=True,
    )

    with mlflow.start_run(nested=True, run_name="Full model", log_system_metrics=True):
        mlflow.log_param("columns", list(X.columns))
        pipeline.fit(X_train, y_train, model__sample_weight=weights_train)

        calibrated_classifier = calibrate_model(
            model=pipeline, X=X_valid, y=y_valid, sample_weight=weights_valid
        )

        y_pred_proba, y_pred, _ = predict_and_compute_threshold(
            model=calibrated_classifier, X=X_valid, y=y_valid, sample_weight=weights_valid
        )

        # Importances come from the underlying forest, not from the calibrated wrapper.
        fitted_forest = pipeline["model"]
        importance_table = pd.DataFrame(
            data={"column": fitted_forest.feature_names_in_, "importance": fitted_forest.feature_importances_}
        )
        feature_importances: pd.DataFrame = importance_table.sort_values(by="importance", ascending=False)
        feature_importances["importance"] = feature_importances["importance"].astype(np.float32)
        mlflow.log_table(feature_importances, "feat_importances.json")

        metrics: dict[str, float] = store_metrics(
            y_valid=y_valid, y_pred=y_pred, y_pred_proba=y_pred_proba, valid_sample_weight=weights_valid
        )

    return calibrated_classifier, feature_importances, metrics

def train_lightgbm(X: pd.DataFrame, y: pd.DataFrame) -> tuple[BaseEstimator, pd.DataFrame, dict[str, float]]:
    """Train and evaluate a LightGBM pipeline with early stopping inside a nested MLflow run.

    The data is split 80/20 (stratified); both partitions get "balanced" sample
    weights. Low-cardinality categoricals are one-hot encoded; nothing is scaled
    or ordinal-encoded.

    The preprocessing step and the model are fitted separately. `Pipeline.fit`
    hands `model__*` fit parameters to the final step as-is, so an `eval_set`
    passed that way would contain raw, un-encoded frames while the booster is
    trained on the transformer's output. Early stopping would then be evaluated
    on features that do not match the training features. Fitting the transformer
    first lets the evaluation sets be encoded exactly like the training data.

    Args:
        X: Feature frame.
        y: Binary target, also used for stratification.

    Returns:
        `(fitted pipeline, gain-based feature importances, validation metrics)`.
    """
    model = lgb.LGBMClassifier(
        random_state=SEED,
        n_jobs=-1,
        boosting_type='gbdt',
        num_leaves=128,
        max_depth=8,
        learning_rate=0.1,
        n_estimators=1_000,
        subsample_for_bin=200_000,
        objective="binary",
        class_weight=None,
        min_split_gain=0.0,
        min_child_weight=0.001,
        min_child_samples=20, # min_data_in_leaf
        subsample=1.,
        subsample_freq=0,
        colsample_bytree=1.,
        # reg_alpha=10.0,
        # reg_lambda=10.0,
        importance_type='gain',
        device="cpu",
        deterministic=True,
        verbose=-1,
        # extra_trees=True,
        # extra_seed=SEED
    )

    pipeline = define_pipeline(
        model=model,
        X=X,
        scale_features=False,
        use_one_hot_encoding=True,
        use_ordinal_encoding=False
    )

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
    train_sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)
    valid_sample_weights = compute_sample_weight(class_weight="balanced", y=y_valid)

    with mlflow.start_run(nested=True, run_name="Full model", log_system_metrics=True):
        mlflow.log_param("columns", list(X.columns))
        eval_results = {}

        # Fit the preprocessing on the training split only, then encode both splits
        # so the evaluation sets have the same columns the model is trained on.
        column_transformer = pipeline["column_transformer"]
        X_train_encoded = column_transformer.fit_transform(X_train, y_train)
        X_valid_encoded = column_transformer.transform(X_valid)

        # Both steps are fitted in place, so `pipeline` is ready for prediction afterwards.
        pipeline["model"].fit(
            X=X_train_encoded,
            y=y_train,
            sample_weight=train_sample_weights,
            eval_set=[(X_train_encoded, y_train), (X_valid_encoded, y_valid)],
            eval_metric=["binary_logloss", "average_precision", "auc"],
            eval_sample_weight=[train_sample_weights, valid_sample_weights],
            # eval_init_score=[train_init_score, valid_init_score],
            callbacks=[
                # lgb.log_evaluation(),
                lgb.record_evaluation(eval_results),
                lgb.early_stopping(stopping_rounds=100, first_metric_only=True),
            ],
        )

        y_pred_proba, y_pred, _ = predict_and_compute_threshold(
            model=pipeline,
            X=X_valid,
            y=y_valid,
            sample_weight=valid_sample_weights
        )

        feature_importances: pd.DataFrame = pd.DataFrame(data={"column": pipeline["model"].feature_names_in_, "importance": pipeline["model"].feature_importances_}).sort_values(by="importance", ascending=False)
        feature_importances["importance"] = feature_importances["importance"].astype(np.float32)
        mlflow.log_table(feature_importances, "feat_importances.json")

        metrics: dict[str, float] = store_metrics(y_valid=y_valid, y_pred=y_pred, y_pred_proba=y_pred_proba, valid_sample_weight=valid_sample_weights)
        return pipeline, feature_importances, metrics

def train_iterative_model(X: pd.DataFrame, y: np.ndarray, train_model_fn: Callable, importance_threshold: float = 1e-4):
    """Retrain repeatedly, dropping low-importance columns until none are left to drop.

    Everything runs inside one top-level MLflow run. After each training round,
    every column of `X` whose reported importance is at or below
    `importance_threshold` is removed for the next round. Only names that are
    actual columns of `X` can be dropped; derived feature names reported by the
    model are ignored.

    Args:
        X: Full feature frame.
        y: Target passed unchanged to `train_model_fn`.
        train_model_fn: Callable invoked as `train_model_fn(X=..., y=...)` and
            returning `(model, feature_importances, metrics)`, where
            `feature_importances` has "column" and "importance" columns.
        importance_threshold: Importance at or below which a column is dropped.
            Defaults to the previously hard-coded 1e-4.

    Returns:
        The `(model, feature_importances, metrics)` triple of the last training
        round. Previously these results were discarded and nothing was returned.
    """
    cols_to_drop: list[str] = []

    with mlflow.start_run(nested=False):
        while True:
            model, feature_importances, metrics = train_model_fn(X=X.drop(columns=cols_to_drop), y=y)

            low_importance = set(
                feature_importances.loc[feature_importances["importance"] <= importance_threshold, "column"].tolist()
            )
            # Walk X's own columns: keeps the drop order deterministic and skips
            # names already scheduled for removal, so the loop always terminates.
            new_cols_to_drop = [
                col for col in X.columns
                if col in low_importance and col not in cols_to_drop
            ]

            if not new_cols_to_drop:
                print("no new columns to drop")
                break

            cols_to_drop.extend(new_cols_to_drop)

    return model, feature_importances, metrics

In [8]:
from fraud_detection.preprocessing.identities import load_and_preprocess_identities

# Bind the result of the project helper to `identities`; it is annotated as a lazy
# polars frame. NOTE(review): the recorded run of this cell ended in
# KeyboardInterrupt, so the helper presumably does eager work — confirm.
identities: pl.LazyFrame = load_and_preprocess_identities()

KeyboardInterrupt: 

In [7]:
# Seed the numpy and stdlib RNGs before preparing the data.
set_seed(SEED)
# Name of the label column that is separated from the features below.
TARGET_COLUMN: str = "isFraud"
# NOTE(review): the recorded output of this cell is `ColumnNotFoundError: id_33`,
# reported after an `UNNEST by:[id_33]` step of the plan; the cause is likely inside
# the preprocessing module rather than in this cell — confirm there.
data: pl.LazyFrame = preprocess_data_for_training()
# Features: every column except the target. Target: the target column only.
# NOTE(review): both are annotated as polars LazyFrames, while train_iterative_model
# declares a pandas DataFrame and a numpy array — presumably they are collected and
# converted before training; verify.
X: pl.LazyFrame = data.drop(TARGET_COLUMN)
y: pl.LazyFrame = data.select(TARGET_COLUMN)

ColumnNotFoundError: id_33

Error originated just after this operation:
 WITH_COLUMNS:
 [col("TransactionID").fill_null([col("TransactionID").median()]).shrink_dtype().alias("TransactionID"), col("id_01").fill_null([col("id_01").median()]).shrink_dtype().alias("id_01"), col("id_02").fill_null([col("id_02").median()]).shrink_dtype().alias("id_02"), col("id_03").fill_null([col("id_03").median()]).shrink_dtype().alias("id_03"), col("id_04").fill_null([col("id_04").median()]).shrink_dtype().alias("id_04"), col("id_05").fill_null([col("id_05").median()]).shrink_dtype().alias("id_05"), col("id_06").fill_null([col("id_06").median()]).shrink_dtype().alias("id_06"), col("id_07").fill_null([col("id_07").median()]).shrink_dtype().alias("id_07"), col("id_08").fill_null([col("id_08").median()]).shrink_dtype().alias("id_08"), col("id_09").fill_null([col("id_09").median()]).shrink_dtype().alias("id_09"), col("id_10").fill_null([col("id_10").median()]).shrink_dtype().alias("id_10"), col("id_11").fill_null([col("id_11").median()]).shrink_dtype().alias("id_11"), col("id_13").fill_null([col("id_13").median()]).shrink_dtype().alias("id_13"), col("id_14").fill_null([col("id_14").median()]).shrink_dtype().alias("id_14"), col("id_17").fill_null([col("id_17").median()]).shrink_dtype().alias("id_17"), col("id_18").fill_null([col("id_18").median()]).shrink_dtype().alias("id_18"), col("id_19").fill_null([col("id_19").median()]).shrink_dtype().alias("id_19"), col("id_20").fill_null([col("id_20").median()]).shrink_dtype().alias("id_20"), col("id_21").fill_null([col("id_21").median()]).shrink_dtype().alias("id_21"), col("id_22").fill_null([col("id_22").median()]).shrink_dtype().alias("id_22"), col("id_24").fill_null([col("id_24").median()]).shrink_dtype().alias("id_24"), col("id_25").fill_null([col("id_25").median()]).shrink_dtype().alias("id_25"), col("id_26").fill_null([col("id_26").median()]).shrink_dtype().alias("id_26"), col("id_32").fill_null([col("id_32").median()]).shrink_dtype().alias("id_32"), 
col("width").fill_null([col("width").median()]).shrink_dtype().alias("width"), col("height").fill_null([col("height").median()]).shrink_dtype().alias("height")]
   SELECT [col("TransactionID"), col("id_01"), col("id_02"), col("id_03"), col("id_04"), col("id_05"), col("id_06"), col("id_07"), col("id_08"), col("id_09"), col("id_10"), col("id_11"), col("id_12"), col("id_13"), col("id_14"), col("id_15"), col("id_16"), col("id_17"), col("id_18"), col("id_19"), col("id_20"), col("id_21"), col("id_22"), col("id_23"), col("id_24"), col("id_25"), col("id_26"), col("id_27"), col("id_28"), col("id_29"), col("id_30"), col("id_31"), col("id_32"), col("width"), col("height"), col("id_34"), col("id_35"), col("id_36"), col("id_37"), col("id_38"), col("DeviceType")] FROM

      Parquet SCAN data/train_identity_processed.parquet
      PROJECT */41 COLUMNS

LogicalPlan had already failed with the above error; after failure, 3 additional operations were attempted on the LazyFrame

Error originated just after this operation:
UNNEST by:[id_33]
ErrorState { n_times: 4, err: ColumnNotFound(ErrString("id_33\n\nError originated just after this operation:\n WITH_COLUMNS:\n [col(\"TransactionID\").fill_null([col(\"TransactionID\").median()]).shrink_dtype().alias(\"TransactionID\"), col(\"id_01\").fill_null([col(\"id_01\").median()]).shrink_dtype().alias(\"id_01\"), col(\"id_02\").fill_null([col(\"id_02\").median()]).shrink_dtype().alias(\"id_02\"), col(\"id_03\").fill_null([col(\"id_03\").median()]).shrink_dtype().alias(\"id_03\"), col(\"id_04\").fill_null([col(\"id_04\").median()]).shrink_dtype().alias(\"id_04\"), col(\"id_05\").fill_null([col(\"id_05\").median()]).shrink_dtype().alias(\"id_05\"), col(\"id_06\").fill_null([col(\"id_06\").median()]).shrink_dtype().alias(\"id_06\"), col(\"id_07\").fill_null([col(\"id_07\").median()]).shrink_dtype().alias(\"id_07\"), col(\"id_08\").fill_null([col(\"id_08\").median()]).shrink_dtype().alias(\"id_08\"), col(\"id_09\").fill_null([col(\"id_09\").median()]).shrink_dtype().alias(\"id_09\"), col(\"id_10\").fill_null([col(\"id_10\").median()]).shrink_dtype().alias(\"id_10\"), col(\"id_11\").fill_null([col(\"id_11\").median()]).shrink_dtype().alias(\"id_11\"), col(\"id_13\").fill_null([col(\"id_13\").median()]).shrink_dtype().alias(\"id_13\"), col(\"id_14\").fill_null([col(\"id_14\").median()]).shrink_dtype().alias(\"id_14\"), col(\"id_17\").fill_null([col(\"id_17\").median()]).shrink_dtype().alias(\"id_17\"), col(\"id_18\").fill_null([col(\"id_18\").median()]).shrink_dtype().alias(\"id_18\"), col(\"id_19\").fill_null([col(\"id_19\").median()]).shrink_dtype().alias(\"id_19\"), col(\"id_20\").fill_null([col(\"id_20\").median()]).shrink_dtype().alias(\"id_20\"), col(\"id_21\").fill_null([col(\"id_21\").median()]).shrink_dtype().alias(\"id_21\"), col(\"id_22\").fill_null([col(\"id_22\").median()]).shrink_dtype().alias(\"id_22\"), col(\"id_24\").fill_null([col(\"id_24\").median()]).shrink_dtype().alias(\"id_24\"), 
col(\"id_25\").fill_null([col(\"id_25\").median()]).shrink_dtype().alias(\"id_25\"), col(\"id_26\").fill_null([col(\"id_26\").median()]).shrink_dtype().alias(\"id_26\"), col(\"id_32\").fill_null([col(\"id_32\").median()]).shrink_dtype().alias(\"id_32\"), col(\"width\").fill_null([col(\"width\").median()]).shrink_dtype().alias(\"width\"), col(\"height\").fill_null([col(\"height\").median()]).shrink_dtype().alias(\"height\")]\n   SELECT [col(\"TransactionID\"), col(\"id_01\"), col(\"id_02\"), col(\"id_03\"), col(\"id_04\"), col(\"id_05\"), col(\"id_06\"), col(\"id_07\"), col(\"id_08\"), col(\"id_09\"), col(\"id_10\"), col(\"id_11\"), col(\"id_12\"), col(\"id_13\"), col(\"id_14\"), col(\"id_15\"), col(\"id_16\"), col(\"id_17\"), col(\"id_18\"), col(\"id_19\"), col(\"id_20\"), col(\"id_21\"), col(\"id_22\"), col(\"id_23\"), col(\"id_24\"), col(\"id_25\"), col(\"id_26\"), col(\"id_27\"), col(\"id_28\"), col(\"id_29\"), col(\"id_30\"), col(\"id_31\"), col(\"id_32\"), col(\"width\"), col(\"height\"), col(\"id_34\"), col(\"id_35\"), col(\"id_36\"), col(\"id_37\"), col(\"id_38\"), col(\"DeviceType\")] FROM\n\n      Parquet SCAN data/train_identity_processed.parquet\n      PROJECT */41 COLUMNS")) }

Error originated just after this operation:
 WITH_COLUMNS:
 [col("TransactionID").strict_cast(Int64)]
   WITH_COLUMNS:
   [col("M1").fill_null([String(unknown)]), col("M2").fill_null([String(unknown)]), col("M3").fill_null([String(unknown)]), col("M4").fill_null([String(unknown)]), col("M5").fill_null([String(unknown)]), col("M6").fill_null([String(unknown)]), col("M7").fill_null([String(unknown)]), col("M8").fill_null([String(unknown)]), col("M9").fill_null([String(unknown)]), col("card4").fill_null([String(unknown)]), col("card6").fill_null([String(unknown)]), col("ProductCD").fill_null([String(unknown)]), col("R_emaildomain").str.split([String(.)]).list.get([0]).fill_null([String(unknown)]), col("P_emaildomain").str.split([String(.)]).list.get([0]).fill_null([String(unknown)])]
     WITH_COLUMNS:
     [col("TransactionID").fill_null([col("TransactionID").median()]).shrink_dtype().alias("TransactionID"), col("isFraud").fill_null([col("isFraud").median()]).shrink_dtype().alias("isFraud"), col("TransactionDT").fill_null([col("TransactionDT").median()]).shrink_dtype().alias("TransactionDT"), col("TransactionAmt").fill_null([col("TransactionAmt").median()]).shrink_dtype().alias("TransactionAmt"), col("card1").fill_null([col("card1").median()]).shrink_dtype().alias("card1"), col("card2").fill_null([col("card2").median()]).shrink_dtype().alias("card2"), col("card3").fill_null([col("card3").median()]).shrink_dtype().alias("card3"), col("card5").fill_null([col("card5").median()]).shrink_dtype().alias("card5"), col("addr1").fill_null([col("addr1").median()]).shrink_dtype().alias("addr1"), col("addr2").fill_null([col("addr2").median()]).shrink_dtype().alias("addr2"), col("dist1").fill_null([col("dist1").median()]).shrink_dtype().alias("dist1"), col("dist2").fill_null([col("dist2").median()]).shrink_dtype().alias("dist2"), col("C1").fill_null([col("C1").median()]).shrink_dtype().alias("C1"), col("C2").fill_null([col("C2").median()]).shrink_dtype().alias("C2"), col("C3").fill_null([col("C3").median()]).shrink_dtype().alias("C3"), col("C4").fill_null([col("C4").median()]).shrink_dtype().alias("C4"), col("C5").fill_null([col("C5").median()]).shrink_dtype().alias("C5"), col("C6").fill_null([col("C6").median()]).shrink_dtype().alias("C6"), col("C7").fill_null([col("C7").median()]).shrink_dtype().alias("C7"), col("C8").fill_null([col("C8").median()]).shrink_dtype().alias("C8"), col("C9").fill_null([col("C9").median()]).shrink_dtype().alias("C9"), col("C10").fill_null([col("C10").median()]).shrink_dtype().alias("C10"), col("C11").fill_null([col("C11").median()]).shrink_dtype().alias("C11"), col("C12").fill_null([col("C12").median()]).shrink_dtype().alias("C12"), col("C13").fill_null([col("C13").median()]).shrink_dtype().alias("C13"), 
col("C14").fill_null([col("C14").median()]).shrink_dtype().alias("C14"), col("D1").fill_null([col("D1").median()]).shrink_dtype().alias("D1"), col("D2").fill_null([col("D2").median()]).shrink_dtype().alias("D2"), col("D3").fill_null([col("D3").median()]).shrink_dtype().alias("D3"), col("D4").fill_null([col("D4").median()]).shrink_dtype().alias("D4"), col("D5").fill_null([col("D5").median()]).shrink_dtype().alias("D5"), col("D6").fill_null([col("D6").median()]).shrink_dtype().alias("D6"), col("D7").fill_null([col("D7").median()]).shrink_dtype().alias("D7"), col("D8").fill_null([col("D8").median()]).shrink_dtype().alias("D8"), col("D9").fill_null([col("D9").median()]).shrink_dtype().alias("D9"), col("D10").fill_null([col("D10").median()]).shrink_dtype().alias("D10"), col("D11").fill_null([col("D11").median()]).shrink_dtype().alias("D11"), col("D12").fill_null([col("D12").median()]).shrink_dtype().alias("D12"), col("D13").fill_null([col("D13").median()]).shrink_dtype().alias("D13"), col("D14").fill_null([col("D14").median()]).shrink_dtype().alias("D14"), col("D15").fill_null([col("D15").median()]).shrink_dtype().alias("D15"), col("V1").fill_null([col("V1").median()]).shrink_dtype().alias("V1"), col("V2").fill_null([col("V2").median()]).shrink_dtype().alias("V2"), col("V3").fill_null([col("V3").median()]).shrink_dtype().alias("V3"), col("V4").fill_null([col("V4").median()]).shrink_dtype().alias("V4"), col("V5").fill_null([col("V5").median()]).shrink_dtype().alias("V5"), col("V6").fill_null([col("V6").median()]).shrink_dtype().alias("V6"), col("V7").fill_null([col("V7").median()]).shrink_dtype().alias("V7"), col("V8").fill_null([col("V8").median()]).shrink_dtype().alias("V8"), col("V9").fill_null([col("V9").median()]).shrink_dtype().alias("V9"), col("V10").fill_null([col("V10").median()]).shrink_dtype().alias("V10"), col("V11").fill_null([col("V11").median()]).shrink_dtype().alias("V11"), col("V12").fill_null([col("V12").median()]).shrink_dtype().alias("V12"), 
col("V13").fill_null([col("V13").median()]).shrink_dtype().alias("V13"), col("V14").fill_null([col("V14").median()]).shrink_dtype().alias("V14"), col("V15").fill_null([col("V15").median()]).shrink_dtype().alias("V15"), col("V16").fill_null([col("V16").median()]).shrink_dtype().alias("V16"), col("V17").fill_null([col("V17").median()]).shrink_dtype().alias("V17"), col("V18").fill_null([col("V18").median()]).shrink_dtype().alias("V18"), col("V19").fill_null([col("V19").median()]).shrink_dtype().alias("V19"), col("V20").fill_null([col("V20").median()]).shrink_dtype().alias("V20"), col("V21").fill_null([col("V21").median()]).shrink_dtype().alias("V21"), col("V22").fill_null([col("V22").median()]).shrink_dtype().alias("V22"), col("V23").fill_null([col("V23").median()]).shrink_dtype().alias("V23"), col("V24").fill_null([col("V24").median()]).shrink_dtype().alias("V24"), col("V25").fill_null([col("V25").median()]).shrink_dtype().alias("V25"), col("V26").fill_null([col("V26").median()]).shrink_dtype().alias("V26"), col("V27").fill_null([col("V27").median()]).shrink_dtype().alias("V27"), col("V28").fill_null([col("V28").median()]).shrink_dtype().alias("V28"), col("V29").fill_null([col("V29").median()]).shrink_dtype().alias("V29"), col("V30").fill_null([col("V30").median()]).shrink_dtype().alias("V30"), col("V31").fill_null([col("V31").median()]).shrink_dtype().alias("V31"), col("V32").fill_null([col("V32").median()]).shrink_dtype().alias("V32"), col("V33").fill_null([col("V33").median()]).shrink_dtype().alias("V33"), col("V34").fill_null([col("V34").median()]).shrink_dtype().alias("V34"), col("V35").fill_null([col("V35").median()]).shrink_dtype().alias("V35"), col("V36").fill_null([col("V36").median()]).shrink_dtype().alias("V36"), col("V37").fill_null([col("V37").median()]).shrink_dtype().alias("V37"), col("V38").fill_null([col("V38").median()]).shrink_dtype().alias("V38"), col("V39").fill_null([col("V39").median()]).shrink_dtype().alias("V39"), 
col("V40").fill_null([col("V40").median()]).shrink_dtype().alias("V40"), col("V41").fill_null([col("V41").median()]).shrink_dtype().alias("V41"), col("V42").fill_null([col("V42").median()]).shrink_dtype().alias("V42"), col("V43").fill_null([col("V43").median()]).shrink_dtype().alias("V43"), col("V44").fill_null([col("V44").median()]).shrink_dtype().alias("V44"), col("V45").fill_null([col("V45").median()]).shrink_dtype().alias("V45"), col("V46").fill_null([col("V46").median()]).shrink_dtype().alias("V46"), col("V47").fill_null([col("V47").median()]).shrink_dtype().alias("V47"), col("V48").fill_null([col("V48").median()]).shrink_dtype().alias("V48"), col("V49").fill_null([col("V49").median()]).shrink_dtype().alias("V49"), col("V50").fill_null([col("V50").median()]).shrink_dtype().alias("V50"), col("V51").fill_null([col("V51").median()]).shrink_dtype().alias("V51"), col("V52").fill_null([col("V52").median()]).shrink_dtype().alias("V52"), col("V53").fill_null([col("V53").median()]).shrink_dtype().alias("V53"), col("V54").fill_null([col("V54").median()]).shrink_dtype().alias("V54"), col("V55").fill_null([col("V55").median()]).shrink_dtype().alias("V55"), col("V56").fill_null([col("V56").median()]).shrink_dtype().alias("V56"), col("V57").fill_null([col("V57").median()]).shrink_dtype().alias("V57"), col("V58").fill_null([col("V58").median()]).shrink_dtype().alias("V58"), col("V59").fill_null([col("V59").median()]).shrink_dtype().alias("V59"), col("V60").fill_null([col("V60").median()]).shrink_dtype().alias("V60"), col("V61").fill_null([col("V61").median()]).shrink_dtype().alias("V61"), col("V62").fill_null([col("V62").median()]).shrink_dtype().alias("V62"), col("V63").fill_null([col("V63").median()]).shrink_dtype().alias("V63"), col("V64").fill_null([col("V64").median()]).shrink_dtype().alias("V64"), col("V65").fill_null([col("V65").median()]).shrink_dtype().alias("V65"), col("V66").fill_null([col("V66").median()]).shrink_dtype().alias("V66"), 
col("V67").fill_null([col("V67").median()]).shrink_dtype().alias("V67"), col("V68").fill_null([col("V68").median()]).shrink_dtype().alias("V68"), col("V69").fill_null([col("V69").median()]).shrink_dtype().alias("V69"), col("V70").fill_null([col("V70").median()]).shrink_dtype().alias("V70"), col("V71").fill_null([col("V71").median()]).shrink_dtype().alias("V71"), col("V72").fill_null([col("V72").median()]).shrink_dtype().alias("V72"), col("V73").fill_null([col("V73").median()]).shrink_dtype().alias("V73"), col("V74").fill_null([col("V74").median()]).shrink_dtype().alias("V74"), col("V75").fill_null([col("V75").median()]).shrink_dtype().alias("V75"), col("V76").fill_null([col("V76").median()]).shrink_dtype().alias("V76"), col("V77").fill_null([col("V77").median()]).shrink_dtype().alias("V77"), col("V78").fill_null([col("V78").median()]).shrink_dtype().alias("V78"), col("V79").fill_null([col("V79").median()]).shrink_dtype().alias("V79"), col("V80").fill_null([col("V80").median()]).shrink_dtype().alias("V80"), col("V81").fill_null([col("V81").median()]).shrink_dtype().alias("V81"), col("V82").fill_null([col("V82").median()]).shrink_dtype().alias("V82"), col("V83").fill_null([col("V83").median()]).shrink_dtype().alias("V83"), col("V84").fill_null([col("V84").median()]).shrink_dtype().alias("V84"), col("V85").fill_null([col("V85").median()]).shrink_dtype().alias("V85"), col("V86").fill_null([col("V86").median()]).shrink_dtype().alias("V86"), col("V87").fill_null([col("V87").median()]).shrink_dtype().alias("V87"), col("V88").fill_null([col("V88").median()]).shrink_dtype().alias("V88"), col("V89").fill_null([col("V89").median()]).shrink_dtype().alias("V89"), col("V90").fill_null([col("V90").median()]).shrink_dtype().alias("V90"), col("V91").fill_null([col("V91").median()]).shrink_dtype().alias("V91"), col("V92").fill_null([col("V92").median()]).shrink_dtype().alias("V92"), col("V93").fill_null([col("V93").median()]).shrink_dtype().alias("V93"), 
col("V94").fill_null([col("V94").median()]).shrink_dtype().alias("V94"), col("V95").fill_null([col("V95").median()]).shrink_dtype().alias("V95"), col("V96").fill_null([col("V96").median()]).shrink_dtype().alias("V96"), col("V97").fill_null([col("V97").median()]).shrink_dtype().alias("V97"), col("V98").fill_null([col("V98").median()]).shrink_dtype().alias("V98"), col("V99").fill_null([col("V99").median()]).shrink_dtype().alias("V99"), col("V100").fill_null([col("V100").median()]).shrink_dtype().alias("V100"), col("V101").fill_null([col("V101").median()]).shrink_dtype().alias("V101"), col("V102").fill_null([col("V102").median()]).shrink_dtype().alias("V102"), col("V103").fill_null([col("V103").median()]).shrink_dtype().alias("V103"), col("V104").fill_null([col("V104").median()]).shrink_dtype().alias("V104"), col("V105").fill_null([col("V105").median()]).shrink_dtype().alias("V105"), col("V106").fill_null([col("V106").median()]).shrink_dtype().alias("V106"), col("V107").fill_null([col("V107").median()]).shrink_dtype().alias("V107"), col("V108").fill_null([col("V108").median()]).shrink_dtype().alias("V108"), col("V109").fill_null([col("V109").median()]).shrink_dtype().alias("V109"), col("V110").fill_null([col("V110").median()]).shrink_dtype().alias("V110"), col("V111").fill_null([col("V111").median()]).shrink_dtype().alias("V111"), col("V112").fill_null([col("V112").median()]).shrink_dtype().alias("V112"), col("V113").fill_null([col("V113").median()]).shrink_dtype().alias("V113"), col("V114").fill_null([col("V114").median()]).shrink_dtype().alias("V114"), col("V115").fill_null([col("V115").median()]).shrink_dtype().alias("V115"), col("V116").fill_null([col("V116").median()]).shrink_dtype().alias("V116"), col("V117").fill_null([col("V117").median()]).shrink_dtype().alias("V117"), col("V118").fill_null([col("V118").median()]).shrink_dtype().alias("V118"), col("V119").fill_null([col("V119").median()]).shrink_dtype().alias("V119"), 
col("V120").fill_null([col("V120").median()]).shrink_dtype().alias("V120"), col("V121").fill_null([col("V121").median()]).shrink_dtype().alias("V121"), col("V122").fill_null([col("V122").median()]).shrink_dtype().alias("V122"), col("V123").fill_null([col("V123").median()]).shrink_dtype().alias("V123"), col("V124").fill_null([col("V124").median()]).shrink_dtype().alias("V124"), col("V125").fill_null([col("V125").median()]).shrink_dtype().alias("V125"), col("V126").fill_null([col("V126").median()]).shrink_dtype().alias("V126"), col("V127").fill_null([col("V127").median()]).shrink_dtype().alias("V127"), col("V128").fill_null([col("V128").median()]).shrink_dtype().alias("V128"), col("V129").fill_null([col("V129").median()]).shrink_dtype().alias("V129"), col("V130").fill_null([col("V130").median()]).shrink_dtype().alias("V130"), col("V131").fill_null([col("V131").median()]).shrink_dtype().alias("V131"), col("V132").fill_null([col("V132").median()]).shrink_dtype().alias("V132"), col("V133").fill_null([col("V133").median()]).shrink_dtype().alias("V133"), col("V134").fill_null([col("V134").median()]).shrink_dtype().alias("V134"), col("V135").fill_null([col("V135").median()]).shrink_dtype().alias("V135"), col("V136").fill_null([col("V136").median()]).shrink_dtype().alias("V136"), col("V137").fill_null([col("V137").median()]).shrink_dtype().alias("V137"), col("V138").fill_null([col("V138").median()]).shrink_dtype().alias("V138"), col("V139").fill_null([col("V139").median()]).shrink_dtype().alias("V139"), col("V140").fill_null([col("V140").median()]).shrink_dtype().alias("V140"), col("V141").fill_null([col("V141").median()]).shrink_dtype().alias("V141"), col("V142").fill_null([col("V142").median()]).shrink_dtype().alias("V142"), col("V143").fill_null([col("V143").median()]).shrink_dtype().alias("V143"), col("V144").fill_null([col("V144").median()]).shrink_dtype().alias("V144"), col("V145").fill_null([col("V145").median()]).shrink_dtype().alias("V145"), 
col("V146").fill_null([col("V146").median()]).shrink_dtype().alias("V146"), col("V147").fill_null([col("V147").median()]).shrink_dtype().alias("V147"), col("V148").fill_null([col("V148").median()]).shrink_dtype().alias("V148"), col("V149").fill_null([col("V149").median()]).shrink_dtype().alias("V149"), col("V150").fill_null([col("V150").median()]).shrink_dtype().alias("V150"), col("V151").fill_null([col("V151").median()]).shrink_dtype().alias("V151"), col("V152").fill_null([col("V152").median()]).shrink_dtype().alias("V152"), col("V153").fill_null([col("V153").median()]).shrink_dtype().alias("V153"), col("V154").fill_null([col("V154").median()]).shrink_dtype().alias("V154"), col("V155").fill_null([col("V155").median()]).shrink_dtype().alias("V155"), col("V156").fill_null([col("V156").median()]).shrink_dtype().alias("V156"), col("V157").fill_null([col("V157").median()]).shrink_dtype().alias("V157"), col("V158").fill_null([col("V158").median()]).shrink_dtype().alias("V158"), col("V159").fill_null([col("V159").median()]).shrink_dtype().alias("V159"), col("V160").fill_null([col("V160").median()]).shrink_dtype().alias("V160"), col("V161").fill_null([col("V161").median()]).shrink_dtype().alias("V161"), col("V162").fill_null([col("V162").median()]).shrink_dtype().alias("V162"), col("V163").fill_null([col("V163").median()]).shrink_dtype().alias("V163"), col("V164").fill_null([col("V164").median()]).shrink_dtype().alias("V164"), col("V165").fill_null([col("V165").median()]).shrink_dtype().alias("V165"), col("V166").fill_null([col("V166").median()]).shrink_dtype().alias("V166"), col("V167").fill_null([col("V167").median()]).shrink_dtype().alias("V167"), col("V168").fill_null([col("V168").median()]).shrink_dtype().alias("V168"), col("V169").fill_null([col("V169").median()]).shrink_dtype().alias("V169"), col("V170").fill_null([col("V170").median()]).shrink_dtype().alias("V170"), col("V171").fill_null([col("V171").median()]).shrink_dtype().alias("V171"), 
col("V172").fill_null([col("V172").median()]).shrink_dtype().alias("V172"), col("V173").fill_null([col("V173").median()]).shrink_dtype().alias("V173"), col("V174").fill_null([col("V174").median()]).shrink_dtype().alias("V174"), col("V175").fill_null([col("V175").median()]).shrink_dtype().alias("V175"), col("V176").fill_null([col("V176").median()]).shrink_dtype().alias("V176"), col("V177").fill_null([col("V177").median()]).shrink_dtype().alias("V177"), col("V178").fill_null([col("V178").median()]).shrink_dtype().alias("V178"), col("V179").fill_null([col("V179").median()]).shrink_dtype().alias("V179"), col("V180").fill_null([col("V180").median()]).shrink_dtype().alias("V180"), col("V181").fill_null([col("V181").median()]).shrink_dtype().alias("V181"), col("V182").fill_null([col("V182").median()]).shrink_dtype().alias("V182"), col("V183").fill_null([col("V183").median()]).shrink_dtype().alias("V183"), col("V184").fill_null([col("V184").median()]).shrink_dtype().alias("V184"), col("V185").fill_null([col("V185").median()]).shrink_dtype().alias("V185"), col("V186").fill_null([col("V186").median()]).shrink_dtype().alias("V186"), col("V187").fill_null([col("V187").median()]).shrink_dtype().alias("V187"), col("V188").fill_null([col("V188").median()]).shrink_dtype().alias("V188"), col("V189").fill_null([col("V189").median()]).shrink_dtype().alias("V189"), col("V190").fill_null([col("V190").median()]).shrink_dtype().alias("V190"), col("V191").fill_null([col("V191").median()]).shrink_dtype().alias("V191"), col("V192").fill_null([col("V192").median()]).shrink_dtype().alias("V192"), col("V193").fill_null([col("V193").median()]).shrink_dtype().alias("V193"), col("V194").fill_null([col("V194").median()]).shrink_dtype().alias("V194"), col("V195").fill_null([col("V195").median()]).shrink_dtype().alias("V195"), col("V196").fill_null([col("V196").median()]).shrink_dtype().alias("V196"), col("V197").fill_null([col("V197").median()]).shrink_dtype().alias("V197"), 
col("V198").fill_null([col("V198").median()]).shrink_dtype().alias("V198"), col("V199").fill_null([col("V199").median()]).shrink_dtype().alias("V199"), col("V200").fill_null([col("V200").median()]).shrink_dtype().alias("V200"), col("V201").fill_null([col("V201").median()]).shrink_dtype().alias("V201"), col("V202").fill_null([col("V202").median()]).shrink_dtype().alias("V202"), col("V203").fill_null([col("V203").median()]).shrink_dtype().alias("V203"), col("V204").fill_null([col("V204").median()]).shrink_dtype().alias("V204"), col("V205").fill_null([col("V205").median()]).shrink_dtype().alias("V205"), col("V206").fill_null([col("V206").median()]).shrink_dtype().alias("V206"), col("V207").fill_null([col("V207").median()]).shrink_dtype().alias("V207"), col("V208").fill_null([col("V208").median()]).shrink_dtype().alias("V208"), col("V209").fill_null([col("V209").median()]).shrink_dtype().alias("V209"), col("V210").fill_null([col("V210").median()]).shrink_dtype().alias("V210"), col("V211").fill_null([col("V211").median()]).shrink_dtype().alias("V211"), col("V212").fill_null([col("V212").median()]).shrink_dtype().alias("V212"), col("V213").fill_null([col("V213").median()]).shrink_dtype().alias("V213"), col("V214").fill_null([col("V214").median()]).shrink_dtype().alias("V214"), col("V215").fill_null([col("V215").median()]).shrink_dtype().alias("V215"), col("V216").fill_null([col("V216").median()]).shrink_dtype().alias("V216"), col("V217").fill_null([col("V217").median()]).shrink_dtype().alias("V217"), col("V218").fill_null([col("V218").median()]).shrink_dtype().alias("V218"), col("V219").fill_null([col("V219").median()]).shrink_dtype().alias("V219"), col("V220").fill_null([col("V220").median()]).shrink_dtype().alias("V220"), col("V221").fill_null([col("V221").median()]).shrink_dtype().alias("V221"), col("V222").fill_null([col("V222").median()]).shrink_dtype().alias("V222"), col("V223").fill_null([col("V223").median()]).shrink_dtype().alias("V223"), 
col("V224").fill_null([col("V224").median()]).shrink_dtype().alias("V224"), col("V225").fill_null([col("V225").median()]).shrink_dtype().alias("V225"), col("V226").fill_null([col("V226").median()]).shrink_dtype().alias("V226"), col("V227").fill_null([col("V227").median()]).shrink_dtype().alias("V227"), col("V228").fill_null([col("V228").median()]).shrink_dtype().alias("V228"), col("V229").fill_null([col("V229").median()]).shrink_dtype().alias("V229"), col("V230").fill_null([col("V230").median()]).shrink_dtype().alias("V230"), col("V231").fill_null([col("V231").median()]).shrink_dtype().alias("V231"), col("V232").fill_null([col("V232").median()]).shrink_dtype().alias("V232"), col("V233").fill_null([col("V233").median()]).shrink_dtype().alias("V233"), col("V234").fill_null([col("V234").median()]).shrink_dtype().alias("V234"), col("V235").fill_null([col("V235").median()]).shrink_dtype().alias("V235"), col("V236").fill_null([col("V236").median()]).shrink_dtype().alias("V236"), col("V237").fill_null([col("V237").median()]).shrink_dtype().alias("V237"), col("V238").fill_null([col("V238").median()]).shrink_dtype().alias("V238"), col("V239").fill_null([col("V239").median()]).shrink_dtype().alias("V239"), col("V240").fill_null([col("V240").median()]).shrink_dtype().alias("V240"), col("V241").fill_null([col("V241").median()]).shrink_dtype().alias("V241"), col("V242").fill_null([col("V242").median()]).shrink_dtype().alias("V242"), col("V243").fill_null([col("V243").median()]).shrink_dtype().alias("V243"), col("V244").fill_null([col("V244").median()]).shrink_dtype().alias("V244"), col("V245").fill_null([col("V245").median()]).shrink_dtype().alias("V245"), col("V246").fill_null([col("V246").median()]).shrink_dtype().alias("V246"), col("V247").fill_null([col("V247").median()]).shrink_dtype().alias("V247"), col("V248").fill_null([col("V248").median()]).shrink_dtype().alias("V248"), col("V249").fill_null([col("V249").median()]).shrink_dtype().alias("V249"), 
col("V250").fill_null([col("V250").median()]).shrink_dtype().alias("V250"), col("V251").fill_null([col("V251").median()]).shrink_dtype().alias("V251"), col("V252").fill_null([col("V252").median()]).shrink_dtype().alias("V252"), col("V253").fill_null([col("V253").median()]).shrink_dtype().alias("V253"), col("V254").fill_null([col("V254").median()]).shrink_dtype().alias("V254"), col("V255").fill_null([col("V255").median()]).shrink_dtype().alias("V255"), col("V256").fill_null([col("V256").median()]).shrink_dtype().alias("V256"), col("V257").fill_null([col("V257").median()]).shrink_dtype().alias("V257"), col("V258").fill_null([col("V258").median()]).shrink_dtype().alias("V258"), col("V259").fill_null([col("V259").median()]).shrink_dtype().alias("V259"), col("V260").fill_null([col("V260").median()]).shrink_dtype().alias("V260"), col("V261").fill_null([col("V261").median()]).shrink_dtype().alias("V261"), col("V262").fill_null([col("V262").median()]).shrink_dtype().alias("V262"), col("V263").fill_null([col("V263").median()]).shrink_dtype().alias("V263"), col("V264").fill_null([col("V264").median()]).shrink_dtype().alias("V264"), col("V265").fill_null([col("V265").median()]).shrink_dtype().alias("V265"), col("V266").fill_null([col("V266").median()]).shrink_dtype().alias("V266"), col("V267").fill_null([col("V267").median()]).shrink_dtype().alias("V267"), col("V268").fill_null([col("V268").median()]).shrink_dtype().alias("V268"), col("V269").fill_null([col("V269").median()]).shrink_dtype().alias("V269"), col("V270").fill_null([col("V270").median()]).shrink_dtype().alias("V270"), col("V271").fill_null([col("V271").median()]).shrink_dtype().alias("V271"), col("V272").fill_null([col("V272").median()]).shrink_dtype().alias("V272"), col("V273").fill_null([col("V273").median()]).shrink_dtype().alias("V273"), col("V274").fill_null([col("V274").median()]).shrink_dtype().alias("V274"), col("V275").fill_null([col("V275").median()]).shrink_dtype().alias("V275"), 
col("V276").fill_null([col("V276").median()]).shrink_dtype().alias("V276"), col("V277").fill_null([col("V277").median()]).shrink_dtype().alias("V277"), col("V278").fill_null([col("V278").median()]).shrink_dtype().alias("V278"), col("V279").fill_null([col("V279").median()]).shrink_dtype().alias("V279"), col("V280").fill_null([col("V280").median()]).shrink_dtype().alias("V280"), col("V281").fill_null([col("V281").median()]).shrink_dtype().alias("V281"), col("V282").fill_null([col("V282").median()]).shrink_dtype().alias("V282"), col("V283").fill_null([col("V283").median()]).shrink_dtype().alias("V283"), col("V284").fill_null([col("V284").median()]).shrink_dtype().alias("V284"), col("V285").fill_null([col("V285").median()]).shrink_dtype().alias("V285"), col("V286").fill_null([col("V286").median()]).shrink_dtype().alias("V286"), col("V287").fill_null([col("V287").median()]).shrink_dtype().alias("V287"), col("V288").fill_null([col("V288").median()]).shrink_dtype().alias("V288"), col("V289").fill_null([col("V289").median()]).shrink_dtype().alias("V289"), col("V290").fill_null([col("V290").median()]).shrink_dtype().alias("V290"), col("V291").fill_null([col("V291").median()]).shrink_dtype().alias("V291"), col("V292").fill_null([col("V292").median()]).shrink_dtype().alias("V292"), col("V293").fill_null([col("V293").median()]).shrink_dtype().alias("V293"), col("V294").fill_null([col("V294").median()]).shrink_dtype().alias("V294"), col("V295").fill_null([col("V295").median()]).shrink_dtype().alias("V295"), col("V296").fill_null([col("V296").median()]).shrink_dtype().alias("V296"), col("V297").fill_null([col("V297").median()]).shrink_dtype().alias("V297"), col("V298").fill_null([col("V298").median()]).shrink_dtype().alias("V298"), col("V299").fill_null([col("V299").median()]).shrink_dtype().alias("V299"), col("V300").fill_null([col("V300").median()]).shrink_dtype().alias("V300"), col("V301").fill_null([col("V301").median()]).shrink_dtype().alias("V301"), 
col("V302").fill_null([col("V302").median()]).shrink_dtype().alias("V302"), col("V303").fill_null([col("V303").median()]).shrink_dtype().alias("V303"), col("V304").fill_null([col("V304").median()]).shrink_dtype().alias("V304"), col("V305").fill_null([col("V305").median()]).shrink_dtype().alias("V305"), col("V306").fill_null([col("V306").median()]).shrink_dtype().alias("V306"), col("V307").fill_null([col("V307").median()]).shrink_dtype().alias("V307"), col("V308").fill_null([col("V308").median()]).shrink_dtype().alias("V308"), col("V309").fill_null([col("V309").median()]).shrink_dtype().alias("V309"), col("V310").fill_null([col("V310").median()]).shrink_dtype().alias("V310"), col("V311").fill_null([col("V311").median()]).shrink_dtype().alias("V311"), col("V312").fill_null([col("V312").median()]).shrink_dtype().alias("V312"), col("V313").fill_null([col("V313").median()]).shrink_dtype().alias("V313"), col("V314").fill_null([col("V314").median()]).shrink_dtype().alias("V314"), col("V315").fill_null([col("V315").median()]).shrink_dtype().alias("V315"), col("V316").fill_null([col("V316").median()]).shrink_dtype().alias("V316"), col("V317").fill_null([col("V317").median()]).shrink_dtype().alias("V317"), col("V318").fill_null([col("V318").median()]).shrink_dtype().alias("V318"), col("V319").fill_null([col("V319").median()]).shrink_dtype().alias("V319"), col("V320").fill_null([col("V320").median()]).shrink_dtype().alias("V320"), col("V321").fill_null([col("V321").median()]).shrink_dtype().alias("V321"), col("V322").fill_null([col("V322").median()]).shrink_dtype().alias("V322"), col("V323").fill_null([col("V323").median()]).shrink_dtype().alias("V323"), col("V324").fill_null([col("V324").median()]).shrink_dtype().alias("V324"), col("V325").fill_null([col("V325").median()]).shrink_dtype().alias("V325"), col("V326").fill_null([col("V326").median()]).shrink_dtype().alias("V326"), col("V327").fill_null([col("V327").median()]).shrink_dtype().alias("V327"), 
col("V328").fill_null([col("V328").median()]).shrink_dtype().alias("V328"), col("V329").fill_null([col("V329").median()]).shrink_dtype().alias("V329"), col("V330").fill_null([col("V330").median()]).shrink_dtype().alias("V330"), col("V331").fill_null([col("V331").median()]).shrink_dtype().alias("V331"), col("V332").fill_null([col("V332").median()]).shrink_dtype().alias("V332"), col("V333").fill_null([col("V333").median()]).shrink_dtype().alias("V333"), col("V334").fill_null([col("V334").median()]).shrink_dtype().alias("V334"), col("V335").fill_null([col("V335").median()]).shrink_dtype().alias("V335"), col("V336").fill_null([col("V336").median()]).shrink_dtype().alias("V336"), col("V337").fill_null([col("V337").median()]).shrink_dtype().alias("V337"), col("V338").fill_null([col("V338").median()]).shrink_dtype().alias("V338"), col("V339").fill_null([col("V339").median()]).shrink_dtype().alias("V339")]

        Parquet SCAN data/train_transaction_processed.parquet
        PROJECT */394 COLUMNS

In [None]:
# Call `describe()` on `data`; as the cell's last expression, the result is
# rendered as this cell's output.
# NOTE(review): `data` is defined in an earlier cell that is not visible here —
# presumably the polars frame produced by the parquet scan / null-filling plan
# printed above, in which case this shows per-column summary statistics; confirm.
data.describe()