# Imports

In [2]:
!pip install skimpy

Collecting skimpy
  Downloading skimpy-0.0.18-py3-none-any.whl.metadata (34 kB)
Collecting ipykernel>=6.29.5 (from skimpy)
  Downloading ipykernel-6.29.5-py3-none-any.whl.metadata (6.3 kB)
Collecting numpy>=2.0.2 (from skimpy)
  Downloading numpy-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m420.0 kB/s[0m eta [36m0:00:00[0m
Collecting pandas>=2.2.3 (from skimpy)
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting polars>=1.17.1 (from skimpy)
  Downloading polars-1.24.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting comm>=0.1.1 (from ipykernel>=6.29.5->skimpy)
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting jedi>=0.16 (

In [3]:
import pandas as pd
from skimpy import skim
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import joblib

# Fetch dataset, save in df

In [4]:
# Remote location of the sample dataset CSV that is read into `df`.
url = 'https://proai-datasets.s3.eu-west-3.amazonaws.com/sample_dataset.csv'

In [39]:
# Read the CSV at `url` into a DataFrame (needs network access to the remote host).
df = pd.read_csv(url)

# EDA

In [6]:
# Exploratory overview of the whole frame via skimpy's `skim`.
skim(df)

- Missing values up to 32% per column --> different imputation strategies depending on skewness (median for skewed features, mean for symmetric ones)
- StandardScaler for every numerical feature, to put all features on a common scale (required by most models)
- area error is a string, but seems to contain only A, B, C as values

# Pipelines

In [40]:
# Split the column names of `df` by dtype: numeric columns vs. object (string) columns.
numeric_features = list(df.select_dtypes(include=[np.number]).columns)
categorical_features = list(df.select_dtypes(include=['object']).columns)

In [43]:
# Keep 'target' out of the numeric feature list; nothing happens if it is absent.
try:
    numeric_features.remove('target')
except ValueError:
    pass  # 'target' was not among the numeric columns

In [48]:
# Absolute skewness of every numeric feature.
skewness = df[numeric_features].skew().abs()

# Features whose |skewness| exceeds 0.5 are handled as "skewed".
skewed_features = skewness[skewness > 0.5].index.tolist()

# Everything else is handled as "symmetric". The list is built in the order of
# `numeric_features` instead of via a set difference: set iteration order for
# strings can change between kernel sessions (hash randomisation), which would
# make the column order of the fitted transformers irreproducible.
_skewed_lookup = set(skewed_features)
symmetric_features = [name for name in numeric_features if name not in _skewed_lookup]

In [52]:
def discretize_columns(X, n_bins=20):
    """Discretize each column of ``X`` independently with ``np.digitize``.

    For every column, ``n_bins`` equally spaced edges are laid between the
    column's minimum and maximum (NaNs ignored) and each value is replaced by
    the index returned by ``np.digitize(..., right=True)``. Working column by
    column avoids the dimension error ``np.digitize`` would hit with per-column
    edges on a 2-D array.

    Parameters
    ----------
    X : array-like
        Numeric data. 2-D input is processed column by column; 1-D input is
        treated as a single column.
    n_bins : int, default=20
        Number of edges generated with ``np.linspace`` for each column.

    Returns
    -------
    numpy.ndarray
        2-D array with the dtype of the converted input, holding bin indices.

    Notes
    -----
    * A column made only of NaNs is skipped and stays filled with zeros.
    * A NaN inside an otherwise populated column receives index ``n_bins``
      (it is ordered after every edge).
    * The edges are recomputed from whatever data is passed in, so the mapping
      is not learned once and reused: calling this on new data bins it with
      edges derived from that new data.
    """
    X = np.asarray(X)  # Accept DataFrames / lists as well as arrays
    if X.ndim == 1:
        # Treat a flat vector as one column instead of failing on X.shape[1]
        X = X.reshape(-1, 1)

    binned = np.zeros_like(X)  # Same shape and dtype as the input
    for j in range(X.shape[1]):
        column = X[:, j]
        if np.isnan(column).all():  # Nothing to bin: leave zeros
            continue
        edges = np.linspace(np.nanmin(column), np.nanmax(column), n_bins)
        binned[:, j] = np.digitize(column, edges, right=True)
    return binned

# Expose the column-wise binning function as a scikit-learn transformer;
# validate=False hands the input to the function without prior checking.
discretizer = FunctionTransformer(func=discretize_columns, validate=False)

In [53]:
# Pipeline 1: Pre-processing record target = 1
# NOTE(review): `df_target1` is not consumed by the transformer defined here —
# confirm whether pipeline_1 is meant to be fitted on this subset.
df_target1 = df[df['target'] == 1].copy()

# A ColumnTransformer runs each of its entries on the raw input columns and
# concatenates the results side by side; it does NOT chain them. The
# impute -> log -> scale sequence therefore lives inside one Pipeline per
# feature group. Listing the steps as separate ColumnTransformer entries would
# feed un-imputed data to the log and scaling branches and emit several copies
# of every column.
# Output column order: skewed features first, then symmetric features.
# NOTE(review): np.log1p is undefined for values <= -1 — verify that the skewed
# features cannot go that low.
pipeline_1 = ColumnTransformer([
    ('skewed', Pipeline([
        ('impute_skewed', SimpleImputer(strategy='median')),
        ('log_transform', FunctionTransformer(np.log1p, validate=False)),
        ('scaler', StandardScaler()),
    ]), skewed_features),
    ('symmetric', Pipeline([
        ('impute_symmetric', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]), symmetric_features),
])

In [54]:
# Pipeline 2: Pre-processing all records
# ColumnTransformer entries run independently on the raw columns, so imputation
# and discretization are chained in a Pipeline; otherwise the discretizer would
# receive the un-imputed values and the numeric columns would appear twice.
pipeline_2 = ColumnTransformer([
    ('numeric', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('discretize', discretizer),
    ]), numeric_features),
    # The entry keeps its original name, but the encoder is one-hot, not ordinal.
    ('ordinal_encoding', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [55]:
# Pipeline 3: Pre-processing numerical
# Sequential steps: mean imputation -> log1p -> PCA (n_components=0.8) ->
# standardisation of the resulting components.
# NOTE(review): PCA is fitted before any scaling, so it sees the features on
# their log-transformed but unscaled ranges — confirm this ordering is intended.
_pipeline_3_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('log_transform', FunctionTransformer(np.log1p, validate=False)),
    ('pca', PCA(n_components=0.8)),
    ('scaler', StandardScaler()),
]
pipeline_3 = Pipeline(steps=_pipeline_3_steps)

In [56]:
# Final ColumnTransformer
# Each sub-pipeline gets its own selection of columns; pipeline_2 is the only
# one that also receives the categorical columns.
final_preprocessor = ColumnTransformer(transformers=[
    ('pipeline_1', pipeline_1, numeric_features),
    ('pipeline_2', pipeline_2, numeric_features + categorical_features),
    ('pipeline_3', pipeline_3, numeric_features),
])

In [57]:
# Single-step pipeline wrapping the combined preprocessor under the name 'preprocessor'.
final_pipeline = Pipeline(steps=[('preprocessor', final_preprocessor)])

In [58]:
# Fit the preprocessing pipeline on the full frame and transform that same frame.
transformed_data = final_pipeline.fit_transform(df)

In [59]:
# Persist the fitted pipeline to 'final_pipeline.pkl' in the working directory.
# NOTE(review): the pipeline embeds `discretize_columns`, a function defined in
# this notebook. Pickle-style serialisation stores functions by reference, so
# whoever loads this file presumably needs that function importable under the
# same module path — verify before using the artifact elsewhere.
joblib.dump(final_pipeline, 'final_pipeline.pkl')

['final_pipeline.pkl']