# py-iku: Scikit-learn Pipeline Support

This notebook demonstrates how py-iku converts scikit-learn ML pipelines to Dataiku DSS recipes.

## Supported sklearn Components

- **Scalers**: StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, Normalizer
- **Encoders**: LabelEncoder, OneHotEncoder, OrdinalEncoder, LabelBinarizer
- **Imputers**: SimpleImputer, KNNImputer, IterativeImputer
- **Dimensionality Reduction & Feature Selection**: PCA, TruncatedSVD, SelectKBest, SelectFromModel
- **Utilities**: train_test_split, Pipeline

In [None]:
from py2dataiku import convert
from py2dataiku.parser.ast_analyzer import CodeAnalyzer

## 1. Data Scaling

Scikit-learn scalers are converted to Dataiku's Normalizer processor in Prepare recipes.

In [None]:
# Sample user script: applies StandardScaler, MinMaxScaler and RobustScaler
# to DataFrame columns. It is passed to py-iku as text, never executed here.
scaling_code = '''
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

df = pd.read_csv('features.csv')

# Standard scaling (z-score normalization)
scaler = StandardScaler()
df[['feature1', 'feature2']] = scaler.fit_transform(df[['feature1', 'feature2']])

# Min-Max scaling (0-1 range)
minmax = MinMaxScaler()
df[['feature3']] = minmax.fit_transform(df[['feature3']])

# Robust scaling (handles outliers)
robust = RobustScaler()
df[['feature4']] = robust.fit_transform(df[['feature4']])

df.to_csv('scaled_features.csv', index=False)
'''

# Run the analyzer over the sample source.
analyzer = CodeAnalyzer()
transformations = analyzer.analyze(scaling_code)

# Report only the entries whose notes or parameters mention "scal"
# (case-insensitive); everything else is skipped.
print("Scaling Transformations Detected:")
for step in transformations:
    if (
        'scal' not in str(step.notes).lower()
        and 'scal' not in str(step.parameters).lower()
    ):
        continue
    print(f"  - Type: {step.transformation_type.value}")
    print(f"    Parameters: {step.parameters}")

In [None]:
# Build the flow for the scaling sample, then print its summary.
flow = convert(scaling_code)
summary_text = flow.get_summary()
print(summary_text)

## 2. Categorical Encoding

Label and one-hot encoding are converted to Dataiku's categorical encoding processors.

In [None]:
# Sample user script exercising LabelEncoder, OneHotEncoder and OrdinalEncoder.
# It is passed to py-iku as text.
# NOTE: OneHotEncoder's `sparse` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4; the sample uses `sparse_output` so it stays valid when copied
# into a current scikit-learn environment.
encoding_code = '''
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

df = pd.read_csv('categorical_data.csv')

# Label encoding for ordinal categories
le = LabelEncoder()
df['category_encoded'] = le.fit_transform(df['category'])

# One-hot encoding for nominal categories
ohe = OneHotEncoder(sparse_output=False)
encoded = ohe.fit_transform(df[['region']])

# Ordinal encoding with custom order
oe = OrdinalEncoder(categories=[['low', 'medium', 'high']])
df['priority_encoded'] = oe.fit_transform(df[['priority']])

df.to_csv('encoded_data.csv', index=False)
'''

# Convert the sample to a flow and print its ASCII visualization.
flow = convert(encoding_code)
print(flow.visualize(format='ascii'))

## 3. Missing Value Imputation

`SimpleImputer` strategies map to Dataiku's fill-empty processors in Prepare recipes; advanced imputers such as `KNNImputer` require a Python recipe.

In [None]:
# Sample user script: SimpleImputer with four strategies plus a KNNImputer.
# It is passed to py-iku as text, never executed here.
imputation_code = '''
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer

df = pd.read_csv('data_with_missing.csv')

# Mean imputation
mean_imputer = SimpleImputer(strategy='mean')
df[['numeric_col']] = mean_imputer.fit_transform(df[['numeric_col']])

# Median imputation
median_imputer = SimpleImputer(strategy='median')
df[['income']] = median_imputer.fit_transform(df[['income']])

# Most frequent (mode) imputation
mode_imputer = SimpleImputer(strategy='most_frequent')
df[['category']] = mode_imputer.fit_transform(df[['category']])

# Constant value imputation
const_imputer = SimpleImputer(strategy='constant', fill_value=0)
df[['optional_field']] = const_imputer.fit_transform(df[['optional_field']])

# KNN-based imputation (requires Python recipe in Dataiku)
knn_imputer = KNNImputer(n_neighbors=5)
df[['feature1', 'feature2']] = knn_imputer.fit_transform(df[['feature1', 'feature2']])

df.to_csv('imputed_data.csv', index=False)
'''

# Analyze the sample with a fresh analyzer instance.
analyzer = CodeAnalyzer()
transformations = analyzer.analyze(imputation_code)

# Print the type, strategy and notes of every entry whose notes or
# parameters mention "impute" (case-insensitive).
print("Imputation Transformations:")
for step in transformations:
    if (
        'impute' not in str(step.notes).lower()
        and 'impute' not in str(step.parameters).lower()
    ):
        continue
    print(f"  - {step.transformation_type.value}")
    # Entries without a 'strategy' parameter are shown as N/A.
    strategy = step.parameters.get('strategy', 'N/A')
    print(f"    Strategy: {strategy}")
    print(f"    Notes: {step.notes}")

## 4. Train-Test Split

`train_test_split` is converted to a Dataiku Split recipe.

In [None]:
# Sample user script: a stratified 80/20 train_test_split whose feature
# splits are written to CSV. It is passed to py-iku as text.
split_code = '''
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('ml_dataset.csv')

# Split into training and test sets
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y
)

# Save splits
X_train.to_csv('train_features.csv', index=False)
X_test.to_csv('test_features.csv', index=False)
'''

# Convert the sample and print the flow's ASCII visualization under a heading.
flow = convert(split_code)
print("Train-Test Split Flow:")
ascii_view = flow.visualize(format='ascii')
print(ascii_view)

## 5. Feature Selection

Dimensionality-reduction and feature-selection components such as PCA and SelectKBest are converted to Python recipes.

In [None]:
# Sample user script: PCA, TruncatedSVD and SelectKBest applied to the same
# feature matrix. It is passed to py-iku as text, never executed here.
feature_selection_code = '''
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest, f_classif

df = pd.read_csv('high_dim_data.csv')
X = df.drop('target', axis=1)
y = df['target']

# PCA for dimensionality reduction
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)

# TruncatedSVD for sparse data
svd = TruncatedSVD(n_components=5)
X_svd = svd.fit_transform(X)

# Select K best features
selector = SelectKBest(score_func=f_classif, k=20)
X_selected = selector.fit_transform(X, y)

# Save reduced features
pd.DataFrame(X_pca).to_csv('pca_features.csv', index=False)
'''

# Analyze the sample with a fresh analyzer instance.
analyzer = CodeAnalyzer()
transformations = analyzer.analyze(feature_selection_code)

# Only the notes are inspected here: an entry is printed when its notes
# mention "pca" or "select" (case-insensitive).
print("Feature Selection Operations:")
for step in transformations:
    notes_text = str(step.notes).lower()
    if 'pca' not in notes_text and 'select' not in notes_text:
        continue
    print(f"  - {step.transformation_type.value}")
    print(f"    Parameters: {step.parameters}")
    print(f"    Notes: {step.notes}")

## 6. Complete ML Pipeline

A full ML preprocessing pipeline combining multiple sklearn components.

In [None]:
# Sample user script chaining imputation, label encoding, scaling, PCA and a
# train/test split, then writing the four resulting arrays to CSV.
# It is passed to py-iku as text, never executed here.
full_pipeline_code = '''
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Load raw data
df = pd.read_csv('raw_ml_data.csv')

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Handle missing values
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X.select_dtypes(include=[np.number]))

# Encode categorical target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Reduce dimensionality
pca = PCA(n_components=0.95)  # Keep 95% variance
X_reduced = pca.fit_transform(X_scaled)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y_encoded, 
    test_size=0.2, 
    random_state=42
)

# Save processed data
pd.DataFrame(X_train).to_csv('X_train.csv', index=False)
pd.DataFrame(X_test).to_csv('X_test.csv', index=False)
pd.DataFrame(y_train).to_csv('y_train.csv', index=False)
pd.DataFrame(y_test).to_csv('y_test.csv', index=False)
'''

# Convert the sample, then show the summary followed by the ASCII
# visualization (separated by a blank line).
flow = convert(full_pipeline_code)
print("Complete ML Pipeline:")
summary_text = flow.get_summary()
print(summary_text)
ascii_view = flow.visualize(format='ascii')
print("\n" + ascii_view)

## 7. sklearn Pipeline Object

A scikit-learn `Pipeline` object is also recognized and is mapped to a Python recipe (multi-step pipeline).

In [None]:
# Sample user script: a three-step sklearn Pipeline (imputer, scaler, PCA)
# fitted and applied in one call. It is passed to py-iku as text.
sklearn_pipeline_code = '''
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

df = pd.read_csv('data.csv')
X = df.drop('target', axis=1)

# Create sklearn Pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=10))
])

# Fit and transform
X_processed = pipeline.fit_transform(X)

pd.DataFrame(X_processed).to_csv('processed_data.csv', index=False)
'''

# Analyze the sample with a fresh analyzer instance.
analyzer = CodeAnalyzer()
transformations = analyzer.analyze(sklearn_pipeline_code)

# Lazily keep the entries whose notes mention "pipeline" (case-insensitive)
# and print each one's type and notes.
print("sklearn Pipeline Detected:")
pipeline_entries = (
    entry for entry in transformations
    if 'pipeline' in str(entry.notes).lower()
)
for step in pipeline_entries:
    print(f"  - {step.transformation_type.value}")
    print(f"    Notes: {step.notes}")

## 8. Dataiku Recipe Mappings

How sklearn components map to Dataiku recipes:

In [None]:
import pandas as pd

# Each row pairs an sklearn component with the Dataiku recipe/processor it is
# listed against; keeping the pair on one line keeps both columns aligned.
_mapping_rows = [
    ("StandardScaler", "Prepare > Normalizer (Z_SCORE)"),
    ("MinMaxScaler", "Prepare > Normalizer (MIN_MAX)"),
    ("RobustScaler", "Prepare > Normalizer (ROBUST)"),
    ("LabelEncoder", "Prepare > CategoricalEncoder"),
    ("OneHotEncoder", "Prepare > CategoricalEncoder (ONE_HOT)"),
    ("SimpleImputer(mean)", "Prepare > FillEmptyWithComputedValue (MEAN)"),
    ("SimpleImputer(median)", "Prepare > FillEmptyWithComputedValue (MEDIAN)"),
    ("SimpleImputer(constant)", "Prepare > FillEmptyWithValue"),
    ("KNNImputer", "Python Recipe (advanced imputation)"),
    ("train_test_split", "Split Recipe"),
    ("PCA", "Python Recipe (dimensionality reduction)"),
    ("SelectKBest", "Python Recipe (feature selection)"),
    ("Pipeline", "Python Recipe (multi-step pipeline)"),
]

# Column-oriented dict in the shape pandas expects: one list per column.
mappings = {
    "sklearn Component": [component for component, _ in _mapping_rows],
    "Dataiku Recipe/Processor": [target for _, target in _mapping_rows],
}

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(mappings)

## Next Steps

- See `04_visualizations.ipynb` for visualization formats
- See `05_advanced_features.ipynb` for plugins and DSS export