# py-iku: NumPy Operations Support

This notebook demonstrates how py-iku converts NumPy operations to Dataiku DSS recipes and processors.

## Supported NumPy Operations

py-iku supports 30+ NumPy functions. The main categories are listed below; section 7 gives the full reference by category:
- Mathematical: log, exp, sqrt, power, abs, round
- Conditional: where, isnan, isinf, isfinite
- Aggregations: sum, mean, std, var, min, max, median
- Array operations: concatenate, vstack, hstack, sort, unique

In [None]:
from py2dataiku import convert
from py2dataiku.parser.ast_analyzer import CodeAnalyzer

## 1. Mathematical Transformations

NumPy math functions are converted to Dataiku Prepare recipe steps with formula processors.

In [None]:
# Sample pandas/NumPy script held as a string: this cell never runs it, it
# only hands the text to py-iku. The next cell reuses math_code with convert().
math_code = '''
import numpy as np
import pandas as pd

df = pd.read_csv('metrics.csv')

# Logarithmic transformations
df['log_value'] = np.log(df['value'])
df['log10_value'] = np.log10(df['value'])
df['log1p_value'] = np.log1p(df['value'])  # log(1 + x)

# Exponential
df['exp_rate'] = np.exp(df['rate'])

# Power functions
df['sqrt_value'] = np.sqrt(df['value'])
df['squared'] = np.power(df['value'], 2)
df['cubed'] = np.power(df['value'], 3)

df.to_csv('transformed_metrics.csv', index=False)
'''

# Analyze the transformations: analyze() receives the script text, and the
# loop below reads transformation_type.value and parameters from every item
# it returns.
analyzer = CodeAnalyzer()
transformations = analyzer.analyze(math_code)

print("Detected Transformations:")
for t in transformations:
    # One line per detected transformation: its type value, then its parameters.
    print(f"  - {t.transformation_type.value}: {t.parameters}")

In [None]:
# Convert to flow: convert() is given the math_code script text defined in the
# previous cell, and whatever get_summary() returns is printed unchanged.
flow = convert(math_code)
print(flow.get_summary())

## 2. Rounding and Clipping

NumPy's rounding and clipping functions map to Dataiku's ROUND_COLUMN and CLIP_COLUMN processors.

In [None]:
# Sample script (rounding, clipping, absolute value) kept as a string; this
# cell does not execute it, it only passes the text to convert() below.
round_clip_code = '''
import numpy as np
import pandas as pd

df = pd.read_csv('scores.csv')

# Rounding operations
df['score_rounded'] = np.round(df['score'], 2)
df['score_floor'] = np.floor(df['score'])
df['score_ceil'] = np.ceil(df['score'])

# Clipping (constraining values)
df['score_clipped'] = np.clip(df['score'], 0, 100)  # Between 0 and 100
df['positive_only'] = np.clip(df['value'], 0, None)  # Minimum 0

# Absolute value
df['abs_diff'] = np.abs(df['actual'] - df['predicted'])

df.to_csv('processed_scores.csv', index=False)
'''

# Convert the script text to a flow and print its ASCII rendering.
# Note: this rebinds the notebook-level name `flow`.
flow = convert(round_clip_code)
print(flow.visualize(format='ascii'))

## 3. Conditional Operations

`np.where()` is converted to Dataiku's formula-based column creation with if/else logic.

In [None]:
# Sample script (np.where, NaN/finite checks, nan_to_num) kept as a string;
# this cell does not execute it, it only passes the text to the analyzer below.
conditional_code = '''
import numpy as np
import pandas as pd

df = pd.read_csv('transactions.csv')

# Conditional column creation with np.where
df['category'] = np.where(df['amount'] > 1000, 'high_value', 'standard')
df['is_positive'] = np.where(df['balance'] > 0, 'positive', 'negative')

# NaN checking
df['has_missing'] = np.isnan(df['optional_field'])
df['is_valid'] = np.isfinite(df['ratio'])

# Replace NaN values
df['cleaned_ratio'] = np.nan_to_num(df['ratio'], nan=0.0)

df.to_csv('categorized_transactions.csv', index=False)
'''

# Analyze: a fresh CodeAnalyzer is created and given the script text.
analyzer = CodeAnalyzer()
transformations = analyzer.analyze(conditional_code)

print("Conditional Transformations:")
for t in transformations:
    # Keep only items whose notes mention "where" or whose parameters contain
    # "is" (case-insensitive, compared against the str() of each attribute).
    # NOTE(review): "is" is a bare substring test, so it also matches unrelated
    # text such as a column named 'is_positive' or the word "this"; presumably
    # it is meant to catch isnan/isinf/isfinite - confirm the intended filter.
    if 'where' in str(t.notes).lower() or 'is' in str(t.parameters).lower():
        print(f"  - {t.transformation_type.value}")
        print(f"    Parameters: {t.parameters}")
        print(f"    Notes: {t.notes}")

## 4. Aggregation Functions

NumPy aggregations are detected and can inform Grouping recipe creation.

In [None]:
# Sample script computing scalar statistics with NumPy, kept as a string; this
# cell does not execute it. Unlike the other samples in this notebook, the
# script assigns results to plain variables and writes no output file.
agg_code = '''
import numpy as np
import pandas as pd

df = pd.read_csv('sales.csv')

# Calculate statistics using NumPy
total_sales = np.sum(df['amount'])
avg_sale = np.mean(df['amount'])
std_sale = np.std(df['amount'])
median_sale = np.median(df['amount'])

# Percentiles
p25 = np.percentile(df['amount'], 25)
p75 = np.percentile(df['amount'], 75)
p90 = np.percentile(df['amount'], 90)

# Min/Max
min_sale = np.min(df['amount'])
max_sale = np.max(df['amount'])

print(f"Statistics computed")
'''

# Convert the script text to a flow and print its summary.
# Note: this rebinds the notebook-level name `flow`.
flow = convert(agg_code)
print(flow.get_summary())

## 5. Feature Engineering Pipeline

A complete feature engineering pipeline combining NumPy and pandas.

In [None]:
# Sample feature-engineering script combining several NumPy operations on
# DataFrame columns, kept as a string; this cell does not execute it, it only
# passes the text to convert() below.
feature_code = '''
import numpy as np
import pandas as pd

df = pd.read_csv('raw_features.csv')

# Log transform skewed features
df['log_income'] = np.log1p(df['income'])
df['log_transactions'] = np.log1p(df['num_transactions'])

# Normalize features
df['income_normalized'] = (df['income'] - np.mean(df['income'])) / np.std(df['income'])

# Clip outliers
df['income_clipped'] = np.clip(df['income'], 
                                np.percentile(df['income'], 1),
                                np.percentile(df['income'], 99))

# Handle special values
df['ratio_clean'] = np.nan_to_num(df['ratio'], nan=0, posinf=1, neginf=-1)
df['is_valid'] = np.isfinite(df['score']).astype(int)

# Create categorical flags
df['high_income'] = np.where(df['income'] > 100000, 1, 0)
df['active_user'] = np.where(df['num_transactions'] > 10, 1, 0)

# Save engineered features
df.to_csv('engineered_features.csv', index=False)
'''

# Convert the script text to a flow and print its ASCII rendering under a
# heading. Note: this rebinds the notebook-level name `flow`.
flow = convert(feature_code)
print("Feature Engineering Flow:")
print(flow.visualize(format='ascii'))

## 6. Array Operations

NumPy array operations such as stacking/concatenation (`np.vstack`), `np.unique`, and `np.sort` are also supported.

In [None]:
# Sample script using array-level NumPy calls (vstack, unique, sort) on the
# .values of two loaded DataFrames, kept as a string; this cell does not
# execute it, it only passes the text to the analyzer below.
array_code = '''
import numpy as np
import pandas as pd

# Load multiple datasets
df1 = pd.read_csv('data_2022.csv')
df2 = pd.read_csv('data_2023.csv')

# Vertical stack (concatenate)
combined = np.vstack([df1.values, df2.values])

# Get unique values
unique_categories = np.unique(df1['category'].values)

# Sort array
sorted_values = np.sort(df1['value'].values)

print("Array operations complete")
'''

# Analyze the script text with a fresh CodeAnalyzer.
analyzer = CodeAnalyzer()
transformations = analyzer.analyze(array_code)

print("Array Operations Detected:")
for t in transformations:
    # One line per detected transformation: its type value, then its notes.
    print(f"  - {t.transformation_type.value}: {t.notes}")

## 7. Supported NumPy Functions Reference

Here's a complete list of supported NumPy functions:

In [None]:
# Reference table of the NumPy callables this notebook lists as supported.
# Keys are the category headings printed below; values are display names
# written with the conventional `np` alias, in the order they are printed.
numpy_functions = {
    "Mathematical": [
        "np.log", "np.log10", "np.log2", "np.log1p",
        "np.exp", "np.expm1",
        "np.sqrt", "np.cbrt", "np.square", "np.power",
        "np.abs", "np.absolute"
    ],
    "Rounding": [
        "np.round", "np.around", "np.rint",
        "np.floor", "np.ceil", "np.trunc"
    ],
    "Clipping": ["np.clip"],
    "Conditional": [
        "np.where",
        "np.isnan", "np.isinf", "np.isfinite",
        "np.nan_to_num", "np.nanmean", "np.nansum", "np.nanstd"
    ],
    "Aggregation": [
        "np.sum", "np.mean", "np.std", "np.var",
        "np.min", "np.max", "np.median",
        "np.percentile", "np.quantile"
    ],
    "Array Operations": [
        "np.concatenate", "np.vstack", "np.hstack", "np.stack",
        "np.sort", "np.argsort", "np.unique"
    ],
    "Reshaping": [
        # NumPy has no top-level `np.flatten`; flatten exists only as an
        # ndarray method, so it is listed under its real attribute path to
        # keep every entry resolvable against the numpy namespace.
        "np.reshape", "np.ndarray.flatten", "np.ravel", "np.transpose"
    ],
    "Creation": [
        "np.zeros", "np.ones", "np.full", "np.empty",
        "np.arange", "np.linspace"
    ]
}

# Print each category as a heading (preceded by a blank line) followed by its
# function names as an indented bullet list.
for category, functions in numpy_functions.items():
    print(f"\n{category}:")
    for func in functions:
        print(f"  - {func}")

## Next Steps

- See `03_sklearn_pipelines.ipynb` for scikit-learn ML pipeline support
- See `04_visualizations.ipynb` for visualization options