# Utility Functions Examples

**Author:** Luis Paulo Vinatea Barberena  
**Date:** May 21, 2025

This notebook demonstrates how to use the utility functions in `src/utils/` to streamline your data analysis workflows.

## Setup

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import utility functions
import sys
sys.path.append('..')
from src.utils.data_processing import clean_missing_values, detect_outliers, encode_categorical
from src.utils.visualization import plot_correlation_matrix, plot_distribution, plot_missing_values
from src.utils.notebook_utils import create_notebook_template

# For visualization
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Matplotlib style sheet for a white background with grid lines.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require matplotlib >= 3.6 — confirm against the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
# Default seaborn colour palette for the plots below.
sns.set_palette('viridis')

# For reproducibility
# Seeds NumPy's legacy global RNG. The sample-data cell draws from np.random.*, so
# re-run this cell before regenerating the data if identical values are needed.
np.random.seed(42)

## 1. Creating Sample Data

Let's create a synthetic dataset (numeric, categorical, and date columns, with injected missing values and outliers) to demonstrate the utility functions.

In [None]:
# Build a synthetic dataset: three numeric features, two categorical features
# and one daily date column. All random draws come from NumPy's global RNG.
n_samples = 1000
data = {
    'numeric_1': np.random.normal(0, 1, n_samples),
    'numeric_2': np.random.normal(5, 2, n_samples),
    'numeric_3': np.random.exponential(2, n_samples),
    'category_1': np.random.choice(['A', 'B', 'C', 'D'], n_samples, p=[0.4, 0.3, 0.2, 0.1]),
    'category_2': np.random.choice(['X', 'Y', 'Z'], n_samples, p=[0.6, 0.3, 0.1]),
    'date': pd.date_range(start='2024-01-01', periods=n_samples)
}

# Create DataFrame
df = pd.DataFrame(data)

# Add a column correlated with numeric_1 and numeric_2 (linear combination plus noise).
# It is computed before any values are blanked out, so it contains no gaps itself.
df['numeric_4'] = df['numeric_1'] * 2 + df['numeric_2'] + np.random.normal(0, 0.5, n_samples)

# Blank out roughly 10% of the values in each of these columns.
# The columns are listed by name rather than sliced by position so the selection
# does not depend on column order; category_2, date and numeric_4 are left complete.
columns_with_missing = ['numeric_1', 'numeric_2', 'numeric_3', 'category_1']
for col in columns_with_missing:
    mask = np.random.random(n_samples) < 0.1  # True for ~10% of rows
    df.loc[mask, col] = np.nan

# Plant five large values (uniform in [10, 20)) in each of the first three numeric columns.
# Rows are chosen from the whole frame, so a planted value may overwrite a gap created above.
for col in ['numeric_1', 'numeric_2', 'numeric_3']:
    outlier_idx = np.random.choice(n_samples, 5, replace=False)
    df.loc[outlier_idx, col] = np.random.uniform(10, 20, 5)

# Display the first few rows
df.head()

## 2. Data Processing Utilities

Let's demonstrate the data processing utilities.

### 2.1 Handling Missing Values

In [None]:
# Per-column count of missing entries before any cleaning
print("Missing values in the dataset:")
print(df.isnull().sum())

# Apply each strategy demonstrated here to the same original frame and report the effect
print("\nCleaning with different strategies:")
extra_args = {'fill': {'fill_value': 0}}  # only the 'fill' strategy is given an extra argument
for strategy in ('drop', 'mean', 'median', 'fill'):
    df_cleaned = clean_missing_values(df, strategy=strategy, **extra_args.get(strategy, {}))
    remaining_missing = df_cleaned.isnull().sum().sum()  # total across all columns
    print(f"\nStrategy: {strategy}")
    print(f"Original shape: {df.shape}, Cleaned shape: {df_cleaned.shape}")
    print(f"Missing values after cleaning: {remaining_missing}")

### 2.2 Detecting Outliers

In [None]:
# Numeric columns to scan for outliers (the visualization cells further down reuse this list)
numeric_cols = ['numeric_1', 'numeric_2', 'numeric_3', 'numeric_4']


def _print_outlier_counts(header, flags):
    """Print `header`, then the number and percentage of flagged rows for each numeric column."""
    print(header)
    total_rows = len(df)
    for name in numeric_cols:
        n_flagged = flags[name].sum()
        print(f"{name}: {n_flagged} outliers ({n_flagged/total_rows*100:.2f}%)")


# IQR-based detection; the result is used below as a per-column boolean mask
outliers_iqr = detect_outliers(df, method='iqr', columns=numeric_cols)
_print_outlier_counts("Outliers detected using IQR method:", outliers_iqr)

# Z-score-based detection with a threshold of 3.0
outliers_zscore = detect_outliers(df, method='zscore', columns=numeric_cols, threshold=3.0)
_print_outlier_counts("\nOutliers detected using Z-score method:", outliers_zscore)

# Scatter one column against row position and overlay the IQR-flagged points in red
col = 'numeric_1'
flagged = outliers_iqr[col]
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(range(len(df)), df[col], alpha=0.5, label='Data points')
ax.scatter(df.index[flagged], df.loc[flagged, col],
           color='red', label='Outliers (IQR)')
ax.set_title(f'Outlier Detection for {col} using IQR Method')
ax.set_xlabel('Index')
ax.set_ylabel(col)
ax.legend()
plt.show()

### 2.3 Encoding Categorical Variables

In [None]:
# Encode categorical variables using different methods
categorical_cols = ['category_1', 'category_2']

# One-hot encoding
df_onehot = encode_categorical(df, columns=categorical_cols, method='onehot')
print("One-hot encoded:")
print(df_onehot.head())
print(f"Original columns: {df.columns.tolist()}")
print(f"Encoded columns: {df_onehot.columns.tolist()}")

# Label encoding
df_label = encode_categorical(df, columns=categorical_cols, method='label')
print("\nLabel encoded:")
print(df_label.head())

# Show which encoded value each original category received.
# The mapping is read off row by row (original value paired with the encoded value
# on the same index label) instead of zipping two independently sorted arrays:
#   * category_1 contains NaN, and sorting NaN together with strings raises TypeError;
#   * pairing by row does not assume the encoder numbers categories alphabetically.
# NOTE(review): assumes encode_categorical keeps the input's index labels — confirm.
for col in categorical_cols:
    mapping = (
        pd.DataFrame({'original': df[col], 'encoded': df_label[col]})
        .dropna(subset=['original'])  # missing originals have no category to report
        .drop_duplicates()
        .sort_values('original')
    )
    print(f"\n{col} mapping:")
    for orig, enc in mapping.itertuples(index=False):
        print(f"  {orig} -> {enc}")

## 3. Visualization Utilities

Let's demonstrate the visualization utilities.

### 3.1 Correlation Matrix

In [None]:
# Fill the gaps with the 'median' strategy; df_clean is reused by the distribution plots
df_clean = clean_missing_values(df, strategy='median')

# Pearson correlation heatmap restricted to the numeric columns
numeric_clean = df_clean[numeric_cols]
plot_correlation_matrix(
    numeric_clean,
    figsize=(10, 8),
    method='pearson',
    cmap='coolwarm',
    mask_upper=True,
    annotate=True,
)

### 3.2 Distribution Plots

In [None]:
# One distribution plot per column for the first two numeric columns,
# all drawn with the same histogram settings
dist_options = dict(bins=30, kde=True, figsize=(10, 6))
for col in numeric_cols[:2]:
    plot_distribution(df_clean, column=col, **dist_options)

### 3.3 Missing Value Visualization

In [None]:
# Use the original frame here (not df_clean) so the injected gaps are still present
missing_plot_size = (12, 6)
plot_missing_values(df, figsize=missing_plot_size)

## 4. Notebook Generation Utility

Let's demonstrate how to create a new notebook template using the utility function. Note that running the cell below writes a new `.ipynb` file to the `notebooks/` directory.

In [None]:
# Destination of the generated template, relative to the current working directory
output_path = "../notebooks/new_analysis_demo.ipynb"

# Collect the template settings, then generate the notebook file
template_args = {
    "output_path": output_path,
    "title": "Customer Segmentation Analysis",
    "author": "Luis Paulo Vinatea Barberena",
}
create_notebook_template(**template_args)

# Confirm where the file was written
print(
    f"A new notebook template has been created at {output_path}",
    "You can use this utility to quickly create standardized notebook templates.",
    sep="\n",
)

## 5. Conclusion

This notebook demonstrated how to use the utility functions from `src/utils/` to streamline your data analysis workflows:

1. **Data Processing Utilities**:
   - Handling missing values with different strategies
   - Detecting and visualizing outliers
   - Encoding categorical variables

2. **Visualization Utilities**:
   - Creating correlation matrices
   - Plotting distributions
   - Visualizing missing values

3. **Notebook Generation Utility**:
   - Creating standardized notebook templates

These utilities help maintain consistency across analyses and save time by eliminating the need to rewrite common code.