# Data Profiling & Exploratory Data Analysis (EDA)

This notebook demonstrates the new EDA capabilities of MKYZ v0.2.0, including automatic data profiling and recommendation generation.

In [None]:
import mkyz
import pandas as pd
import numpy as np

# Create a sample dataset that deliberately contains the issues the profiler
# is meant to surface: missing values, outliers, class imbalance, and a pair
# of highly correlated columns.
N_ROWS = 1000     # number of synthetic rows
N_MISSING = 50    # number of distinct rows whose 'age' is blanked out

np.random.seed(42)  # fixed seed so a fresh-kernel run rebuilds the same frame
data = {
    # Stored as float so NaN can be assigned below without a dtype change.
    'age': np.random.randint(18, 80, N_ROWS).astype(float),
    'income': np.random.normal(50000, 15000, N_ROWS),
    # None entries become missing values in the resulting column.
    'category': np.random.choice(['A', 'B', 'C', None], N_ROWS),
    'score': np.random.uniform(0, 100, N_ROWS),
    'label': np.random.choice([0, 1], N_ROWS, p=[0.9, 0.1])  # Imbalanced
}
df = pd.DataFrame(data)

# Add some missing values.
# replace=False matters: np.random.choice samples WITH replacement by default,
# so duplicate draws would blank fewer than N_MISSING rows.
missing_rows = np.random.choice(df.index, N_MISSING, replace=False)
df.loc[missing_rows, 'age'] = np.nan

# Add outliers.
# .loc slices by label and includes both endpoints, so this sets rows 0..5
# (six rows in total).
df.loc[0:5, 'income'] = 1000000

# Add high correlation: a scaled copy of 'income' plus small Gaussian noise.
# Computed after the outliers are injected, so it inherits them.
df['income_noisy'] = df['income'] * 1.1 + np.random.normal(0, 1000, N_ROWS)

## 1. Quick Data Info

Get a high-level summary of the dataset.

In [None]:
# Fetch the dataset summary and print it one "name: value" line per entry.
# The result is used as a mapping here (it must provide .items()).
info = mkyz.data_info(df)
for field_name, field_value in info.items():
    print(f"{field_name}: {field_value}")

## 2. Quick EDA Report

Print a formatted summary and recommendations to the console.

In [None]:
# Run the quick EDA report on the sample frame, with 'label' as the target.
mkyz.quick_eda(
    df,
    target_column='label',
)

## 3. High-Level Profiling with DataProfile

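Use the `DataProfile` class for more granular control: generate the profile once, then query it for recommendations, per-column details, and report export, as shown in the following sections.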

In [None]:
# Build the profile object for the sample frame and run its analysis.
# `profile` is reused by the later cells (column detail, HTML export).
profile = mkyz.DataProfile(df, target_column='label')
profile.generate()

# Header first, then one bullet line per recommendation returned by the profile.
print("Top Recommendations:")
recommendations = profile.get_recommendations()
for recommendation in recommendations:
    print(f"- {recommendation}")

## 4. Column Detail

Deep dive into a specific column's profile entry. The example prints every field for `age` except its `stats` entry.

In [None]:
# Look up the profile entry for the 'age' column and print its fields.
age_info = profile.get_column_info('age')
print("Age Statistics:")
for field, value in age_info.items():
    # The 'stats' entry is deliberately left out of the printout
    # (presumably a larger nested structure -- confirm against the mkyz API).
    if field == 'stats':
        continue
    print(f"{field}: {value}")

## 5. Export HTML Report

Generate a beautiful standalone HTML report.

In [None]:
# Export the generated profile to an HTML file and echo whatever path the
# exporter reports back. The relative filename is presumably resolved against
# the current working directory -- confirm against the mkyz API.
report_filename = 'eda_report.html'
report_path = profile.export_report(report_filename)
print(f"Report saved to: {report_path}")