# Data Exploration Notebook
## ML Model Serving API - Project 5

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

# Load the iris sample data into a single DataFrame: the feature columns,
# the integer class code ('target'), and a readable class label ('species').
iris = load_iris()
df = (
    pd.DataFrame(iris.data, columns=iris.feature_names)
    .assign(target=iris.target)
    .assign(
        species=lambda frame: frame['target'].map(
            {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
        )
    )
)

print("Dataset shape:", df.shape)
# Trailing expression so the notebook renders the first rows as cell output
df.head()

In [None]:
# Basic statistics: describe() summary of the DataFrame, followed by the
# number of rows per species label.
print("Basic Statistics:", df.describe(), sep="\n")

print("\nTarget distribution:", df['species'].value_counts(), sep="\n")

In [None]:
# Visualization: one histogram panel per feature, with the species
# overlaid in each panel so their distributions can be compared.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, feature in zip(axes.flat, iris.feature_names):
    # Compute the bin edges once from the full column so every species is
    # drawn with the same bins. Letting each hist() call choose its own
    # bins gives different bin widths per species, which makes the
    # overlaid bar heights not comparable.
    bin_edges = np.histogram_bin_edges(df[feature], bins=10)
    for species in df['species'].unique():
        species_data = df.loc[df['species'] == species, feature]
        ax.hist(species_data, bins=bin_edges, alpha=0.7, label=species)
    ax.set_title(f'Distribution of {feature}')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis: pairwise correlation of the feature columns,
# drawn as an annotated heatmap with the colour scale centred on zero.
corr_matrix = df.loc[:, iris.feature_names].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
# Save processed data
from pathlib import Path

# Single source of truth for the destination so the file written and the
# message printed cannot drift apart.
output_path = '../data/processed/iris_processed.csv'

# to_csv does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")