# Exploratory Data Analysis

This notebook demonstrates basic exploratory data analysis for the disco-baa-01 project.

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from disco_baa_01.utils import describe_data, set_random_seed

# Plot appearance: seaborn's 'whitegrid' style and a 10 x 6 default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Fix the random seed through the project helper so reruns are reproducible
set_random_seed(42)

## 1. Data Loading

Load your data here. For this example, we generate a small synthetic (randomly sampled) dataset instead.

In [None]:
# Build a synthetic flock: one row per sheep, values drawn from NumPy's global RNG
n_samples = 100
breeds = ['Merino', 'Suffolk', 'Dorset', 'Romney']

# Keyword arguments are evaluated left to right, so the RNG is consumed in the
# order weight -> wool -> age -> breed
data = dict(
    sheep_id=range(1, n_samples + 1),
    weight_kg=np.random.normal(loc=70, scale=10, size=n_samples),
    wool_weight_kg=np.random.normal(loc=5, scale=1, size=n_samples),
    age_years=np.random.randint(low=1, high=10, size=n_samples),  # `high` is exclusive: ages 1-9
    breed=np.random.choice(breeds, size=n_samples),
)

df = pd.DataFrame(data)
print(f"Dataset shape: {df.shape}")
# Bare last expression so the notebook renders the first rows as a table
df.head()

## 2. Data Overview

In [None]:
# Summarise the frame with the project helper, then print each entry it returns
stats = describe_data(df)
print("Data Statistics:")
for section, payload in stats.items():
    # Blank line, "<name>:" header, then the entry's own string form on the next line
    print(f"\n{section}:", payload, sep="\n")

In [None]:
# Display summary statistics
# describe() is called with its default arguments; per the pandas docs that
# summarises only the numeric columns of a mixed-dtype frame, so the string
# column 'breed' is presumably absent from this table — TODO confirm.
df.describe()

## 3. Data Visualization

In [None]:
# Histogram of body weight across the flock (20 bins)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['weight_kg'], bins=20, edgecolor='black', alpha=0.7)
ax.set(
    xlabel='Weight (kg)',
    ylabel='Frequency',
    title='Distribution of Sheep Weights',
)
plt.show()

In [None]:
# Scatter of wool weight against body weight, one colour per breed
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='weight_kg',
    y='wool_weight_kg',
    hue='breed',
    alpha=0.6,
    ax=ax,
)
ax.set(
    xlabel='Weight (kg)',
    ylabel='Wool Weight (kg)',
    title='Sheep Weight vs Wool Weight by Breed',
)
plt.show()

In [None]:
# Bar chart of how many sheep belong to each breed
fig, ax = plt.subplots(figsize=(10, 6))
breed_counts = df['breed'].value_counts()
breed_counts.plot.bar(edgecolor='black', ax=ax)
# Labels are set after plotting so they replace any axis label pandas applied
ax.set(
    xlabel='Breed',
    ylabel='Count',
    title='Distribution of Sheep Breeds',
)
plt.xticks(rotation=45)
plt.show()

## 4. Next Steps

- Replace sample data with real data
- Perform feature engineering
- Build predictive models
- Evaluate model performance