# Exploratory Data Analysis (EDA)
## Weather-Driven Disease Outbreak Predictor

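This notebook explores the outbreak dataset (class balance, temporal and geographic patterns) and then demonstrates fetching weather data, engineering features from it, and inspecting feature correlations.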

In [None]:
# Import required libraries
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import config
from weather_api import WeatherAPI
from features import FeatureEngine

# Notebook-wide plotting defaults: a wide default canvas for matplotlib
# figures and seaborn's "whitegrid" look for every chart drawn below.
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_style(style="whitegrid")

print("Libraries imported successfully!")

## 1. Load Outbreak Data

Load and examine the outbreak dataset.

In [None]:
# Load outbreak data
# The CSV location comes from config.OUTBREAKS_CSV, joined onto '..'
# (the directory above the notebook's working directory).
outbreak_path = os.path.join('..', config.OUTBREAKS_CSV)

if os.path.exists(outbreak_path):
    df = pd.read_csv(outbreak_path)
    # Parse 'date' into datetimes; the temporal cells use it as a
    # DatetimeIndex and read `.dt.month` from it.
    df['date'] = pd.to_datetime(df['date'])

    print(f"Dataset shape: {df.shape}")
    print("\nFirst few rows:")
    display(df.head())

    print("\nDataset info:")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in display() -- that rendered a stray "None" underneath.
    df.info()
else:
    # Leave `df` undefined on purpose: the cells below check for it and
    # skip themselves when the data file is missing.
    print(f"Data file not found at {outbreak_path}")
    print("Run: python scripts/data_ingest.py --samples 1000")

## 2. Data Overview and Statistics

In [None]:
def outbreak_distribution_lines(class_counts, total_rows):
    """Format the two outbreak-class summary lines.

    Parameters
    ----------
    class_counts : pandas.Series
        Result of ``value_counts()`` on the outbreak column, indexed by
        class code (0 = no outbreak, 1 = outbreak).
    total_rows : int
        Denominator used for the percentages.

    Returns
    -------
    list of str
        ``["No Outbreak: <n> (<pct>%)", "Outbreak: <n> (<pct>%)"]``. A class
        that never occurs is reported as 0 instead of raising ``KeyError``,
        and an empty dataset yields 0.0% instead of dividing by zero.
    """
    lines = []
    for label, code in (("No Outbreak", 0), ("Outbreak", 1)):
        # .get() tolerates a class that is absent from the data.
        count = class_counts.get(code, 0)
        share = count / total_rows * 100 if total_rows else 0.0
        lines.append(f"{label}: {count} ({share:.1f}%)")
    return lines


if 'df' in locals():
    # Basic statistics
    print("Dataset Statistics:")
    display(df.describe())

    # Outbreak distribution
    print("\n\nOutbreak Distribution:")
    outbreak_counts = df['outbreak'].value_counts()
    for line in outbreak_distribution_lines(outbreak_counts, len(df)):
        print(line)

    # Disease distribution
    print("\n\nDisease Distribution:")
    print(df['disease'].value_counts())

## 3. Visualizations

In [None]:
def outbreak_class_counts(outbreak_col):
    """Count rows per outbreak class in the fixed order [0, 1].

    ``value_counts()`` sorts by descending frequency, so its row order
    depends on the data. Reindexing pins the order to (no outbreak, outbreak)
    so positional labels and colours always match the wedge they describe.
    A class that does not occur gets a count of 0.

    Parameters
    ----------
    outbreak_col : pandas.Series
        Outbreak indicator column holding 0/1 codes.

    Returns
    -------
    pandas.Series
        Counts indexed by ``[0, 1]``.
    """
    return outbreak_col.value_counts().reindex([0, 1], fill_value=0)


if 'df' in locals():
    # Two side-by-side panels: class balance (pie) and disease mix (bar).
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Pie chart -- labels/colours are positional, hence the fixed class order.
    outbreak_class_counts(df['outbreak']).plot(
        kind='pie',
        ax=axes[0],
        autopct='%1.1f%%',
        labels=['No Outbreak', 'Outbreak'],
        colors=['#27ae60', '#e74c3c']
    )
    axes[0].set_title('Outbreak Distribution')
    # Drop the y-label that pandas would otherwise draw beside the pie.
    axes[0].set_ylabel('')

    # Disease distribution
    df['disease'].value_counts().plot(
        kind='bar',
        ax=axes[1],
        color='#3498db'
    )
    axes[1].set_title('Disease Type Distribution')
    axes[1].set_xlabel('Disease')
    axes[1].set_ylabel('Count')
    axes[1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## 4. Temporal Patterns

In [None]:
def calendar_month_rates(months, outcomes):
    """Average `outcomes` per calendar month, always indexed 1..12.

    Parameters
    ----------
    months : pandas.Series
        Calendar month number (1-12) for each record.
    outcomes : pandas.Series
        Values to average within each month, aligned with `months`.

    Returns
    -------
    pandas.Series
        Mean outcome per month with index 1..12. Months with no records are
        NaN, so bar position i always corresponds to month i + 1.
    """
    return outcomes.groupby(months).mean().reindex(range(1, 13))


if 'df' in locals():
    # Outbreaks over time
    df_time = df.set_index('date')
    # Month-end bins. An offset object is used instead of the 'M' string
    # alias, which pandas >= 2.2 deprecates in favour of 'ME'; the object
    # form works on both older and newer versions.
    monthly_outbreaks = df_time.resample(pd.offsets.MonthEnd())['outbreak'].sum()

    plt.figure(figsize=(14, 5))
    monthly_outbreaks.plot(kind='line', marker='o', color='#e74c3c', linewidth=2)
    plt.title('Monthly Outbreak Count Over Time', fontsize=14, fontweight='bold')
    plt.xlabel('Date')
    plt.ylabel('Number of Outbreaks')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Seasonal analysis
    # Re-running this cell simply overwrites the same 'month' column.
    df['month'] = df['date'].dt.month
    # Reindexed to all twelve months: the tick labels below are assigned by
    # bar position, so a month missing from the data must still occupy a slot
    # or every later bar would carry the wrong month name.
    monthly_outbreak_rate = calendar_month_rates(df['month'], df['outbreak'])

    plt.figure(figsize=(12, 5))
    monthly_outbreak_rate.plot(kind='bar', color='#f39c12')
    plt.title('Average Outbreak Rate by Month', fontsize=14, fontweight='bold')
    plt.xlabel('Month')
    plt.ylabel('Outbreak Rate')
    plt.xticks(range(12), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], rotation=45)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()

## 5. Geographic Distribution

In [None]:
if 'df' in locals():
    # Plot every record at its (lon, lat) position, coloured by outbreak flag.
    fig, ax = plt.subplots(figsize=(12, 8))

    is_outbreak = df['outbreak'] == 1
    outbreak_yes = df[is_outbreak]
    outbreak_no = df[df['outbreak'] == 0]

    # (rows, marker styling) per layer; the outbreak points are plotted
    # after the no-outbreak points.
    scatter_layers = [
        (outbreak_no, dict(c='green', alpha=0.3, s=20, label='No Outbreak')),
        (outbreak_yes, dict(c='red', alpha=0.6, s=50, label='Outbreak')),
    ]
    for layer_rows, marker_style in scatter_layers:
        ax.scatter(layer_rows['lon'], layer_rows['lat'], **marker_style)

    ax.set(xlabel='Longitude', ylabel='Latitude')
    ax.set_title('Geographic Distribution of Outbreaks', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

## 6. Fetch Sample Weather Data

Fetch weather data for a sample location (Mumbai, India) to demonstrate the weather API used in the later sections.

In [None]:
# Pull weather for one sample location through the project's WeatherAPI.
weather_api = WeatherAPI()

# Sample coordinates: Mumbai, India
lat, lon = 19.0760, 72.8777

print(f"Fetching weather data for ({lat}, {lon})...")

# Best-effort demo: any failure is reported instead of stopping the notebook.
# `weather_df` is only bound on success, and the cells below check for it.
try:
    weather_df, location_meta = weather_api.get_weather_for_prediction(lat, lon)

    first_ts = weather_df.index.min()
    last_ts = weather_df.index.max()
    print(f"Weather data fetched: {len(weather_df)} records")
    print(f"Date range: {first_ts} to {last_ts}")
    print("\nLocation metadata:", location_meta, sep="\n")

    print("\nWeather data preview:")
    display(weather_df.head(10))

except Exception as exc:
    print(f"Error fetching weather: {exc}")

## 7. Weather Data Visualization

In [None]:
if 'weather_df' in locals():
    # One panel per weather variable on a 3x2 grid, filled row by row.
    fig, axes = plt.subplots(3, 2, figsize=(15, 12))

    # (column, line colour, panel title, y-axis unit)
    weather_panels = [
        ('temperature', '#e74c3c', 'Temperature (°C)', '°C'),
        ('humidity', '#3498db', 'Relative Humidity (%)', '%'),
        ('precipitation', '#9b59b6', 'Precipitation (mm)', 'mm'),
        ('wind_speed', '#1abc9c', 'Wind Speed (km/h)', 'km/h'),
        ('pressure', '#f39c12', 'Pressure (hPa)', 'hPa'),
        ('cloud_cover', '#95a5a6', 'Cloud Cover (%)', '%'),
    ]

    # axes.flat walks the grid in row-major order, matching the list above.
    for ax, (column, colour, panel_title, unit) in zip(axes.flat, weather_panels):
        weather_df[column].plot(ax=ax, color=colour, linewidth=1)
        ax.set_title(panel_title)
        ax.set_ylabel(unit)
        ax.grid(True, alpha=0.3)
        ax.set_xlabel('Date/Time')

    plt.tight_layout()
    plt.show()

## 8. Feature Engineering Demo

In [None]:
if 'weather_df' in locals():
    # Run the project's FeatureEngine over the fetched weather frame.
    feature_engine = FeatureEngine()
    feature_df = feature_engine.engineer_features(weather_df)

    feature_names = feature_df.columns.tolist()
    print(f"Engineered {len(feature_names)} features")
    # Only the first 20 names are listed to keep the output short.
    print("\nFeature names:", feature_names[:20], sep="\n")

    print("\n\nFeature statistics:")
    display(feature_df.describe())

## 9. Correlation Analysis

In [None]:
if 'feature_df' in locals():
    # Restrict the heatmap to at most ten columns whose name contains
    # 'daily_mean' so the annotated matrix stays readable.
    daily_mean_columns = [name for name in feature_df.columns if 'daily_mean' in name]
    sample_features = daily_mean_columns[:10]

    # Nothing is drawn when no column matches.
    if len(sample_features) > 0:
        corr_matrix = feature_df[sample_features].corr()

        heatmap_options = dict(
            annot=True,
            fmt='.2f',
            cmap='coolwarm',
            center=0,
            square=True,
            linewidths=1,
        )
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, **heatmap_options)
        plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()

## Summary

This notebook demonstrated:
1. Loading and exploring outbreak data
2. Visualizing temporal and geographic patterns
3. Fetching real-time weather data
4. Engineering features for ML models
5. Analyzing correlations between features

Next steps:
- Train ML models with engineered features
- Evaluate model performance
- Deploy predictions via Flask app