# 03 - Exploratory Data Analysis

Explore the cleaned US trade data to understand patterns, top trading partners, and data quality.

**Key Analyses:**
1. Overview of trade volumes over time
2. Top trading partners
3. Trade concentration (HHI)
4. Historical period comparisons

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
import sys

# Put `<parent of the current working directory>/src` at the front of the
# module search path. The path is derived from the working directory, so this
# assumes the notebook is launched from a folder that sits next to `src`
# (e.g. `notebooks/`) — TODO confirm.
sys.path.insert(0, str(Path.cwd().parent / 'src'))

# Project modules; presumably these live under `src` and only resolve once the
# path entry above is in place — verify against the repository layout.
from data_loader import DATA_PROCESSED, DATA_REFERENCE
from analysis import calculate_hhi

# Plotly display settings: make "plotly_white" the default figure template
import plotly.io as pio
pio.templates.default = "plotly_white"

print("Libraries loaded")

## Load Processed Data

In [None]:
# Load processed data (uncomment when data is available)
# imports_df = pd.read_csv(DATA_PROCESSED / 'imports_processed.csv')
# exports_df = pd.read_csv(DATA_PROCESSED / 'exports_processed.csv')

# Placeholder until the processed files exist: print the column layout that
# the helper functions in this notebook expect, one line per column.
print("\n".join((
    "Expected columns in processed data:",
    "  - year: int (1995-2025)",
    "  - country: str (standardized country names)",
    "  - value: float (nominal USD)",
    "  - value_real: float (real USD, 2017 base)",
    "  - share: float (0-1)",
    "  - share_pct: float (0-100)",
    "  - yoy_growth: float",
    "  - period: str (historical period)",
)))

## 1. Total Trade Over Time

## 2. Top Trading Partners

## 3. Country Shares Over Time

## 4. Trade Concentration (HHI)

## 5. Stacked Area Chart

See the visualization code in the following cells.

## Next Steps
Proceed to **04_country_shift_analysis.ipynb** for detailed country shift analysis.

In [None]:
# Key visualization functions for EDA

def get_top_partners(df, year, n=10):
    """Return the ``n`` largest trading partners of a single year.

    Rows of ``df`` whose ``year`` column equals ``year`` are ranked by
    ``value_real`` in descending order and the first ``n`` are kept. The
    result contains only the ``country``, ``value_real`` and ``share_pct``
    columns and retains the original row index.
    """
    output_columns = ['country', 'value_real', 'share_pct']
    in_year = df['year'].eq(year)
    ranked = df.loc[in_year].nlargest(n, 'value_real')
    return ranked.loc[:, output_columns]

def plot_country_shares(df, countries, title="Country Import Shares Over Time"):
    """Build a line chart of ``share_pct`` by ``year``, one line per country.

    Only rows of ``df`` whose ``country`` value is in ``countries`` are
    plotted. The Plotly Express figure is returned, not shown.
    """
    selected_rows = df.loc[df['country'].isin(countries)]
    axis_labels = {'share_pct': 'Share of Total Imports (%)', 'year': 'Year'}
    return px.line(
        selected_rows,
        x='year',
        y='share_pct',
        color='country',
        title=title,
        labels=axis_labels,
    )

def plot_stacked_shares(df, top_n=10, title="Import Source Composition Over Time"):
    """Build a stacked area chart of ``share_pct`` for the leading countries.

    The ``top_n`` countries with the largest ``value_real`` summed over all
    rows of ``df`` are selected. Their ``share_pct`` values are pivoted to one
    column per country indexed by ``year`` (summed per cell, missing cells
    filled with 0), and each column becomes one stacked trace, added in the
    pivot's column order. The figure is returned, not shown.
    """
    totals_by_country = df.groupby('country')['value_real'].sum()
    leaders = totals_by_country.nlargest(top_n).index.tolist()
    leader_rows = df[df['country'].isin(leaders)].copy()
    shares_wide = leader_rows.pivot_table(
        index='year',
        columns='country',
        values='share_pct',
        aggfunc='sum',
    ).fillna(0)

    fig = go.Figure()
    for country_name, yearly_share in shares_wide.items():
        # A shared stackgroup stacks the traces; mode='none' hides lines and markers.
        area = go.Scatter(
            x=shares_wide.index,
            y=yearly_share,
            name=country_name,
            stackgroup='one',
            mode='none',
        )
        fig.add_trace(area)
    fig.update_layout(title=title, xaxis_title='Year', yaxis_title='Share (%)')
    return fig

# Countries singled out for tracking in the analysis
key_countries = [
    'China',
    'Mexico',
    'Canada',
    'Japan',
    'Germany',
    'Vietnam',
    'South Korea',
    'Taiwan',
    'India',
    'United Kingdom',
]

print("Visualization functions defined")