# 02 - Data Cleaning & Harmonization

This notebook cleans and harmonizes raw USITC trade data for analysis.

**Key Steps:**
1. Load raw USITC data files (wide format)
2. Transform from wide to long format
3. Standardize country names
4. Apply inflation adjustment (convert to real dollars)
5. Calculate derived metrics (shares, growth rates)
6. Validate the processed data (missing values, negative values, year coverage)
7. Save processed dataset

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Add src to path
# Inserting at index 0 gives modules found in that directory precedence over
# any installed package of the same name.
# NOTE(review): this resolves to <parent of current working directory>/src, so it
# assumes the kernel is started one level below the project root — confirm.
sys.path.insert(0, str(Path.cwd().parent / 'src'))

# Project-local helpers (presumably resolved through the sys.path entry added above).
# NOTE(review): save_processed_data is imported here but is not called in the cells
# visible in this notebook — verify whether it is still needed.
from data_loader import save_processed_data, DATA_RAW, DATA_PROCESSED, DATA_REFERENCE
from classification_mapper import standardize_country_names, add_historical_period
from transformers import calculate_country_shares, calculate_yoy_growth, adjust_for_inflation

# Reaching this line means every import above succeeded.
print("Modules loaded successfully")

## Step 1: Load Raw USITC Data Files

In [None]:
# Locate the two raw USITC extracts inside the 'usitc' folder of the raw-data directory
usitc_dir = DATA_RAW / 'usitc'
imports_file = usitc_dir.joinpath('imports_1995_2024.csv')
exports_file = usitc_dir.joinpath('exports_1995_2024.csv')

# Report whether each input file is present on disk
print(
    f"Imports file exists: {imports_file.exists()}",
    f"Exports file exists: {exports_file.exists()}",
    sep="\n",
)

In [None]:
# Read both wide-format extracts from disk
imports_wide = pd.read_csv(imports_file)
exports_wide = pd.read_csv(exports_file)

# Row count is reported as "countries", column count as "columns"
print(
    f"Imports: {len(imports_wide)} countries, {len(imports_wide.columns)} columns",
    f"Exports: {len(exports_wide)} countries, {len(exports_wide.columns)} columns",
    sep="\n",
)

# Preview the first five rows of each table
print("\nImports sample:")
display(imports_wide.head(5))

print("\nExports sample:")
display(exports_wide.head(5))

## Step 2: Transform from Wide to Long Format

In [None]:
def wide_to_long(df_wide: pd.DataFrame, trade_type: str, value_scale: float = 1e9) -> pd.DataFrame:
    """
    Transform USITC wide-format data to long format.

    The first column is treated as the country label. Every other column whose
    label is a whole-number year (``1995``, ``'1995'`` or ``'1995.0'``) is treated
    as a yearly value column; all remaining columns are ignored. Labels such as
    ``'1995.1'`` (the suffix pandas adds to duplicated headers) are NOT treated
    as years.

    Args:
        df_wide: DataFrame with Country column and year columns (1995, 1996, ...)
        trade_type: 'import' or 'export'
        value_scale: Multiplier applied to the parsed values. Defaults to 1e9,
            i.e. the source values are treated as billions of USD.

    Returns:
        Long-format DataFrame with columns: country, year, value, trade_type.
        Values that cannot be parsed as numbers become NaN. The index is the
        melted row index with the excluded rows removed (it is not reset).

    Raises:
        ValueError: If two different columns resolve to the same year.
    """
    def _year_from_label(label):
        """Return the integer year encoded by a column label, or None if it is not a year."""
        whole, _, fraction = str(label).strip().partition('.')
        # Accept '1995' and '1995.0' but reject '1995.1', '1995.5', 'Notes', ...
        if whole.isdecimal() and fraction.strip('0') == '':
            return int(whole)
        return None

    # Get the country column name (first column)
    country_col = df_wide.columns[0]

    # Map each year-like column label to its integer year
    year_by_label = {}
    for label in df_wide.columns[1:]:
        year = _year_from_label(label)
        if year is not None:
            year_by_label[label] = year

    # Two columns for the same year would silently duplicate (country, year) rows
    years = list(year_by_label.values())
    if len(set(years)) != len(years):
        raise ValueError(
            f"Multiple columns resolve to the same year: {sorted(year_by_label, key=str)}"
        )

    # Keep only the country and year columns, then melt from wide to long
    year_cols = list(year_by_label)
    df_long = (
        df_wide[[country_col] + year_cols]
        .melt(id_vars=[country_col], var_name='year', value_name='value')
        .rename(columns={country_col: 'country'})
    )

    # Add trade type
    df_long['trade_type'] = trade_type

    # Convert the melted column labels to integer years via the parsed mapping
    # (a plain astype(int) fails on string labels such as '1995.0')
    df_long['year'] = df_long['year'].map(year_by_label).astype(int)

    # Convert value to numeric: drop thousands separators and stray quotes,
    # coerce anything unparseable to NaN, then apply the unit multiplier
    cleaned = (
        df_long['value']
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('"', '', regex=False)
    )
    df_long['value'] = pd.to_numeric(cleaned, errors='coerce') * value_scale

    # Filter out Total row and other non-country rows (compared after trimming whitespace)
    exclude_rows = ['Total:', 'Total', 'Unspecified', 'Transshipment', 'Internat Organization']
    keep = ~df_long['country'].str.strip().isin(exclude_rows)

    # Filtering is the last step and returns an explicit copy, so neither this
    # function nor its callers assign into a slice (avoids SettingWithCopyWarning)
    return df_long.loc[keep].copy()

# Reshape each wide table into long form, tagging every row with its trade direction
imports_long = wide_to_long(imports_wide, trade_type='import')
exports_long = wide_to_long(exports_wide, trade_type='export')

# Row counts after reshaping
print(
    f"Imports (long): {imports_long.shape[0]:,} rows",
    f"Exports (long): {exports_long.shape[0]:,} rows",
    sep="\n",
)

# Preview the first ten long-format import rows
print("\nImports sample:")
display(imports_long.iloc[:10])

In [None]:
# Stack import rows on top of export rows; ignore_index gives the result a fresh index
trade_df = pd.concat((imports_long, exports_long), ignore_index=True)

print(f"Combined dataset: {trade_df.shape[0]:,} rows")

# Number of rows per trade type
print("\nTrade type distribution:")
print(trade_df['trade_type'].value_counts())

# Coverage of the combined table
print(
    f"\nYear range: {trade_df['year'].min()} - {trade_df['year'].max()}",
    f"Unique countries: {trade_df['country'].nunique()}",
    sep="\n",
)

## Step 3: Standardize Country Names

In [None]:
# Run the project's country-name standardization over the 'country' column
trade_df = standardize_country_names(trade_df, country_col='country')

print(f"Unique countries after standardization: {trade_df['country'].nunique()}")

# Sum 'value' per country across all rows (both trade types, all years)
# and keep the 20 largest totals
top_countries = (
    trade_df
    .groupby('country')['value']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
print("\nTop 20 trading partners (total trade 1995-2024):")
for i, (country, value) in enumerate(top_countries.items(), start=1):
    # Totals are shown in trillions of dollars
    print(f"  {i:2}. {country}: ${value/1e12:.2f} trillion")

## Step 4: Apply Inflation Adjustment

In [None]:
# Read the GDP deflator reference table used for the inflation adjustment
deflator_df = pd.read_csv(DATA_REFERENCE.joinpath('gdp_deflator.csv'))
print(f"GDP Deflator loaded: {deflator_df.shape[0]} years")

# Show the last ten rows of the table
display(deflator_df.iloc[-10:])

In [None]:
# Inflation-adjust the 'value' column against the deflator table, with 2020 as base year
trade_df = adjust_for_inflation(
    trade_df, deflator_df, value_col='value', year_col='year', base_year=2020
)

# Spot-check: China's earliest rows, nominal value next to the adjusted 'value_real'
print("Sample with real values:")
sample = trade_df.loc[trade_df['country'].eq('China')].sort_values(by='year')
display(sample.loc[:, ['year', 'country', 'trade_type', 'value', 'value_real']].head(10))

## Step 5: Calculate Derived Metrics

In [None]:
# Compute each country's share from the inflation-adjusted values
# (the original cell describes this as % of total trade by year and trade type)
trade_df = calculate_country_shares(
    trade_df,
    value_col='value_real', country_col='country',
    year_col='year', trade_type_col='trade_type',
)

# Ten largest import rows of 2024, ranked by share_pct
print("Top import sources in 2024 (by share):")
imports_2024 = trade_df.loc[trade_df['year'].eq(2024) & trade_df['trade_type'].eq('import')]
imports_2024_top = (
    imports_2024
    .nlargest(n=10, columns='share_pct')
    .loc[:, ['country', 'value_real', 'share_pct']]
)
display(imports_2024_top)

In [None]:
# Compute year-over-year growth from the inflation-adjusted values
trade_df = calculate_yoy_growth(
    trade_df,
    value_col='value_real', country_col='country',
    year_col='year', trade_type_col='trade_type',
)

# Spot-check: the ten most recent years of imports from China
print("China import growth over time:")
china_imports = (
    trade_df
    .loc[trade_df['country'].eq('China') & trade_df['trade_type'].eq('import')]
    .sort_values(by='year')
)
display(china_imports.loc[:, ['year', 'value_real', 'share_pct', 'yoy_growth_pct']].iloc[-10:])

In [None]:
# Label every row with a historical period derived from its year
trade_df = add_historical_period(trade_df, year_col='year')

# For each period: first year, last year and number of rows
print("Historical periods in data:")
period_years = trade_df.groupby('period')['year']
print(period_years.agg(['min', 'max', 'count']))

In [None]:
# List the columns of the finished table
print("Final columns:", trade_df.columns.tolist(), sep="\n")

# Overall size: (rows, columns) and deep memory footprint in megabytes
print(
    f"\nFinal dataset shape: {trade_df.shape}",
    f"Memory usage: {trade_df.memory_usage(deep=True).sum() / 1e6:.1f} MB",
    sep="\n",
)

## Step 6: Data Validation

In [None]:
# Check for missing values (per-column count of NaN/None)
print("Missing values:")
print(trade_df.isnull().sum())

# Check for infinite values: isnull() does not count +/-inf, which a percentage
# change from a zero base can produce, so report them separately per numeric column
print("\nInfinite values:")
print(trade_df.select_dtypes(include='number').isin([np.inf, -np.inf]).sum())

# Check for negative values in the nominal trade value
print(f"\nNegative trade values: {(trade_df['value'] < 0).sum()}")

# Check key uniqueness: each (country, year, trade_type) should appear once.
# NOTE(review): duplicates could appear if several raw names are standardized
# to the same country — if this is non-zero, aggregate before computing shares/growth.
duplicate_keys = trade_df.duplicated(subset=['country', 'year', 'trade_type']).sum()
print(f"\nDuplicate country/year/trade_type rows: {duplicate_keys}")

# Check year coverage (.tolist() yields plain Python ints so the printed list
# stays readable under NumPy >= 2, whose scalar repr is 'np.int64(1995)')
print(f"\nYears covered: {sorted(trade_df['year'].unique().tolist())}")

In [None]:
# Sanity check: yearly import totals, for eyeballing against known figures
print("Total US imports by year (billions USD, nominal):")
annual_imports = (
    trade_df
    .loc[trade_df['trade_type'] == 'import']
    .groupby('year')['value']
    .sum()
    .div(1e9)  # nominal dollars -> billions
)
# Show the last ten years, rounded to one decimal
print(annual_imports.iloc[-10:].round(1))

## Step 7: Save Processed Data

In [None]:
# Create the processed-data directory (and any missing parents) if needed
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# Write the combined table without the index column, then report path and size on disk
output_file = DATA_PROCESSED.joinpath('trade_data_1995_2024.csv')
trade_df.to_csv(output_file, index=False)
print(
    f"Saved processed data to: {output_file}",
    f"File size: {output_file.stat().st_size / 1e6:.1f} MB",
    sep="\n",
)

# Split by trade direction and write each subset to its own file
imports_processed = trade_df.loc[trade_df['trade_type'].eq('import')]
exports_processed = trade_df.loc[trade_df['trade_type'].eq('export')]

imports_processed.to_csv(DATA_PROCESSED.joinpath('imports_processed.csv'), index=False)
exports_processed.to_csv(DATA_PROCESSED.joinpath('exports_processed.csv'), index=False)

print(
    f"\nSaved {imports_processed.shape[0]:,} import records",
    f"Saved {exports_processed.shape[0]:,} export records",
    sep="\n",
)

## Summary

In [None]:
# Banner
print("=" * 60, "DATA PROCESSING COMPLETE", "=" * 60, sep="\n")

# Headline figures for the processed table
print(
    f"\nTotal records: {trade_df.shape[0]:,}",
    f"Years: {trade_df['year'].min()} - {trade_df['year'].max()}",
    f"Countries: {trade_df['country'].nunique()}",
    f"Trade types: {trade_df['trade_type'].unique().tolist()}",
    sep="\n",
)

# Files written by the save step, one per line
print("\nOutput files:")
print(
    *(
        f"  - {DATA_PROCESSED / name}"
        for name in ('trade_data_1995_2024.csv', 'imports_processed.csv', 'exports_processed.csv')
    ),
    sep="\n",
)
print("\n-> Proceed to 03_exploratory_analysis.ipynb")

# End of notebook