# In [3]:
import pandas as pd
import numpy as np

# Read the combined climate litigation CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='combined_climate_data (1).csv')

# 2. TAXONOMY MAPPING (sectors_unified)

# Indicator column -> standardized sector label. Built once at import time
# rather than on every row; several indicators intentionally share a label.
_SECTOR_LABELS = {
    'agriculture': "1. Agriculture & Livestock",
    'forestry': "2. Forestry Timber & Logging",
    'fishing': "4. Fishing & Aquaculture",
    'extractive': "5. Mining & Quarrying",
    'manufacturing': "10. Manufacturing & Processing",
    'energy_supply': "9. Fossil Energy & Utilities",
    'water_management': "9. Fossil Energy & Utilities",
    'enviromental_management': "3. Forestry - Carbon & Conservation",
    'construction': "11. Construction & Real Estate",
    'trade': "14. Finance & Business Services",
    'transportation_logistics': "12. Transportation & Logistics",
    'media': "13. ICT & Media",
    'finance_insurance': "14. Finance & Business Services",
    'public': "16. Public & Social Services",
    'education': "16. Public & Social Services",
    'informal': "16. Public & Social Services",
}


def _label_number(label):
    """Return the integer prefix of a label such as "10. Manufacturing"."""
    return int(label.split(".", 1)[0])


def get_sectors_unified(row):
    """Build the unified sector string for one record.

    Args:
        row: Mapping-like record (for example a DataFrame row) that holds
            every indicator column listed in ``_SECTOR_LABELS``.

    Returns:
        The distinct labels of all indicators equal to 1, joined with "; "
        and ordered by their numeric prefix, or "N/A" when no indicator
        is set.

    Raises:
        KeyError: If an indicator column is missing from ``row``.
    """
    labels = {
        label for col, label in _SECTOR_LABELS.items() if row[col] == 1
    }
    if not labels:
        return "N/A"
    # Sort on the numeric prefix: a plain string sort would place
    # "10. ..." ahead of "2. ...".
    return "; ".join(sorted(labels, key=_label_number))

# Label every record with its "; "-joined sector names, computed row by row.
df = df.assign(sectors_unified=df.apply(get_sectors_unified, axis=1))

# 3. BINARY ENCODING (NACE Letters)

# Fold the 16 source indicators into one column per NACE letter
# (sector_A to sector_O). A letter fed by several indicators takes their
# row-wise maximum, so for 0/1 indicators it is 1 when any of them is 1.
# Letters I, L, M and N have no source indicator here and are filled with 0.
df = df.assign(
    sector_A=df[['agriculture', 'forestry', 'fishing']].max(axis=1),
    sector_B=df['extractive'],
    sector_C=df['manufacturing'],
    sector_D=df['energy_supply'],
    sector_E=df[['enviromental_management', 'water_management']].max(axis=1),
    sector_F=df['construction'],
    sector_G=df['trade'],
    sector_H=df['transportation_logistics'],
    sector_I=0,
    sector_J=df['media'],
    sector_K=df['finance_insurance'],
    sector_L=0,
    sector_M=0,
    sector_N=0,
    sector_O=df[['public', 'education', 'informal']].max(axis=1),
)

# The raw indicator columns are now redundant: only the unified label and
# the NACE letter columns are kept alongside the remaining fields.
old_cols = [
    'agriculture',
    'construction',
    'education',
    'energy_supply',
    'enviromental_management',
    'extractive',
    'finance_insurance',
    'fishing',
    'forestry',
    'informal',
    'manufacturing',
    'media',
    'public',
    'trade',
    'transportation_logistics',
    'water_management',
]
df = df.drop(labels=old_cols, axis=1)

# 4. FINAL EXPORT

# Write the result without the index column, then report the final shape.
df.to_csv(path_or_buf='climate_litigation_final_unified.csv', index=False)

print(
    f"Pipeline complete. Dimensions: {df.shape}",
    "Final file saved with 'sectors_unified' and standardized NACE binaries.",
    sep="\n",
)

# Output:
# Pipeline complete. Dimensions: (1543, 21)
# Final file saved with 'sectors_unified' and standardized NACE binaries.
