In [1]:
import os
import re
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd



In [None]:
# Combine the per-category CSV downloads into one table with a single row per
# case ('Bundle ID(s)') and one 0/1 indicator column per source file.

# Configuration
input_folder = "database_downloads"  # Relative path to the database_downloads folder
output_file = "combined_climate_data.csv"  # Output file name

# Columns to keep
columns_to_keep = [
    'Bundle ID(s)',  # Needed for deduplication
    'Case Filing Year for Action',
    'Status',
    'Case Categories',
    'Geographies'
    # Add 'sector' column name here if you identify it
]
id_column = 'Bundle ID(s)'  # Key used to recognise the same case across files

# Sort the files: glob order is not guaranteed, and the metadata kept for a case
# that appears in several files comes from the first file processed.
csv_files = sorted(Path(input_folder).glob("*.csv"))
if not csv_files:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in '{input_folder}'")

# Initialize list to store dataframes
all_data = []

# Process each CSV file in the folder
for csv_file in csv_files:
    print(f"Processing: {csv_file.name}")

    # Read the CSV
    df = pd.read_csv(csv_file)

    # Name the offending file when an expected column is absent.
    missing_columns = [col for col in columns_to_keep if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{csv_file.name} is missing expected column(s): {missing_columns}")

    # Keep only the columns we need, tagged with the category taken from the
    # filename (stem = filename without extension).
    df = df[columns_to_keep].copy()
    df['Category'] = csv_file.stem

    # Add to our list
    all_data.append(df)
    print(f"  Loaded {len(df)} rows from {csv_file.name}")

# Combine all dataframes
combined_df = pd.concat(all_data, ignore_index=True)
print(f"\nTotal rows before processing: {len(combined_df)}")

# Rows without an ID cannot be deduplicated; the pivot and inner merge below
# leave them out, so report them rather than letting them vanish silently.
missing_id_count = int(combined_df[id_column].isna().sum())
if missing_id_count:
    print(f"Warning: {missing_id_count} rows have no '{id_column}' and are excluded from the output")

# Create a binary indicator for each category
combined_df['value'] = 1

# Pivot to create category columns (1 if the case appears in that file, else 0).
# The cast keeps the indicators integer even if the pivot returns floats after
# filling the gaps.
category_pivot = combined_df.pivot_table(
    index=id_column,
    columns='Category',
    values='value',
    aggfunc='max',
    fill_value=0
).astype(int).reset_index()

# Get the other columns (take first occurrence for each Bundle ID)
other_columns = (
    combined_df
    .dropna(subset=[id_column])
    .drop(columns=['Category', 'value'])
    .drop_duplicates(subset=id_column, keep='first')
)

# Merge everything together; both sides hold exactly one row per ID.
final_df = other_columns.merge(category_pivot, on=id_column, how='inner', validate='one_to_one')

print(f"Total unique cases: {len(final_df)}")

# Save the result
final_df.to_csv(output_file, index=False)
print(f"\nSaved to: {output_file}")

# Show which cases appear in multiple categories.
# 'total_categories' is added after saving, so it is not part of the CSV.
category_columns = [col for col in category_pivot.columns if col != id_column]
final_df['total_categories'] = final_df[category_columns].sum(axis=1)

print("\nCases appearing in multiple categories:")
print(final_df[final_df['total_categories'] > 1]['total_categories'].value_counts().sort_index())

print("\nCategory column summary:")
for col in category_columns:
    print(f"  {col}: {final_df[col].sum()} cases")

In [None]:
# Geography Statistics
# Counts how many cases mention each geography code, prints and plots the
# distribution, and writes it to 'geography_statistics.csv'.
print("\n" + "="*50)
print("GEOGRAPHY STATISTICS")
print("="*50)

# Count total cases
total_cases = len(final_df)
print(f"\nTotal cases: {total_cases}")

# A 'Geographies' cell may hold several codes (e.g. 'USA;US-DC'); semicolon,
# comma and pipe are all accepted as separators (adjust to your data format).
unknown_label = 'Unknown/Not Specified'
geo_delimiters = re.compile(r'[;,|]')


def split_geographies(value):
    """Return the distinct geography codes found in one 'Geographies' cell.

    Missing values, and cells that contain nothing but delimiters or
    whitespace, yield the single label 'Unknown/Not Specified'. A code that
    is repeated inside one cell is returned once, so each case contributes
    at most one count per geography.
    """
    if pd.isna(value) or str(value) == 'nan':
        return [unknown_label]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    codes = dict.fromkeys(part.strip() for part in geo_delimiters.split(str(value)))
    codes.pop('', None)
    return list(codes) or [unknown_label]


# Analyze Geographies column
geography_counts = {}
for cell_value in final_df['Geographies']:
    for geo in split_geographies(cell_value):
        geography_counts[geo] = geography_counts.get(geo, 0) + 1

# Convert to DataFrame for better display. Percentages are shares of all cases;
# they can add up to more than 100 because one case may list several geographies.
# Passing the columns explicitly keeps them present even when there is no data.
geography_stats = pd.DataFrame(
    [
        {'Geography': geo, 'Count': count, 'Percentage': (count / total_cases) * 100}
        for geo, count in sorted(geography_counts.items(), key=lambda item: item[1], reverse=True)
    ],
    columns=['Geography', 'Count', 'Percentage'],
)

print("\nGeography Distribution:")
print(geography_stats.to_string(index=False))

print(f"\nTotal unique geographies: {len(geography_counts)}")

# Optional: Create a simple bar chart
# Show top 15 geographies
top_n = 15
top_geographies = geography_stats.head(top_n)
bar_positions = range(len(top_geographies))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_positions, top_geographies['Count'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_geographies['Geography'])
ax.set_xlabel('Number of Cases')
ax.set_ylabel('Geography')
# Use the number of bars actually drawn, which may be fewer than top_n.
ax.set_title(f'Top {len(top_geographies)} Geographies by Case Count')
ax.invert_yaxis()  # Largest count at the top
fig.tight_layout()
plt.show()

# Save statistics to CSV
geography_stats.to_csv('geography_statistics.csv', index=False)
print("\nGeography statistics saved to: geography_statistics.csv")

In [3]:
import pandas as pd

# Load the combined CSV from disk
df = pd.read_csv('combined_climate_data.csv')

# Show the most common raw values in the 'Geographies' column.
# Values are counted as whole strings (e.g. 'USA;US-DC'), not split into
# individual geography codes; missing values are left out by value_counts().
n_most_common = 20
print(df['Geographies'].value_counts().head(n_most_common))


Geographies
USA;US-DC    253
USA;US-CA    135
BRA;BR-PA     73
BRA           72
USA;US-NY     65
XAB           58
USA           53
GBR           48
USA;US-AK     41
BRA;BR-AM     40
USA;US-WA     36
USA;US-OR     36
NZL           27
USA;US-MT     24
AUS           23
USA;US-MA     23
USA;US-TX     22
USA;US-LA     20
USA;US-MD     19
USA;US-AZ     17
Name: count, dtype: int64


the most listed 