In [None]:
import pandas as pd
from datetime import datetime
import pycountry
import json
import os

In [None]:
# Load the raw spreadsheet into `df`; the helpers and cleaning steps in this cell operate on it.
df = pd.read_excel('data/emdat-data.xlsx')

def calculate_duration(row):
    """Return the length of an event in whole days, or None if it cannot be computed.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose 'Start Year', 'Start Month', 'Start Day', 'End Year',
        'End Month' and 'End Day'.

    Returns
    -------
    int or None
        Days between the start and end dates, clamped to 0 when the end precedes
        the start. None when either year is missing or the parts do not form a
        valid calendar date.

    Missing parts are filled so the interval is as wide as possible: the start
    falls back to month 1 / day 1, the end to month 12 / the last day of its month.
    """
    import calendar  # local import: only this helper needs it

    try:
        start_year = row['Start Year']
        end_year = row['End Year']

        # Without both years there is nothing to anchor the dates on.
        if pd.isna(start_year) or pd.isna(end_year):
            return None

        start_year, end_year = int(start_year), int(end_year)
        start_month = int(row['Start Month']) if pd.notna(row['Start Month']) else 1
        start_day = int(row['Start Day']) if pd.notna(row['Start Day']) else 1
        end_month = int(row['End Month']) if pd.notna(row['End Month']) else 12

        if pd.notna(row['End Day']):
            end_day = int(row['End Day'])
        else:
            # A fixed default of 31 is invalid for shorter months and used to turn
            # the whole row into None; use the real month length instead.
            end_day = calendar.monthrange(end_year, end_month)[1]

        start_date = datetime(start_year, start_month, start_day)
        end_date = datetime(end_year, end_month, end_day)
    except (KeyError, TypeError, ValueError, OverflowError):
        # Missing columns or parts that do not form a real date: best-effort None.
        return None

    # An end date before the start date is reported as a zero-length event.
    return max((end_date - start_date).days, 0)
    
def create_date_column(row, prefix):
    """Build a timestamp from the '<prefix> Year/Month/Day' fields of a row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose f'{prefix} Year', f'{prefix} Month' and f'{prefix} Day'.
    prefix : str
        Column prefix, e.g. 'Start' or 'End'.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        NaT when the year is missing, a column is absent, or the parts do not
        form a valid date.

    A missing month or day falls back to 1 for every prefix.
    NOTE(review): calculate_duration fills a missing *end* month/day with the
    end of the period instead; confirm which convention 'End Date' should use.
    """
    try:
        year = row[f'{prefix} Year']
        if pd.isna(year):
            return pd.NaT

        month_raw = row[f'{prefix} Month']
        day_raw = row[f'{prefix} Day']
        month = int(month_raw) if pd.notna(month_raw) else 1
        day = int(day_raw) if pd.notna(day_raw) else 1

        # errors='coerce' turns impossible dates (e.g. 31 February) into NaT.
        return pd.to_datetime(f"{int(year)}-{month:02d}-{day:02d}", errors='coerce')
    except (KeyError, TypeError, ValueError, OverflowError):
        # Only data problems are swallowed; KeyboardInterrupt and friends propagate.
        return pd.NaT

def iso_to_country_name(iso_code):
    """Translate an ISO alpha-3 code into the country name known to pycountry.

    Parameters
    ----------
    iso_code : str or missing value
        Code passed to ``pycountry.countries.get(alpha_3=...)``.

    Returns
    -------
    str or None
        The country's ``name``; None when the code is missing, the lookup
        returns nothing, or the lookup fails.
    """
    if pd.isna(iso_code):
        return None

    try:
        country = pycountry.countries.get(alpha_3=iso_code)
        return country.name if country else None
    except Exception:
        # Best-effort lookup: any library failure means "no name". Unlike a bare
        # `except:`, KeyboardInterrupt and SystemExit are no longer swallowed.
        return None


# Build one date column per prefix ('Start Date', then 'End Date') from the
# separate year/month/day fields.
for date_prefix in ('Start', 'End'):
    df[f'{date_prefix} Date'] = df.apply(create_date_column, axis=1, args=(date_prefix,))

# Event length in days, computed row by row.
df['Duration'] = df.apply(calculate_duration, axis=1)

# Country name resolved from the ISO code.
df['Country Name'] = df['ISO'].apply(iso_to_country_name)

# Quick sanity check: how many codes resolved, and how the result compares
# with the 'Country' column already present in the data.
resolved_names = df['Country Name'].notna().sum()
total_rows = len(df)
print(f"Country Name creado - Valores no nulos: {resolved_names} de {total_rows}")
print("\nComparación ISO vs Country Name:")
comparison_columns = ['ISO', 'Country', 'Country Name']
print(df[comparison_columns].head(20))


# Columns that are not needed downstream. Each label appears exactly once:
# the original list repeated three of the "('000 US$)" columns (which were then
# echoed twice in the report below) and carried a misspelled 'Latituede' entry
# alongside the correct 'Latitude'.
# NOTE(review): 'Duration' is computed earlier in this cell and dropped here,
# so it never reaches the saved CSV — confirm that is intended.
columns_to_drop = [
    'DisNo.',
    'Historic',
    'Classification Key',
    'Disaster Group',
    'External IDs',
    'Admin Units',
    'Longitude',
    'Latitude',
    'Associated Types',
    'GADM Admin Units',
    'OFDA/BHA Response',
    'Appeal',
    'CPI',
    'Declaration',
    'River Basin',
    "Total Damage, Adjusted ('000 US$)",
    "Insured Damage, Adjusted ('000 US$)",
    "Reconstruction Costs, Adjusted ('000 US$)",
    "Disaster Subtype",
    "Origin",
    "Location",
    "Reconstruction Costs ('000 US$)",
    "AID Contribution ('000 US$)",
    "Insured Damage ('000 US$)",
    "Start Month",
    "Start Day",
    "End Year",
    "End Month",
    "End Day",
    "Country",
    "Duration",
    "Entry Date",
    "Last Update",
    "Total Damage ('000 US$)",
    "No. Homeless",
    "No. Affected",
    "No. Injured",
    "Magnitude Scale",
    "Magnitude",
]

# Only drop what is actually present so a slightly different export does not fail.
existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]

if existing_columns_to_drop:
    df = df.drop(columns=existing_columns_to_drop)
    print(f"Columnas eliminadas: {existing_columns_to_drop}")
else:
    print("No se encontraron columnas para eliminar")

# Relabel disaster types: the two mass-movement variants are merged, the glacial
# label is shortened, and the remaining labels are simply title-cased.
mapping = {
    "Mass movement (dry)": "Mass Movement",
    "Mass movement (wet)": "Mass Movement",
    "Glacial lake outburst flood": "Glacial Flood",
}
mapping.update({
    label: label.title()
    for label in ("Animal incident", "Extreme temperature", "Volcanic activity")
})

relabelled_types = df["Disaster Type"].replace(mapping)
df["Disaster Type"] = relabelled_types

# Persist the cleaned table for the following cells.
df.to_csv('data/emdat-data-processed.csv', index=False)

In [9]:
# Reload the cleaned table written by the previous cell; later cells work on `study_data`.
study_data = pd.read_csv('data/emdat-data-processed.csv')

In [None]:
def load_study_data(file_path):
    """Read a tabular file into a DataFrame, choosing the reader by extension.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path to the file. A name ending in '.csv' (any letter case) is read with
        ``pd.read_csv``; anything else is handed to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame or None
        None (after printing a message) when the file is missing, empty, or
        cannot be parsed.
    """
    # Normalise to str so pathlib.Path works too; plain strings pass through unchanged.
    path_text = os.fspath(file_path)

    try:
        if path_text.lower().endswith('.csv'):
            data = pd.read_csv(path_text)
        else:
            data = pd.read_excel(path_text)
        return data
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return None
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
        return None
    except pd.errors.ParserError:
        print("Error: There was a parsing error while reading the file.")
        return None
    
def analyze_missing_values(data):
    """Tabulate, per column, how many values are missing.

    Returns None when `data` is None. Otherwise returns a DataFrame with the
    columns 'Column', 'Missing_Count', 'Missing_Percentage' (rounded to two
    decimals) and 'Data_Type', ordered from most to fewest missing values.
    """
    if data is None:
        return None

    null_counts = data.isnull().sum().values
    null_share = (null_counts / len(data) * 100).round(2)

    report = pd.DataFrame({
        'Column': data.columns,
        'Missing_Count': null_counts,
        'Missing_Percentage': null_share,
        'Data_Type': data.dtypes.values,
    })

    return report.sort_values('Missing_Count', ascending=False)


def get_basic_statistics(data):
    """Return `data.describe()`, or None when no data was supplied."""
    return None if data is None else data.describe()


def analyze_data_types(data):
    """Count how many columns share each dtype; None when no data was supplied."""
    if data is not None:
        return data.dtypes.value_counts()
    return None


def get_unique_values_summary(data, max_unique=50):
    """Summarise the distinct values of every column.

    Returns None when `data` is None. Otherwise returns a DataFrame with
    'Column', 'Unique_Count' and 'Sample_Values'. For a column with at most
    `max_unique` distinct values the sample is the text of its first three
    non-null distinct values; for any other column it is "<n> unique values".
    """
    if data is None:
        return None

    distinct_counts = []
    previews = []
    for name in data.columns:
        n_distinct = data[name].nunique()
        distinct_counts.append(n_distinct)
        if n_distinct <= max_unique:
            previews.append(str(data[name].dropna().unique()[:3].tolist()))
        else:
            previews.append(f"{n_distinct} unique values")

    return pd.DataFrame({
        'Column': data.columns,
        'Unique_Count': distinct_counts,
        'Sample_Values': previews,
    })


def summarize_study_data(data):
    """Collect headline figures about a DataFrame.

    Returns None when `data` is None. Otherwise returns a dict with 'num_rows',
    'num_columns', 'column_names', 'total_missing' (missing cells over the whole
    frame) and 'memory_usage' (deep memory footprint divided by 1024**2, i.e. MB).
    """
    if data is None:
        return None

    n_rows, n_cols = data.shape
    bytes_used = data.memory_usage(deep=True).sum()

    return {
        'num_rows': n_rows,
        'num_columns': n_cols,
        'column_names': data.columns.tolist(),
        'total_missing': data.isnull().sum().sum(),
        'memory_usage': bytes_used / 1024**2,
    }

    
def _print_banner(title):
    """Print a blank line, then `title` framed by two 80-character '=' rules."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(title)
    print(rule)


# Seven-part text report on `study_data`, or a single notice when it is None.
if study_data is None:
    print("No se pudo cargar el dataset.")
else:
    _print_banner("1. RESUMEN GENERAL")
    summary = summarize_study_data(study_data)
    print(f"Número de filas: {summary['num_rows']:,}")
    print(f"Número de columnas: {summary['num_columns']}")
    print(f"Total de valores faltantes: {summary['total_missing']:,}")
    print(f"Uso de memoria: {summary['memory_usage']:.2f} MB")

    _print_banner("2. ANÁLISIS DE VALORES FALTANTES (NaN)")
    missing_analysis = analyze_missing_values(study_data)
    print("\nColumnas con valores faltantes:")
    # Only columns that actually have gaps are listed.
    has_gaps = missing_analysis['Missing_Count'] > 0
    print(missing_analysis[has_gaps].to_string(index=False))

    _print_banner("3. DISTRIBUCIÓN DE TIPOS DE DATOS")
    dtype_analysis = analyze_data_types(study_data)
    print(dtype_analysis)

    _print_banner("4. ESTADÍSTICAS DESCRIPTIVAS (Columnas Numéricas)")
    stats = get_basic_statistics(study_data)
    print(stats)

    _print_banner("5. RESUMEN DE VALORES ÚNICOS")
    unique_summary = get_unique_values_summary(study_data)
    print(unique_summary.to_string(index=False))

    _print_banner("6. PRIMERAS 5 FILAS DEL DATASET")
    print(study_data.head())

    _print_banner("7. INFORMACIÓN DETALLADA DE COLUMNAS")
    study_data.info()

In [None]:
# Show the distinct disaster type labels present in the cleaned table.
print(study_data["Disaster Type"].unique())

In [None]:


# Long-form country names (keys) and the shorter display names that replace them (values).
# Both the ASCII and the accented spelling of Côte d'Ivoire are listed.
country_name_mapping = {
    "Congo, The Democratic Republic of the": "Democratic Republic of the Congo",
    "Iran, Islamic Republic of": "Iran",
    "Tanzania, United Republic of": "Tanzania",
    "Korea, Democratic People's Republic of": "North Korea",
    "Korea, Republic of": "South Korea",
    "Lao People's Democratic Republic": "Laos",
    "Venezuela, Bolivarian Republic of": "Venezuela",
    "Moldova, Republic of": "Moldova",
    "Russian Federation": "Russia",
    "Syrian Arab Republic": "Syria",
    "Viet Nam": "Vietnam",
    "Bolivia, Plurinational State of": "Bolivia",
    "Macedonia, The Former Yugoslav Republic of": "North Macedonia",
    "Czechia": "Czech Republic",
    "Cabo Verde": "Cape Verde",
    "Cote d'Ivoire": "Ivory Coast",
    "Côte d'Ivoire": "Ivory Coast",
    "Türkiye": "Turkey",
    "Brunei Darussalam": "Brunei",
    "Micronesia, Federated States of": "Micronesia",
    "Saint Vincent and the Grenadines": "St. Vincent and the Grenadines",
    "Saint Lucia": "St. Lucia",
    "Saint Kitts and Nevis": "St. Kitts and Nevis",
    "Sao Tome and Principe": "São Tomé and Príncipe",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
    "United States of America": "United States"
}

# Names that are not keys of the mapping are left untouched by replace().
study_data['Country Name'] = study_data['Country Name'].replace(country_name_mapping)

# Overwrite the processed CSV in place, then preview the first 20 distinct names.
study_data.to_csv('data/emdat-data-processed.csv', index=False)
print(study_data['Country Name'].unique()[:20])

In [None]:

# Subregion labels that decide which half of the 'Americas' region a row belongs to.
north_america_subregions = ['Northern America']
south_america_subregions = ['Latin America and the Caribbean']


def reclassify_americas(row):
    """Return the region label for a row, splitting 'Americas' in two.

    Rows whose 'Region' is not 'Americas' keep their label. 'Americas' rows
    become 'North America' or 'South America' according to which subregion
    list contains their 'Subregion'; when neither does, the label is unchanged.
    """
    region = row['Region']
    if region != 'Americas':
        return region

    subregion = row['Subregion']
    if subregion in north_america_subregions:
        return 'North America'
    if subregion in south_america_subregions:
        return 'South America'
    return region

# Relabel every row, then inspect the resulting region labels and their frequencies.
study_data['Region'] = study_data.apply(reclassify_americas, axis=1)

region_column = study_data['Region']
print(region_column.unique())
print(region_column.value_counts())

# Show which subregions ended up under each of the two new labels.
split_rows = study_data[region_column.isin(['North America', 'South America'])]
print(split_rows[['Region', 'Subregion']].drop_duplicates())

# 'Subregion' is removed after the check above, and the result is saved.
study_data = study_data.drop(columns=['Subregion'])
study_data.to_csv('data/emdat-data-processed.csv', index=False)


In [None]:

# Count events per country / region / disaster type / subgroup, most frequent first.
# Note: groupby drops rows whose key columns are missing (pandas default).
disasters_by_country = (
    study_data
    .groupby(['ISO', 'Country Name', 'Region', 'Disaster Type', 'Disaster Subgroup'])
    .size()
    .reset_index(name='Disaster_Count')
    .sort_values('Disaster_Count', ascending=False)
)

# Write next to the other data files: every later cell reads
# 'data/disasters_by_country.csv', so saving into the working directory left
# those cells reading a missing or stale file.
disasters_by_country.to_csv('data/disasters_by_country.csv', index=False)


print(f"\n{disasters_by_country.head(20)}")

print(disasters_by_country.groupby('Region')['Disaster_Count'].sum().sort_values(ascending=False))

In [None]:

# Load the per-country counts under their own name. Reading them into
# `study_data` replaced the event-level table, so the final export cell (which
# selects 'Event Name', 'Total Deaths', ...) failed when the notebook was run top to bottom.
disasters_by_country = pd.read_csv('data/disasters_by_country.csv')


geojson_files = [
    'data/geometria/africa.geojson',
    'data/geometria/asia.geojson',
    'data/geometria/europe.geojson',
    'data/geometria/north_america.geojson',
    'data/geometria/oceania.geojson',
    'data/geometria/south_america.geojson'
]

# ISO alpha-3 code -> name used by the map geometries.
country_mapping = {}

for geojson_file in geojson_files:
    # Missing geometry files are skipped.
    if not os.path.exists(geojson_file):
        continue

    with open(geojson_file, 'r', encoding='utf-8') as f:
        geojson = json.load(f)

    for feature in geojson['features']:
        props = feature['properties']
        iso_a3 = props.get('iso_a3')
        standard_name = props.get('name')

        # Features without a code or a name cannot be used for matching.
        if iso_a3 and standard_name:
            country_mapping[iso_a3] = standard_name


# One vectorised lookup instead of one boolean-mask assignment per ISO code;
# rows whose code is not in the mapping keep their current name.
disasters_by_country['Country Name'] = (
    disasters_by_country['ISO']
    .map(country_mapping)
    .fillna(disasters_by_country['Country Name'])
)


disasters_by_country.to_csv('data/disasters_by_country.csv', index=False)

unique_countries = (
    disasters_by_country[['ISO', 'Country Name', 'Region']]
    .drop_duplicates()
    .sort_values('Country Name')
)
print(unique_countries.head(30).to_string(index=False))


# Names of countries whose ISO code has no geometry entry.
unmapped = disasters_by_country[~disasters_by_country['ISO'].isin(country_mapping.keys())]['Country Name'].unique()
if len(unmapped) > 0:
    print(unmapped[:10])

In [None]:
# One geometry file per continent, all in the same folder.
geojson_files = [
    f'data/geometria/{continent}.geojson'
    for continent in (
        'africa',
        'asia',
        'europe',
        'north_america',
        'oceania',
        'south_america',
    )
]

# ISO alpha-3 code -> name taken from the geometry features.
country_mapping = {}

for geojson_file in geojson_files:
    # Files that are not on disk are skipped.
    if not os.path.exists(geojson_file):
        continue

    with open(geojson_file, 'r', encoding='utf-8') as geojson_handle:
        geojson = json.load(geojson_handle)

    for feature in geojson['features']:
        properties = feature['properties']
        iso_a3, standard_name = properties.get('iso_a3'), properties.get('name')
        # Keep only features that carry both a code and a name.
        if not (iso_a3 and standard_name):
            continue
        country_mapping[iso_a3] = standard_name

# Hand-maintained ISO alpha-3 -> display name entries. They are merged into
# `country_mapping` with dict.update() below, so on a shared key the manual
# value replaces the one read from the geometry files.
manual_mapping = {
    'BRA': 'Brazil',
    'COL': 'Colombia',
    'PER': 'Peru',
    'BOL': 'Bolivia',
    'ARG': 'Argentina',
    'VEN': 'Venezuela',
    'ECU': 'Ecuador',
    'CHL': 'Chile',
    'URY': 'Uruguay',
    'PRY': 'Paraguay',
    'HKG': 'Hong Kong',
    'REU': 'Réunion',
    'TON': 'Tonga',
    'VCT': 'St. Vin. and Gren.',
    'GUY': 'Guyana',
    'BRB': 'Barbados',
    'CYM': 'Cayman Is.',
    'MUS': 'Mauritius',
    'FSM': 'Micronesia',
    'COM': 'Comoros',
    'GLP': 'Guadeloupe',
    'MTQ': 'Martinique',
    'MYT': 'Mayotte',
    'NCL': 'New Caledonia',
    'PYF': 'Fr. Polynesia',
    'SXM': 'Sint Maarten',
    'MAF': 'St-Martin',
    'BLM': 'St-Barthélemy',
    'VGB': 'British Virgin Is.',
    'VIR': 'U.S. Virgin Is.',
    'ASM': 'American Samoa',
    'GUM': 'Guam',
    'MNP': 'N. Mariana Is.',
    'PRI': 'Puerto Rico',
    'BMU': 'Bermuda',
    'MSR': 'Montserrat',
    'TCA': 'Turks and Caicos Is.',
    'AIA': 'Anguilla',
    'GUF': 'French Guiana',
    'SUR': 'Suriname',
    'NIU': 'Niue',
    'COK': 'Cook Is.',
    'WLF': 'Wallis and Futuna',
    'TKL': 'Tokelau',
    'PLW': 'Palau',
    'STP': 'São Tomé and Principe',
    'CPV': 'Cape Verde',
    'SHN': 'St. Helena',
    'MAC': 'Macao',
    'SGP': 'Singapore',
    'LIE': 'Liechtenstein',
    'QAT': 'Qatar',
    'KWT': 'Kuwait',
    'BHR': 'Bahrain',
    'PSE': 'Palestine',
    'TWN': 'Taiwan',
    'PRK': 'North Korea',
    'KOR': 'South Korea',
    'LCA': 'St. Lucia',
    'DMA': 'Dominica',
    'WSM': 'Samoa',
    'MHL': 'Marshall Is.',
    'GRD': 'Grenada',
    'ATG': 'Antigua and Barb.',
    'SYC': 'Seychelles',
    'TUV': 'Tuvalu',
    'MDV': 'Maldives',
    'KIR': 'Kiribati',
    'KNA': 'St. Kitts and Nevis',
    'MLT': 'Malta'
}
# Manual entries take precedence over names already present in country_mapping.
country_mapping.update(manual_mapping)

def _apply_country_names(frame, names_by_iso):
    """Overwrite 'Country Name' with names_by_iso[ISO] wherever the code is known.

    Rows whose 'ISO' is not a key of `names_by_iso` keep their current name.
    The frame is modified in place and also returned.
    """
    # One vectorised lookup replaces the former per-code boolean-mask loop,
    # which scanned the whole frame once for every ISO code.
    frame['Country Name'] = frame['ISO'].map(names_by_iso).fillna(frame['Country Name'])
    return frame


# Event-level table: standardise names and save in place.
df_emdat = _apply_country_names(pd.read_csv('data/emdat-data-processed.csv'), country_mapping)
df_emdat.to_csv('data/emdat-data-processed.csv', index=False)

# Per-country counts: same treatment.
df_disasters = _apply_country_names(pd.read_csv('data/disasters_by_country.csv'), country_mapping)
df_disasters.to_csv('data/disasters_by_country.csv', index=False)

# Report any ISO codes that still have no standard name.
unmapped = df_disasters[~df_disasters['ISO'].isin(country_mapping.keys())][['ISO', 'Country Name']].drop_duplicates()
if len(unmapped) > 0:
    print(f"{len(unmapped)}")
    print(unmapped.to_string(index=False))
else:
    print("OK")


print(df_disasters[['ISO', 'Country Name', 'Region']].drop_duplicates().sort_values('Country Name').head(30).to_string(index=False))


In [None]:

# Load the per-country counts, switch to the export column names, and add
# 'color_intensity' as a copy of the count.
df = (
    pd.read_csv('data/disasters_by_country.csv')
    .rename(columns={
        'Country Name': 'Country',
        'Disaster Subgroup': 'disaster',
        'Disaster_Count': 'disaster_count',
    })
    .assign(color_intensity=lambda frame: frame['disaster_count'])
)

regions = df['Region'].unique()
export_columns = ['Country', 'disaster', 'disaster_count', 'color_intensity']

# One CSV per region, named after the lower-cased region with spaces as underscores.
for region in regions:
    region_df = df.loc[df['Region'] == region, export_columns]
    filename = "data/disasters_" + region.lower().replace(' ', '_') + ".csv"
    region_df.to_csv(filename, index=False)



In [None]:
# World-level export, built only from the file read here. Previously the frame
# that was just read was discarded in favour of `df_disasters`, and the next two
# statements used `df`; both names come from other cells, so the result depended
# on which cells had been run beforehand.
df_world = (
    pd.read_csv('data/disasters_by_country.csv')
    .rename(columns={
        'Country Name': 'Country',
        'Disaster Subgroup': 'disaster',
        'Disaster_Count': 'disaster_count',
    })
)

# 'color_intensity' is a copy of the count.
df_world['color_intensity'] = df_world['disaster_count']

df_world = df_world[['Country', 'disaster', 'disaster_count', 'color_intensity']]

df_world.to_csv('data/disasters_world.csv', index=False)

In [None]:



# Reload the event-level table instead of relying on whatever `study_data`
# currently holds: an earlier cell rebinds that name to the per-country counts,
# which lack 'Event Name', 'Total Deaths', 'Start Date', ... and made the column
# selection below raise KeyError.
study_data = pd.read_csv('data/emdat-data-processed.csv')

study_data = study_data[['Disaster Subgroup', 'Disaster Type', 'Event Name', 'Total Deaths', 'Total Affected', 'Start Date', 'End Date', 'Country Name', 'Region']].copy()

study_data = study_data.rename(columns={
    'Disaster Subgroup': 'disaster',
    'Disaster Type': 'disaster_type',
    'Event Name': 'event_name',
    'Total Deaths': 'deaths',
    'Total Affected': 'affected',
    'Start Date': 'start',
    'End Date': 'end',
    'Country Name': 'Country'
})

# Rows without a region cannot be assigned to a file; a NaN label would also
# crash the `.lower()` call used to build the filename.
regions = study_data['Region'].dropna().unique()

# One CSV per region; 'Region' is dropped because the filename already carries it.
for region in regions:
    region_df = study_data[study_data['Region'] == region].drop(columns=['Region'])

    filename = f"data/{region.lower().replace(' ', '_')}_disaster_info.csv"

    region_df.to_csv(filename, index=False)