In [1]:

# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
import scipy.stats as stats
from scipy.stats import chi2_contingency, ttest_ind

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames
pd.set_option('display.max_rows', None)

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")




In [2]:
# cargamos el dataframe correspondiente 
df = pd.read_csv("archivos/natural_disasters.csv", delimiter=';')

display(df.head())
display(df.tail())

Unnamed: 0,Year,Country,ISO,Disaster Group,Disaster Subroup,Disaster Type,Disaster Subtype,Total Events,Total Affected,Total Deaths,"Total Damage (USD, original)","Total Damage (USD, adjusted)",CPI
0,1900,Cabo Verde,CPV,Natural,Climatological,Drought,Drought,1,,11000.0,,,28490844088613
1,1900,India,IND,Natural,Climatological,Drought,Drought,1,,1250000.0,,,28490844088613
2,1900,Jamaica,JAM,Natural,Hydrological,Flood,,1,,300.0,,,28490844088613
3,1900,Japan,JPN,Natural,Geophysical,Volcanic activity,Ash fall,1,,30.0,,,28490844088613
4,1900,Turkey,TUR,Natural,Geophysical,Earthquake,Ground movement,1,,140.0,,,28490844088613


Unnamed: 0,Year,Country,ISO,Disaster Group,Disaster Subroup,Disaster Type,Disaster Subtype,Total Events,Total Affected,Total Deaths,"Total Damage (USD, original)","Total Damage (USD, adjusted)",CPI
10426,2023,Vanuatu,VUT,Natural,Meteorological,Storm,Tropical cyclone,2,502702.0,,,,
10427,2023,South Africa,ZAF,Natural,Hydrological,Flood,,2,1500.0,20.0,,,
10428,2023,Zambia,ZMB,Natural,Hydrological,Flood,,1,22000.0,,,,
10429,2023,Zambia,ZMB,Natural,Hydrological,Flood,Flash flood,1,154608.0,,,,
10430,2023,Zimbabwe,ZWE,Natural,Meteorological,Storm,Tropical cyclone,1,,,,,


In [3]:
def exploracion(df):
    df_info = pd.DataFrame()
    df_info["% nulos"] = round(df.isna().sum()/df.shape[0]*100, 2).astype(str)+"%"
    df_info["% no_nulos"] = round(df.notna().sum()/df.shape[0]*100, 2).astype(str)+"%"
    df_info["tipo_dato"] = df.dtypes
    df_info["num_valores_unicos"] = df.nunique()
    print(f"""El DataFrame tiene {df.shape[0]} filas y {df.shape[1]} columnas.
Tiene {df.duplicated().sum()} datos duplicados, lo que supone un porcentaje de {round(df.duplicated().sum()/df.shape[0], 2)}% de los datos.
Hay {len(list(df_info[(df_info["% nulos"] != "0.0%")].index))} columnas con datos nulos, y son:
{list(df_info[(df_info["% nulos"] != "0.0%")].index)}
y sin nulos hay {len(list(df_info[(df_info["% nulos"] == "0.0%")].index))} columnas y son:
{list(df_info[(df_info["% nulos"] == "0.0%")].index)}
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:""")
    display(df_info.head())
    print("Principales estadísticos de las columnas categóricas:")
    display(df.describe(include="O").T)
    print("Principales estadísticos de las columnas numéricas:")
    display(df.describe(exclude="O").T)
    return df_info

exploracion(df)

El DataFrame tiene 10431 filas y 13 columnas.
Tiene 0 datos duplicados, lo que supone un porcentaje de 0.0% de los datos.
Hay 6 columnas con datos nulos, y son:
['Disaster Subtype', 'Total Affected', 'Total Deaths', 'Total Damage (USD, original)', 'Total Damage (USD, adjusted)', 'CPI']
y sin nulos hay 7 columnas y son:
['Year', 'Country', 'ISO', 'Disaster Group', 'Disaster Subroup', 'Disaster Type', 'Total Events']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Year,0.0%,100.0%,int64,124
Country,0.0%,100.0%,object,225
ISO,0.0%,100.0%,object,225
Disaster Group,0.0%,100.0%,object,1
Disaster Subroup,0.0%,100.0%,object,5


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
Country,10431,225,United States of America (the),405
ISO,10431,225,USA,405
Disaster Group,10431,1,Natural,10431
Disaster Subroup,10431,5,Hydrological,4489
Disaster Type,10431,13,Flood,3837
Disaster Subtype,8298,25,Riverine flood,1628
CPI,10380,114,614631882611914,295


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,10431.0,1995.61,22.00119,1900.0,1986.0,2001.0,2011.0,2023.0
Total Events,10431.0,1.446649,1.246589,1.0,1.0,1.0,1.0,20.0
Total Affected,7586.0,1125969.0,9760891.0,1.0,1200.0,11414.0,119304.5,330000000.0
Total Deaths,7375.0,3107.711,72555.89,1.0,6.0,23.0,90.0,3700000.0
"Total Damage (USD, original)",3834.0,1122262000.0,6792339000.0,2000.0,10000000.0,68000000.0,400000000.0,210000000000.0
"Total Damage (USD, adjusted)",3830.0,1748704000.0,9115319000.0,2469.0,20209265.5,146924694.0,784776702.0,273218400000.0


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Year,0.0%,100.0%,int64,124
Country,0.0%,100.0%,object,225
ISO,0.0%,100.0%,object,225
Disaster Group,0.0%,100.0%,object,1
Disaster Subroup,0.0%,100.0%,object,5
Disaster Type,0.0%,100.0%,object,13
Disaster Subtype,20.45%,79.55%,object,25
Total Events,0.0%,100.0%,int64,19
Total Affected,27.27%,72.73%,float64,4043
Total Deaths,29.3%,70.7%,float64,815


# Valores unicos de las variables categoricas ✨

In [4]:
# creamos una lista con los nombres de las columnas categoricas 
columnas = df.select_dtypes(include='object').columns.tolist()
print(columnas)
# empezamos a iterar por cada una de las columnas para sacar sus valores únicos y sus frecuencias
for columna in columnas:
    print(f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{columna.upper()}' -----------\n")
    print(f"Sus valores únicos son: {df[columna].unique()}\n")
    print(f"Las frecuencias de los valores únicos de las categorías son: {df[columna].value_counts()} ")

['Country', 'ISO', 'Disaster Group', 'Disaster Subroup', 'Disaster Type', 'Disaster Subtype', 'CPI']
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'COUNTRY' -----------

Sus valores únicos son: ['Cabo Verde' 'India' 'Jamaica' 'Japan' 'Turkey'
 'United States of America (the)' 'China' 'Guatemala' 'Myanmar'
 'Martinique' 'Soviet Union' 'Saint Vincent and the Grenadines' 'Canada'
 'Comoros (the)' 'Iran (Islamic Republic of)' 'Israel' 'Niger (the)'
 'Bangladesh' 'Greece' 'Taiwan (Province of China)' 'Albania' 'Italy'
 'Philippines (the)' 'Belgium' 'Chile' 'Colombia' 'Hong Kong' 'Romania'
 'France' 'Haiti' 'Morocco' 'Pakistan' 'Portugal' 'Burkina Faso'
 'Costa Rica' 'Algeria' 'Gambia (the)' 'Guinea-Bissau' 'Mali' 'Mauritania'
 'Senegal' 'Chad' 'Kazakhstan' 'Mexico' 'Indonesia' 'Peru' 'Tokelau'
 'Puerto Rico' 'Anguilla' 'Argentina' 'Germany Fed Rep' 'Ecuador'
 'Bahamas (the)' 'Cuba' 'Egypt' 'Jordan' 'Bulgaria' 'Guadeloupe'
 'Saint Kitts and Nevis' 'Montserrat' 'Poland' 'New Zealand' 'Dominica

# Creamos columna CONTINENTE

In [5]:
country_to_continent = {
    # Africa
    'Cabo Verde': 'Africa', 'Comoros (the)': 'Africa', 'Algeria': 'Africa', 'Gambia (the)': 'Africa',
    'Guinea-Bissau': 'Africa', 'Mali': 'Africa', 'Mauritania': 'Africa', 'Senegal': 'Africa',
    'Chad': 'Africa', 'Tunisia': 'Africa', 'Ethiopia': 'Africa', 'Somalia': 'Africa',
    'Kenya': 'Africa', 'Tanzania, United Republic of': 'Africa', 'Botswana': 'Africa',
    'Congo (the Democratic Republic of the)': 'Africa', 'Malawi': 'Africa', 'Nigeria': 'Africa',
    'South Africa': 'Africa', 'Sudan (the)': 'Africa', 'Zimbabwe': 'Africa', 'Angola': 'Africa',
    'Namibia': 'Africa', 'Eswatini': 'Africa', 'Liberia': 'Africa', 'Seychelles': 'Africa',
    'Libya': 'Africa', 'Mozambique': 'Africa', 'Madagascar': 'Africa', 'Togo': 'Africa',
    'Uganda': 'Africa', 'Benin': 'Africa', 'Cameroon': 'Africa', 'Burundi': 'Africa',
    'Rwanda': 'Africa', 'Djibouti': 'Africa', 'Zambia': 'Africa', 'Guinea': 'Africa',
    'Côte d’Ivoire': 'Africa', 'Gabon': 'Africa', 'Eritrea': 'Africa',
    'Sao Tome and Principe': 'Africa', 'Central African Republic': 'Africa',
    
    # Americas
    'United States of America (the)': 'Americas', 'Canada': 'Americas', 'Mexico': 'Americas',
    'Brazil': 'Americas', 'Argentina': 'Americas', 'Chile': 'Americas', 'Colombia': 'Americas',
    'Peru': 'Americas', 'Venezuela (Bolivarian Republic of)': 'Americas', 'Ecuador': 'Americas',
    'Costa Rica': 'Americas', 'Guatemala': 'Americas', 'Puerto Rico': 'Americas',
    'Cuba': 'Americas', 'Bahamas (the)': 'Americas', 'Haiti': 'Americas', 
    'Dominican Republic (the)': 'Americas', 'Belize': 'Americas', 'Honduras': 'Americas',
    'El Salvador': 'Americas', 'Panama': 'Americas', 'Jamaica': 'Americas', 
    'Barbados': 'Americas', 'Trinidad and Tobago': 'Americas', 
    'Grenada': 'Americas', 'Saint Vincent and the Grenadines': 'Americas', 
    'Saint Kitts and Nevis': 'Americas', 'Antigua and Barbuda': 'Americas', 
    'Paraguay': 'Americas', 'Bolivia (Plurinational State of)': 'Americas', 
    'Guyana': 'Americas', 'Suriname': 'Americas',
    
    # Asia
    'China': 'Asia', 'India': 'Asia', 'Japan': 'Asia', 'Pakistan': 'Asia', 'Bangladesh': 'Asia',
    'Korea (the Republic of)': 'Asia', 'Indonesia': 'Asia', 'Malaysia': 'Asia',
    'Philippines (the)': 'Asia', 'Vietnam': 'Asia', 'Thailand': 'Asia', 'Afghanistan': 'Asia',
    'Myanmar': 'Asia', 'Kazakhstan': 'Asia', 'Armenia': 'Asia', 'Lebanon': 'Asia',
    'Sri Lanka': 'Asia', "Lao People's Democratic Republic (the)": 'Asia', 
    'Cambodia': 'Asia', 'Nepal': 'Asia', 'Georgia': 'Asia', 'Mongolia': 'Asia', 
    'Bhutan': 'Asia', 'Azerbaijan': 'Asia', 'Uzbekistan': 'Asia', 'Tajikistan': 'Asia',
    'Kyrgyzstan': 'Asia', 'Turkmenistan': 'Asia', 'Maldives': 'Asia', 
    'Brunei Darussalam': 'Asia', 'Timor-Leste': 'Asia',
    
    # Europe
    'France': 'Europe', 'Germany': 'Europe', 'Italy': 'Europe', 'United Kingdom of Great Britain and Northern Ireland (the)': 'Europe',
    'Spain': 'Europe', 'Portugal': 'Europe', 'Belgium': 'Europe', 'Netherlands (the)': 'Europe',
    'Ireland': 'Europe', 'Luxembourg': 'Europe', 'Switzerland': 'Europe', 'Austria': 'Europe',
    'Denmark': 'Europe', 'Norway': 'Europe', 'Sweden': 'Europe', 'Finland': 'Europe',
    'Greece': 'Europe', 'Romania': 'Europe', 'Albania': 'Europe', 'Hungary': 'Europe',
    'Poland': 'Europe', 'Iceland': 'Europe', 'Czech Republic (the)': 'Europe',
    'Slovakia': 'Europe', 'Latvia': 'Europe', 'Lithuania': 'Europe', 'Slovenia': 'Europe',
    'Croatia': 'Europe', 'Bosnia and Herzegovina': 'Europe', 'Serbia': 'Europe',
    'Montenegro': 'Europe', 'Bulgaria': 'Europe', 'Estonia': 'Europe',
    
    # Oceania
    'Australia': 'Oceania', 'New Zealand': 'Oceania', 'Fiji': 'Oceania', 'Vanuatu': 'Oceania',
    'Papua New Guinea': 'Oceania', 'Tonga': 'Oceania', 'Samoa': 'Oceania', 'Kiribati': 'Oceania',
    'Solomon Islands': 'Oceania', 'Micronesia (Federated States of)': 'Oceania',
    'Marshall Islands (the)': 'Oceania', 'Tuvalu': 'Oceania', 'Palau': 'Oceania',
    'American Samoa': 'Oceania', 'Cook Islands (the)': 'Oceania', 'Niue': 'Oceania',
    'Tokelau': 'Oceania', 'Wallis and Futuna': 'Oceania',
    
    # Middle East
    'Iran (Islamic Republic of)': 'Middle East', 'Israel': 'Middle East', 'Saudi Arabia': 'Middle East',
    'Jordan': 'Middle East', 'Turkey': 'Middle East', 'Cyprus': 'Middle East', 'Iraq': 'Middle East',
    'Kuwait': 'Middle East', 'United Arab Emirates (the)': 'Middle East', 'Qatar': 'Middle East',
    'Palestine, State of': 'Middle East', 'Yemen': 'Middle East', 'Oman': 'Middle East',
    'Syrian Arab Republic': 'Middle East',
    
    # Unknown or regions
    'Soviet Union': 'Unknown', 'Yugoslavia': 'Unknown', 'German Fed Rep': 'Unknown',
    'Azores Islands': 'Unknown', 'French Polynesia': 'Unknown',
    'Réunion': 'Unknown', 'Canary Is': 'Unknown',
    'Saint Martin (French Part)': 'Unknown', 'Sint Maarten (Dutch part)': 'Unknown'
}


In [6]:

# Crear nueva columna 'continent' basada en 'Country'
df['continent'] = df['Country'].map(country_to_continent)

# Mostrar un ejemplo
display(df[['Country', 'continent']].head())


Unnamed: 0,Country,continent
0,Cabo Verde,Africa
1,India,Asia
2,Jamaica,Americas
3,Japan,Asia
4,Turkey,Middle East


In [7]:
df.head(10)

Unnamed: 0,Year,Country,ISO,Disaster Group,Disaster Subroup,Disaster Type,Disaster Subtype,Total Events,Total Affected,Total Deaths,"Total Damage (USD, original)","Total Damage (USD, adjusted)",CPI,continent
0,1900,Cabo Verde,CPV,Natural,Climatological,Drought,Drought,1,,11000.0,,,28490844088613,Africa
1,1900,India,IND,Natural,Climatological,Drought,Drought,1,,1250000.0,,,28490844088613,Asia
2,1900,Jamaica,JAM,Natural,Hydrological,Flood,,1,,300.0,,,28490844088613,Americas
3,1900,Japan,JPN,Natural,Geophysical,Volcanic activity,Ash fall,1,,30.0,,,28490844088613,Asia
4,1900,Turkey,TUR,Natural,Geophysical,Earthquake,Ground movement,1,,140.0,,,28490844088613,Middle East
5,1900,United States of America (the),USA,Natural,Meteorological,Storm,Tropical cyclone,1,,6000.0,30000000.0,1052970000.0,28490844088613,Americas
6,1901,Japan,JPN,Natural,Geophysical,Earthquake,Tsunami,1,24.0,18.0,,,28490844088613,Asia
7,1902,China,CHN,Natural,Geophysical,Earthquake,Ground movement,1,,2500.0,,,29630477852157,Asia
8,1902,Guatemala,GTM,Natural,Geophysical,Earthquake,Ground movement,1,,2000.0,25000000.0,843725800.0,29630477852157,Americas
9,1902,Guatemala,GTM,Natural,Geophysical,Volcanic activity,Ash fall,2,,7000.0,,,29630477852157,Americas


# Vemos los nulos
- Nota: ya he actualizado la lista para que haya menos nulos ✨

In [8]:
# Filtrar filas donde el continente es nulo
rows_with_null_continent = df[df['continent'].isnull()]

# Mostrar las filas con valores nulos en 'continent'
display(rows_with_null_continent[['Country', 'continent']])

Unnamed: 0,Country,continent
12,Martinique,
20,Niger (the),
26,Taiwan (Province of China),
39,Hong Kong,
41,Niger (the),
42,Taiwan (Province of China),
57,Morocco,
60,Taiwan (Province of China),
63,Burkina Faso,
73,Niger (the),


In [9]:
# Valores unicos de paises nulos
print(f"Sus valores únicos son: {rows_with_null_continent['Country'].unique()}\n")


Sus valores únicos son: ['Martinique' 'Niger (the)' 'Taiwan (Province of China)' 'Hong Kong'
 'Morocco' 'Burkina Faso' 'Anguilla' 'Germany Fed Rep' 'Egypt'
 'Guadeloupe' 'Montserrat' 'Dominica' 'Nicaragua' 'Ghana' 'New Caledonia'
 'Yemen Arab Rep' 'Bermuda' 'Viet Nam' 'Netherlands Antilles'
 'Saint Lucia' 'Mauritius' 'Guam' 'Congo (the)' 'Uruguay' 'Lesotho'
 'Yemen P Dem Rep' 'Czechoslovakia' 'Sierra Leone' 'Germany Dem Rep'
 'Turks and Caicos Islands (the)'
 "Korea (the Democratic People's Republic of)" 'Virgin Island (U.S.)'
 'Russian Federation (the)' 'Serbia Montenegro' 'Ukraine' 'Belarus'
 'Macao' 'Macedonia (the former Yugoslav Republic of)'
 'Moldova (the Republic of)' 'French Guiana' 'Virgin Island (British)'
 'Cayman Islands (the)' 'Saint Helena, Ascension and Tristan da Cunha'
 'Northern Mariana Islands (the)' 'South Sudan' 'Saint Barthélemy'
 'Isle of Man']



In [10]:
dicc_pais_continente = {
    # Africa
    'Niger (the)': 'Africa',
    'Morocco': 'Africa',
    'Burkina Faso': 'Africa',
    'Egypt': 'Africa',
    'Ghana': 'Africa',
    'Mauritius': 'Africa',
    'Congo (the)': 'Africa',
    'Lesotho': 'Africa',
    'Sierra Leone': 'Africa',
    'Saint Helena, Ascension and Tristan da Cunha': 'Africa',
    'South Sudan': 'Africa',

    # Americas
    'Martinique': 'Americas',
    'Anguilla': 'Americas',
    'Guadeloupe': 'Americas',
    'Montserrat': 'Americas',
    'Dominica': 'Americas',
    'Nicaragua': 'Americas',
    'Bermuda': 'Americas',
    'Netherlands Antilles': 'Americas',
    'Saint Lucia': 'Americas',
    'Turks and Caicos Islands (the)': 'Americas',
    'Virgin Island (U.S.)': 'Americas',
    'Virgin Island (British)': 'Americas',
    'Cayman Islands (the)': 'Americas',
    'Saint Barthélemy': 'Americas',
    'Uruguay': 'Americas',
    'French Guiana': 'Americas',

    # Asia
    'Taiwan (Province of China)': 'Asia',
    'Hong Kong': 'Asia',
    'Yemen Arab Rep': 'Asia',
    'Yemen P Dem Rep': 'Asia',
    'Viet Nam': 'Asia',
    "Korea (the Democratic People's Republic of)": 'Asia',
    'Macao': 'Asia',

    # Europe
    'Germany Fed Rep': 'Europe',
    'Czechoslovakia': 'Europe',
    'Germany Dem Rep': 'Europe',
    'Serbia Montenegro': 'Europe',
    'Ukraine': 'Europe',
    'Belarus': 'Europe',
    'Macedonia (the former Yugoslav Republic of)': 'Europe',
    'Moldova (the Republic of)': 'Europe',
    'Isle of Man': 'Europe',

    # Oceania
    'New Caledonia': 'Oceania',
    'Guam': 'Oceania',
    'Northern Mariana Islands (the)': 'Oceania',

    # Middle East
    'Russian Federation (the)': 'Europe/Asia',  # Nota: Rusia está dividida entre Europa y Asia
}


In [11]:
nan_continent = df[df['continent'].isna()]
nan_continent

Unnamed: 0,Year,Country,ISO,Disaster Group,Disaster Subroup,Disaster Type,Disaster Subtype,Total Events,Total Affected,Total Deaths,"Total Damage (USD, original)","Total Damage (USD, adjusted)",CPI,continent
12,1902,Martinique,MTQ,Natural,Geophysical,Volcanic activity,Ash fall,1,,30000.0,,,29630477852157.0,
20,1903,Niger (the),NER,Natural,Climatological,Drought,Drought,1,,,,,30770111615701.0,
26,1904,Taiwan (Province of China),TWN,Natural,Geophysical,Earthquake,Ground movement,2,2349.0,148.0,,,30770111615701.0,
39,1906,Hong Kong,HKG,Natural,Meteorological,Storm,Tropical cyclone,1,,10000.0,20000000.0,649981400.0,30770111615701.0,
41,1906,Niger (the),NER,Natural,Climatological,Drought,Drought,1,,,,,30770111615701.0,
42,1906,Taiwan (Province of China),TWN,Natural,Geophysical,Earthquake,Ground movement,2,25918.0,1273.0,,,30770111615701.0,
57,1909,Morocco,MAR,Natural,Geophysical,Earthquake,Ground movement,1,,100.0,,,30770111615701.0,
60,1909,Taiwan (Province of China),TWN,Natural,Geophysical,Earthquake,Ground movement,1,417.0,9.0,,,30770111615701.0,
63,1910,Burkina Faso,BFA,Natural,Climatological,Drought,Drought,1,,,,,31909745379246.0,
73,1910,Niger (the),NER,Natural,Climatological,Drought,Drought,1,32000.0,85000.0,,,31909745379246.0,


In [12]:
#Sustituimos los nan de los países que quedan

df['continent'] = df['continent'].fillna(df['Country'].map(dicc_pais_continente))

In [13]:
nan_continent_after = df[df['continent'].isna()]
nan_continent_after

Unnamed: 0,Year,Country,ISO,Disaster Group,Disaster Subroup,Disaster Type,Disaster Subtype,Total Events,Total Affected,Total Deaths,"Total Damage (USD, original)","Total Damage (USD, adjusted)",CPI,continent


# Creamos columna DEVELOPMENT

In [14]:
# Sacamos los valores únicos de países para pasar al ChatGpt la lista y nos devuelva una clasificación
df['Country'].unique()

array(['Cabo Verde', 'India', 'Jamaica', 'Japan', 'Turkey',
       'United States of America (the)', 'China', 'Guatemala', 'Myanmar',
       'Martinique', 'Soviet Union', 'Saint Vincent and the Grenadines',
       'Canada', 'Comoros (the)', 'Iran (Islamic Republic of)', 'Israel',
       'Niger (the)', 'Bangladesh', 'Greece',
       'Taiwan (Province of China)', 'Albania', 'Italy',
       'Philippines (the)', 'Belgium', 'Chile', 'Colombia', 'Hong Kong',
       'Romania', 'France', 'Haiti', 'Morocco', 'Pakistan', 'Portugal',
       'Burkina Faso', 'Costa Rica', 'Algeria', 'Gambia (the)',
       'Guinea-Bissau', 'Mali', 'Mauritania', 'Senegal', 'Chad',
       'Kazakhstan', 'Mexico', 'Indonesia', 'Peru', 'Tokelau',
       'Puerto Rico', 'Anguilla', 'Argentina', 'Germany Fed Rep',
       'Ecuador', 'Bahamas (the)', 'Cuba', 'Egypt', 'Jordan', 'Bulgaria',
       'Guadeloupe', 'Saint Kitts and Nevis', 'Montserrat', 'Poland',
       'New Zealand', 'Dominica', 'Dominican Republic (the)', 'Nicara

In [15]:
# Esta es la clasificación que nos devuelve
country_development = {
    'Cabo Verde': 'Developing',
    'India': 'Developing',
    'Jamaica': 'Developing',
    'Japan': 'Developed',
    'Turkey': 'Developing',
    'United States of America (the)': 'Developed',
    'China': 'Developing',
    'Guatemala': 'Developing',
    'Myanmar': 'Developing',
    'Martinique': 'Developed',
    'Soviet Union': 'Historical',
    'Saint Vincent and the Grenadines': 'Developing',
    'Canada': 'Developed',
    'Comoros (the)': 'Underdeveloped',
    'Iran (Islamic Republic of)': 'Developing',
    'Israel': 'Developed',
    'Niger (the)': 'Underdeveloped',
    'Bangladesh': 'Developing',
    'Greece': 'Developed',
    'Taiwan (Province of China)': 'Developed',
    'Albania': 'Developing',
    'Italy': 'Developed',
    'Philippines (the)': 'Developing',
    'Belgium': 'Developed',
    'Chile': 'Developed',
    'Colombia': 'Developing',
    'Hong Kong': 'Developed',
    'Romania': 'Developing',
    'France': 'Developed',
    'Haiti': 'Underdeveloped',
    'Morocco': 'Developing',
    'Pakistan': 'Developing',
    'Portugal': 'Developed',
    'Burkina Faso': 'Underdeveloped',
    'Costa Rica': 'Developing',
    'Algeria': 'Developing',
    'Gambia (the)': 'Underdeveloped',
    'Guinea-Bissau': 'Underdeveloped',
    'Mali': 'Underdeveloped',
    'Mauritania': 'Underdeveloped',
    'Senegal': 'Underdeveloped',
    'Chad': 'Underdeveloped',
    'Kazakhstan': 'Developing',
    'Mexico': 'Developing',
    'Indonesia': 'Developing',
    'Peru': 'Developing',
    'Tokelau': 'Underdeveloped',
    'Puerto Rico': 'Developed',
    'Anguilla': 'Developing',
    'Argentina': 'Developing',
    'Germany Fed Rep': 'Developed',
    'Ecuador': 'Developing',
    'Bahamas (the)': 'Developed',
    'Cuba': 'Developing',
    'Egypt': 'Developing',
    'Jordan': 'Developing',
    'Bulgaria': 'Developing',
    'Guadeloupe': 'Developed',
    'Saint Kitts and Nevis': 'Developing',
    'Montserrat': 'Developing',
    'Poland': 'Developed',
    'New Zealand': 'Developed',
    'Dominica': 'Developing',
    'Dominican Republic (the)': 'Developing',
    'Nicaragua': 'Developing',
    'Armenia': 'Developing',
    'Belize': 'Developing',
    'Fiji': 'Developing',
    'Honduras': 'Developing',
    'Solomon Islands': 'Underdeveloped',
    'Trinidad and Tobago': 'Developing',
    'El Salvador': 'Developing',
    'Korea (the Republic of)': 'Developed',
    'Norway': 'Developed',
    'Papua New Guinea': 'Underdeveloped',
    'Australia': 'Developed',
    'Ghana': 'Developing',
    'New Caledonia': 'Developed',
    'Vanuatu': 'Developing',
    'Spain': 'Developed',
    'Yemen Arab Rep': 'Underdeveloped',
    'Cook Islands (the)': 'Developed',
    'Yugoslavia': 'Historical',
    'Libya': 'Developing',
    'Azores Islands': 'Developed',
    'Tonga': 'Developing',
    'Bermuda': 'Developed',
    'Brazil': 'Developing',
    'Réunion': 'Developed',
    'Antigua and Barbuda': 'Developing',
    'Venezuela (Bolivarian Republic of)': 'Developing',
    'Switzerland': 'Developed',
    'Austria': 'Developed',
    'United Kingdom of Great Britain and Northern Ireland (the)': 'Developed',
    'South Africa': 'Developing',
    'Cyprus': 'Developed',
    'Netherlands (the)': 'Developed',
    'Viet Nam': 'Developing',
    'Afghanistan': 'Underdeveloped',
    'Netherlands Antilles': 'Developed',
    'Iraq': 'Developing',
    'Nepal': 'Developing',
    'Barbados': 'Developed',
    'Lebanon': 'Developing',
    'Mozambique': 'Underdeveloped',
    'Sri Lanka': 'Developing',
    'Mongolia': 'Developing',
    'Canary Is': 'Developed',
    'Tunisia': 'Developing',
    'French Polynesia': 'Developed',
    'Niue': 'Developed',
    'Saint Lucia': 'Developing',
    'Mauritius': 'Developing',
    'Ethiopia': 'Underdeveloped',
    'Somalia': 'Underdeveloped',
    'Guam': 'Developed',
    'Thailand': 'Developing',
    'Grenada': 'Developing',
    'Paraguay': 'Developing',
    'Kenya': 'Developing',
    'Panama': 'Developing',
    'Saudi Arabia': 'Developing',
    'Tanzania, United Republic of': 'Developing',
    'Samoa': 'Developing',
    'Bolivia (Plurinational State of)': 'Developing',
    'Botswana': 'Developing',
    'Malaysia': 'Developing',
    'Wallis and Futuna': 'Developed',
    'American Samoa': 'Developed',
    'Congo (the Democratic Republic of the)': 'Underdeveloped',
    "Lao People's Democratic Republic (the)": 'Underdeveloped',
    'Togo': 'Underdeveloped',
    'Uganda': 'Underdeveloped',
    'Congo (the)': 'Underdeveloped',
    'Malawi': 'Underdeveloped',
    'Syrian Arab Republic': 'Developing',
    'Uruguay': 'Developed',
    'Lesotho': 'Underdeveloped',
    'Madagascar': 'Underdeveloped',
    'Benin': 'Underdeveloped',
    'Suriname': 'Developing',
    'Hungary': 'Developed',
    'Cameroon': 'Developing',
    'Guyana': 'Developing',
    'Kiribati': 'Underdeveloped',
    'Tuvalu': 'Underdeveloped',
    'Yemen P Dem Rep': 'Underdeveloped',
    'Central African Republic': 'Underdeveloped',
    'Iceland': 'Developed',
    'Czechoslovakia': 'Historical',
    'Rwanda': 'Developing',
    'Sudan (the)': 'Underdeveloped',
    'Sierra Leone': 'Underdeveloped',
    'Zimbabwe': 'Underdeveloped',
    'Denmark': 'Developed',
    'Sweden': 'Developed',
    'Djibouti': 'Underdeveloped',
    'Oman': 'Developing',
    'Nigeria': 'Developing',
    'Zambia': 'Developing',
    'Côte d’Ivoire': 'Developing',
    'Guinea': 'Underdeveloped',
    'Liberia': 'Underdeveloped',
    'Angola': 'Underdeveloped',
    'Namibia': 'Developing',
    'Eswatini': 'Developing',
    'Ireland': 'Developed',
    'Luxembourg': 'Developed',
    'Sao Tome and Principe': 'Underdeveloped',
    'Germany Dem Rep': 'Historical',
    'Turks and Caicos Islands (the)': 'Developed',
    'Micronesia (Federated States of)': 'Developing',
    'Cambodia': 'Developing',
    'Maldives': 'Developing',
    "Korea (the Democratic People's Republic of)": 'Underdeveloped',
    'Gabon': 'Developing',
    'Burundi': 'Underdeveloped',
    'Virgin Island (U.S.)': 'Developed',
    'Germany': 'Developed',
    'Finland': 'Developed',
    'Georgia': 'Developing',
    'Marshall Islands (the)': 'Developing',
    'Yemen': 'Underdeveloped',
    'Kyrgyzstan': 'Developing',
    'Lithuania': 'Developed',
    'Russian Federation (the)': 'Developing',
    'Serbia Montenegro': 'Historical',
    'Tajikistan': 'Developing',
    'Ukraine': 'Developing',
    'Uzbekistan': 'Developing',
    'Belarus': 'Developing',
    'Eritrea': 'Underdeveloped',
    'Macao': 'Developed',
    'Macedonia (the former Yugoslav Republic of)': 'Developing',
    'Turkmenistan': 'Developing',
    'Bhutan': 'Developing',
    'Moldova (the Republic of)': 'Developing',
    'Azerbaijan': 'Developing',
    'Czech Republic (the)': 'Developed',
    'French Guiana': 'Developed'
}


In [16]:
# Crear nueva columna 'Development' basada en 'Country'
df['Development'] = df['Country'].map(country_development)

# Mostrar un ejemplo
display(df[['Country', 'Development']].head())

Unnamed: 0,Country,Development
0,Cabo Verde,Developing
1,India,Developing
2,Jamaica,Developing
3,Japan,Developed
4,Turkey,Developing


In [17]:
#Comprobamos si ha llegado a todos los países y vemos que no, que todavía quedan algunos como nulos. Creamos un df con ellos
nan_Development = df[df['Development'].isna()]
nan_Development.head()

Unnamed: 0,Year,Country,ISO,Disaster Group,Disaster Subroup,Disaster Type,Disaster Subtype,Total Events,Total Affected,Total Deaths,"Total Damage (USD, original)","Total Damage (USD, adjusted)",CPI,continent,Development
4142,1996,Croatia,HRV,Natural,Geophysical,Earthquake,Ground movement,1,2000.0,,,,535955608540084,Europe,
4239,1996,Virgin Island (British),VGB,Natural,Meteorological,Storm,Tropical cyclone,1,3.0,,2000000.0,3731652.0,535955608540084,Americas,
4323,1997,Croatia,HRV,Natural,Climatological,Wildfire,Forest fire,1,,,,,548484541709928,Europe,
4351,1997,Kuwait,KWT,Natural,Hydrological,Flood,Flash flood,1,200.0,2.0,,,548484541709928,Middle East,
4405,1997,Slovakia,SVK,Natural,Hydrological,Flood,Riverine flood,1,,,60000000.0,109392326.0,548484541709928,Europe,


In [18]:
# Lista de países con valores úncios en nan_development. Con esta lista volvemos a ChatGPT y volvemos a pedirle la clasificación
print(f"Sus valores únicos son: {nan_Development['Country'].unique()}\n")


Sus valores únicos son: ['Croatia' 'Virgin Island (British)' 'Kuwait' 'Slovakia' 'Seychelles'
 'Brunei Darussalam' 'Slovenia' 'Bosnia and Herzegovina' 'Latvia'
 'Cayman Islands (the)' 'Saint Helena, Ascension and Tristan da Cunha'
 'Timor-Leste' 'Northern Mariana Islands (the)' 'Estonia' 'Montenegro'
 'Serbia' 'Palestine, State of' 'South Sudan' 'Palau'
 'United Arab Emirates (the)' 'Saint Barthélemy'
 'Saint Martin (French Part)' 'Sint Maarten (Dutch part)' 'Qatar'
 'Isle of Man']



In [19]:
# Esta es la clasificación con los países que restantes
country_development_new = {
    'Croatia': 'Developed',
    'Virgin Island (British)': 'Developed',
    'Kuwait': 'Developed',
    'Slovakia': 'Developed',
    'Seychelles': 'Developing',
    'Brunei Darussalam': 'Developed',
    'Slovenia': 'Developed',
    'Bosnia and Herzegovina': 'Developing',
    'Latvia': 'Developed',
    'Cayman Islands (the)': 'Developed',
    'Saint Helena, Ascension and Tristan da Cunha': 'Underdeveloped',
    'Timor-Leste': 'Underdeveloped',
    'Northern Mariana Islands (the)': 'Developed',
    'Estonia': 'Developed',
    'Montenegro': 'Developing',
    'Serbia': 'Developing',
    'Palestine, State of': 'Underdeveloped',
    'South Sudan': 'Underdeveloped',
    'Palau': 'Developing',
    'United Arab Emirates (the)': 'Developed',
    'Saint Barthélemy': 'Developed',
    'Saint Martin (French Part)': 'Developed',
    'Sint Maarten (Dutch part)': 'Developed',
    'Qatar': 'Developed',
    'Isle of Man': 'Developed'
}


In [20]:
#Sustituimos los nan de Development de los países que quedan con los valores de la nueva lista

df['Development'] = df['Development'].fillna(df['Country'].map(country_development_new))

In [21]:
#Comprobamos si quedan nulos.... No. Ya está completo de datos
nan_Development_after = df[df['Development'].isna()]
nan_Development_after.head()

Unnamed: 0,Year,Country,ISO,Disaster Group,Disaster Subroup,Disaster Type,Disaster Subtype,Total Events,Total Affected,Total Deaths,"Total Damage (USD, original)","Total Damage (USD, adjusted)",CPI,continent,Development



# Añadimos columna de impacto financiero 
Teniendo en cuenta el índice de precios al consumo, ajustamos las pérdidas económicas a un año base donde el CPI (IPC) sea igual a 100

Pérdidas ajustadas al CPI = Pérdidas en USD x CPI base / CPI

In [22]:
# Pasamos los valores de CPI de tipo object que ahora son a float, para poder operar con ellos
df['CPI'] = pd.to_numeric(df['CPI'], errors='coerce')

In [23]:
CPI_base = 100

df['adjusted_loss_CPI'] = df['Total Damage (USD, adjusted)'] * (CPI_base / df['CPI'])


In [24]:
df.sample(10)

Unnamed: 0,Year,Country,ISO,Disaster Group,Disaster Subroup,Disaster Type,Disaster Subtype,Total Events,Total Affected,Total Deaths,"Total Damage (USD, original)","Total Damage (USD, adjusted)",CPI,continent,Development,adjusted_loss_CPI
2740,1987,Saint Lucia,LCA,Natural,Meteorological,Storm,Tropical cyclone,1,,,,,,Americas,Developing,
4269,1997,Bulgaria,BGR,Natural,Hydrological,Flood,Riverine flood,1,200.0,,,,,Europe,Developing,
2979,1989,Brazil,BRA,Natural,Meteorological,Storm,,1,45000.0,36.0,,,,Americas,Developing,
6596,2006,Indonesia,IDN,Natural,Climatological,Wildfire,Forest fire,1,200.0,,14000000.0,20324094.0,,Asia,Developing,
3963,1995,Ethiopia,ETH,Natural,Hydrological,Flood,Riverine flood,1,93875.0,27.0,500000.0,960259.0,,Africa,Underdeveloped,
9073,2017,Kenya,KEN,Natural,Hydrological,Flood,Riverine flood,1,25000.0,26.0,,,,Africa,Developing,
2799,1988,Afghanistan,AFG,Natural,Hydrological,Flood,,1,161000.0,,260000000.0,643424108.0,,Asia,Underdeveloped,
3958,1995,Egypt,EGY,Natural,Meteorological,Extreme temperature,Heat wave,1,,32.0,,,,Africa,Developing,
1198,1970,Bangladesh,BGD,Natural,Meteorological,Storm,Tropical cyclone,1,3648000.0,300000.0,86400000.0,651265447.0,,Asia,Developing,
6149,2004,Russian Federation (the),RUS,Natural,Geophysical,Earthquake,Ground movement,1,138.0,,,,,Europe/Asia,Developing,


In [25]:
nan_CPI = df[(df['CPI'].notnull()) & (df['Total Damage (USD, adjusted)'].notnull())]

nan_CPI. shape

(64, 16)

Son pocos los países que tienen los dos datos necesarios para hacer el cálculo, por lo que cualquier estudio sobre ellos no sería muy válido. A no ser que queramos puntualizar algo

Comprobación si la ausencia de datos en 'Total Damage (USD, adjusted)' puede deberse a que en ese país no se realicen cálculos por tratarse de un país subdesarrollado
Para descartar los desastres naturales que se producen fuera de núcleos habitados, vamos a incluir que existan afectados

In [26]:
# Vemos los valores únicos de Development
df['Development'].unique()

array(['Developing', 'Developed', 'Historical', 'Underdeveloped'],
      dtype=object)

El valor 'Historical' se refiere a países que actualmente no existen con ese nombre

In [27]:
# Comprobamos las filas donde 'Total Damage (USD, adjusted)' y 'Total Affected' son nulos
nan_Total_Damage = df[(df['Total Damage (USD, adjusted)'].isna()) & (df['Total Affected'].isna())]

# Comprobamos las filas donde 'Total Damage (USD, adjusted)' es nulo y 'Development' es 'Underdeveloped'
nan_Total_Damage0 = df[(df['Total Damage (USD, adjusted)'].isna()) & (df['Total Affected'].isna()) & (df['Development'] == 'Underdeveloped')]

# Comprobamos las filas donde 'Total Damage (USD, adjusted)' es nulo y 'Development' es 'Developing'
nan_Total_Damage1 = df[(df['Total Damage (USD, adjusted)'].isna()) & (df['Total Affected'].isna()) & (df['Development'] == 'Developing')]

# Comprobamos las filas donde 'Total Damage (USD, adjusted)' es nulo y 'Development' es 'Developed'
nan_Total_Damage2 = df[(df['Total Damage (USD, adjusted)'].isna()) & (df['Total Affected'].isna()) & (df['Development'] == 'Developed')]

# Comprobamos las filas donde 'Total Damage (USD, adjusted)' es nulo y 'Development' es 'Historical'
nan_Total_Damage3 = df[(df['Total Damage (USD, adjusted)'].isna()) & (df['Total Affected'].isna()) & (df['Development'] == 'Historical')]



print (f' El total de nulos de Total Damage (USD, adjusted) es {nan_Total_Damage.shape[0]}, que corresponde a {nan_Total_Damage['Country'].nunique()} paises')
print ('-------------------------------------------------')
print (f' El total de registros nulos en Total Damage (USD, adjusted) que pertenece a países subdesarrollados es de {nan_Total_Damage0.shape[0]}, que corresponde a {nan_Total_Damage0['Country'].nunique()} paises')
print ('-------------------------------------------------')
print (f' El total de registros nulos en Total Damage (USD, adjusted) que pertenece a países en vías de desarrollo es de {nan_Total_Damage1.shape[0]}, que corresponde a {nan_Total_Damage1['Country'].nunique()} paises')
print ('-------------------------------------------------')
print (f' El total de registros nulos en Total Damage (USD, adjusted) que pertenece a países desarrollados es de {nan_Total_Damage2.shape[0]}, que corresponde a {nan_Total_Damage2['Country'].nunique()} paises')
print ('-------------------------------------------------')
print (f' El total de registros nulos en Total Damage (USD, adjusted) que pertenece a países desaparecidos es de {nan_Total_Damage3.shape[0]}, que corresponde a {nan_Total_Damage3['Country'].nunique()} paises')




 El total de nulos de Total Damage (USD, adjusted) es 2115, que corresponde a 205 paises
-------------------------------------------------
 El total de registros nulos en Total Damage (USD, adjusted) que pertenece a países subdesarrollados es de 241, que corresponde a 43 paises
-------------------------------------------------
 El total de registros nulos en Total Damage (USD, adjusted) que pertenece a países en vías de desarrollo es de 1100, que corresponde a 95 paises
-------------------------------------------------
 El total de registros nulos en Total Damage (USD, adjusted) que pertenece a países desarrollados es de 743, que corresponde a 62 paises
-------------------------------------------------
 El total de registros nulos en Total Damage (USD, adjusted) que pertenece a países desaparecidos es de 31, que corresponde a 5 paises


In [28]:
# Vamos a ver qué países desarrollados no tienen datos
nan_Total_Damage2['Country'].unique()

array(['Japan', 'Martinique', 'Israel', 'United States of America (the)',
       'Belgium', 'Italy', 'Canada', 'France', 'Portugal',
       'Taiwan (Province of China)', 'Greece', 'Chile', 'Hong Kong',
       'Bahamas (the)', 'Guadeloupe', 'Poland', 'New Zealand',
       'Korea (the Republic of)', 'Norway', 'New Caledonia',
       'Cook Islands (the)', 'Azores Islands', 'Bermuda', 'Réunion',
       'Austria',
       'United Kingdom of Great Britain and Northern Ireland (the)',
       'Spain', 'Netherlands Antilles', 'Australia', 'Barbados',
       'Canary Is', 'French Polynesia', 'Puerto Rico', 'Switzerland',
       'Wallis and Futuna', 'American Samoa', 'Niue', 'Netherlands (the)',
       'Iceland', 'Denmark', 'Sweden', 'Cyprus', 'Germany Fed Rep',
       'Ireland', 'Luxembourg', 'Hungary',
       'Turks and Caicos Islands (the)', 'Germany', 'Croatia',
       'Lithuania', 'Slovakia', 'Latvia', 'Czech Republic (the)',
       'Northern Mariana Islands (the)', 'Virgin Island (U.S.)',
   

Podemos hacer dos cosas:
- Considerar todos los desastres ocurridos en zonas deshabitadas, aunque no sea así
- Dejar estos como nan y marcar el resto como 0 - que sí serían los que han ocurrido fuera de zonas habitadas- para distinguir los que no tenemos datos realmente

In [29]:
# conteo historico en Development
conteo = df['Development'].value_counts()

conteo

Development
Developing        5900
Developed         2831
Underdeveloped    1610
Historical          90
Name: count, dtype: int64

In [30]:
filtro = df[df['Development'] == 'Historical']

display(filtro)

Unnamed: 0,Year,Country,ISO,Disaster Group,Disaster Subroup,Disaster Type,Disaster Subtype,Total Events,Total Affected,Total Deaths,"Total Damage (USD, original)","Total Damage (USD, adjusted)",CPI,continent,Development,adjusted_loss_CPI
13,1902,Soviet Union,SUN,Natural,Geophysical,Earthquake,Ground movement,2,142652.0,4648.0,,,,Unknown,Historical,
86,1911,Soviet Union,SUN,Natural,Geophysical,Earthquake,Ground movement,1,,90.0,,,,Unknown,Historical,
142,1921,Soviet Union,SUN,Natural,Climatological,Drought,Drought,1,5000000.0,1200000.0,,,,Unknown,Historical,
160,1923,Soviet Union,SUN,Natural,Meteorological,Storm,Convective storm,1,,23.0,,,,Unknown,Historical,
235,1930,Soviet Union,SUN,Natural,Geophysical,Earthquake,Ground movement,1,208.0,151.0,,,,Unknown,Historical,
382,1943,Yugoslavia,YUG,Natural,Geophysical,Earthquake,Ground movement,1,,19.0,,,,Unknown,Historical,
417,1946,Soviet Union,SUN,Natural,Geophysical,Earthquake,Ground movement,1,,400.0,,,,Unknown,Historical,
451,1948,Soviet Union,SUN,Natural,Geophysical,Earthquake,Ground movement,1,,110000.0,25000000.0,303834800.0,,Unknown,Historical,
467,1949,Soviet Union,SUN,Natural,Geophysical,Earthquake,Ground movement,1,,3500.0,,,,Unknown,Historical,
468,1949,Soviet Union,SUN,Natural,Hydrological,Landslide,,1,,12000.0,,,,Unknown,Historical,


In [31]:
valores_unicos = filtro['Country'].unique()

display(valores_unicos)

array(['Soviet Union', 'Yugoslavia', 'Czechoslovakia', 'Germany Dem Rep',
       'Serbia Montenegro'], dtype=object)

Tenemos un total de 90 valores con Development igual a Historical, no dedicaria mas tiempo a modificarlo 

In [32]:
exploracion(df)

El DataFrame tiene 10431 filas y 16 columnas.
Tiene 0 datos duplicados, lo que supone un porcentaje de 0.0% de los datos.
Hay 7 columnas con datos nulos, y son:
['Disaster Subtype', 'Total Affected', 'Total Deaths', 'Total Damage (USD, original)', 'Total Damage (USD, adjusted)', 'CPI', 'adjusted_loss_CPI']
y sin nulos hay 9 columnas y son:
['Year', 'Country', 'ISO', 'Disaster Group', 'Disaster Subroup', 'Disaster Type', 'Total Events', 'continent', 'Development']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Year,0.0%,100.0%,int64,124
Country,0.0%,100.0%,object,225
ISO,0.0%,100.0%,object,225
Disaster Group,0.0%,100.0%,object,1
Disaster Subroup,0.0%,100.0%,object,5


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
Country,10431,225,United States of America (the),405
ISO,10431,225,USA,405
Disaster Group,10431,1,Natural,10431
Disaster Subroup,10431,5,Hydrological,4489
Disaster Type,10431,13,Flood,3837
Disaster Subtype,8298,25,Riverine flood,1628
continent,10431,8,Asia,3293
Development,10431,4,Developing,5900


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,10431.0,1995.61,22.00119,1900.0,1986.0,2001.0,2011.0,2023.0
Total Events,10431.0,1.446649,1.246589,1.0,1.0,1.0,1.0,20.0
Total Affected,7586.0,1125969.0,9760891.0,1.0,1200.0,11414.0,119304.5,330000000.0
Total Deaths,7375.0,3107.711,72555.89,1.0,6.0,23.0,90.0,3700000.0
"Total Damage (USD, original)",3834.0,1122262000.0,6792339000.0,2000.0,10000000.0,68000000.0,400000000.0,210000000000.0
"Total Damage (USD, adjusted)",3830.0,1748704000.0,9115319000.0,2469.0,20209265.5,146924694.0,784776700.0,273218400000.0
CPI,235.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0
adjusted_loss_CPI,64.0,3497469000.0,13159710000.0,240000.0,25750000.0,147700000.0,1230750000.0,101000000000.0


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Year,0.0%,100.0%,int64,124
Country,0.0%,100.0%,object,225
ISO,0.0%,100.0%,object,225
Disaster Group,0.0%,100.0%,object,1
Disaster Subroup,0.0%,100.0%,object,5
Disaster Type,0.0%,100.0%,object,13
Disaster Subtype,20.45%,79.55%,object,25
Total Events,0.0%,100.0%,int64,19
Total Affected,27.27%,72.73%,float64,4043
Total Deaths,29.3%,70.7%,float64,815


In [33]:
# creamos una lista con los nombres de las columnas categoricas 
columnas = df.select_dtypes(include='object').columns.tolist()
print(columnas)
# empezamos a iterar por cada una de las columnas para sacar sus valores únicos y sus frecuencias
for columna in columnas:
    print(f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{columna.upper()}' -----------\n")
    print(f"Sus valores únicos son: {df[columna].unique()}\n")
    print(f"Las frecuencias de los valores únicos de las categorías son: {df[columna].value_counts()} ")

['Country', 'ISO', 'Disaster Group', 'Disaster Subroup', 'Disaster Type', 'Disaster Subtype', 'continent', 'Development']
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'COUNTRY' -----------

Sus valores únicos son: ['Cabo Verde' 'India' 'Jamaica' 'Japan' 'Turkey'
 'United States of America (the)' 'China' 'Guatemala' 'Myanmar'
 'Martinique' 'Soviet Union' 'Saint Vincent and the Grenadines' 'Canada'
 'Comoros (the)' 'Iran (Islamic Republic of)' 'Israel' 'Niger (the)'
 'Bangladesh' 'Greece' 'Taiwan (Province of China)' 'Albania' 'Italy'
 'Philippines (the)' 'Belgium' 'Chile' 'Colombia' 'Hong Kong' 'Romania'
 'France' 'Haiti' 'Morocco' 'Pakistan' 'Portugal' 'Burkina Faso'
 'Costa Rica' 'Algeria' 'Gambia (the)' 'Guinea-Bissau' 'Mali' 'Mauritania'
 'Senegal' 'Chad' 'Kazakhstan' 'Mexico' 'Indonesia' 'Peru' 'Tokelau'
 'Puerto Rico' 'Anguilla' 'Argentina' 'Germany Fed Rep' 'Ecuador'
 'Bahamas (the)' 'Cuba' 'Egypt' 'Jordan' 'Bulgaria' 'Guadeloupe'
 'Saint Kitts and Nevis' 'Montserrat' 'Poland' 'N

# GUARDAMOS CSV DESPUES DE APLICAR EDA PARA TRABAJAR EN TABLEAU ✨

In [34]:
df.to_csv("archivos/EDA_aplicado_natural_disasters.csv",index=False)