## Ejercicios de pair programming 23 enero: Encoding

In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd

# Para la codificación de las variables categóricas
# -----------------------------------------------------------------------
from sklearn.preprocessing import LabelEncoder # para realizar el Label Encoding 
from sklearn.preprocessing import OneHotEncoder  # para realizar el One-Hot Encoding

# Para evitar que salgan los warnings en jupyter
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [58]:
# Load the outlier-free, standardised dataset; the first CSV column is the saved index.
df = pd.read_csv(
    "../datos/world_risk_index_sin_outliers_est.csv",
    index_col=0,
)
df.head(n=2)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn
0,papua new guinea,Very High,Very High,Very High,Very High,2.90648,23.26,1.296928,1.179006,0.962932,1.537045,2011.0,0.895683
1,madagascar,Very High,Very High,Very High,Very High,2.594391,20.68,1.545395,2.260942,1.017385,0.974085,2011.0,0.792566


In [59]:
# Load the rows that were set aside as outliers; the first CSV column is the saved index.
outliers = pd.read_csv(
    "../datos/world_risk_index_outliers_est.csv",
    index_col=0,
)
outliers.head(n=2)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn
0,vanuatu,Very High,Very High,High,High,1.640675,56.33,0.801253,0.792708,0.541556,0.926242,2011.0,0.563758
1,tonga,Very High,Very High,Medium,Medium,1.29257,56.04,0.376459,0.030528,0.707655,0.185736,2011.0,0.560853


### Info columnas
|Columna| Tipo de dato | Descripcion |
|-------|--------------|-------------|
|Region| String|	Name of the region.
|WRI	| Decimal |	World Risk Score of the region.
|Exposure	| Decimal |	Risk/exposure to natural hazards such as earthquakes, hurricanes, floods, droughts, and sea level rise.
|Vulnerability	| Decimal |	Vulnerability depending on infrastructure, nutrition, housing situation, and economic framework conditions.
|Susceptibility	| Decimal |	Susceptibility depending on infrastructure, nutrition, housing situation, and economic framework conditions.
|Lack of Coping Capabilities	| Decimal |	Coping capacities in dependence of governance, preparedness and early warning, medical care, and social and material security.
|Lack of Adaptive Capacities| Decimal |	Adaptive capacities related to coming natural events, climate change, and other challenges.
|Year	| Decimal |	Year data is being described.
|WRI Category| String|	WRI Category for the given WRI Score.
|Exposure Category| String|	Exposure Category for the given Exposure Score.
|Vulnerability Category| String|	Vulnerability Category for the given Vulnerability Score.
|Susceptibility Category| String|	Susceptibility Category for the given Susceptibility Score.

Link a la base de datos : https://www.kaggle.com/datasets/tr1gg3rtrash/global-disaster-risk-index-time-series-dataset

### Nuestra variable respuesta es `exposure_Sklearn`: queremos saber cuál es el riesgo de desastres naturales en función del resto de variables



---

### df limpio

---

Decidimos no hacer encoding de la variable *region*, ya que tiene demasiados valores únicos y, según el análisis previo, consideramos que no tiene suficiente importancia en la predicción de la variable respuesta.  
Tampoco lo haremos con *Exposure Category*, ya que, como comentamos, esta columna sale de un cálculo del *World Risk Score*.

In [60]:
# Check the distinct labels of each categorical column to decide how to encode it.
df.loc[:, "wri_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [61]:
# Distinct labels present in the vulnerability category column.
df.loc[:, "vulnerability_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [62]:
# Distinct labels present in the susceptibility category column.
df.loc[:, "susceptibility_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [63]:
def encoder_map(df, columna, orden_valores):
    """Add an ordinal-encoded version of a categorical column to ``df``.

    Each label in ``orden_valores`` is mapped to its 1-based position (the
    first label becomes 1, the second 2, ...). The encoded values are stored
    in a new column named ``columna + "map"``.

    Args:
        df: DataFrame holding the column to encode. It is modified in place.
        columna: Name of the categorical column to encode.
        orden_valores: Labels listed from lowest to highest rank.

    Returns:
        The same ``df`` object, with the encoded column added.

    Raises:
        ValueError: If the column holds a non-null label that is missing from
            ``orden_valores``.
    """
    ordinal_dict = {
        valor: posicion for posicion, valor in enumerate(orden_valores, start=1)
    }

    # Series.map turns every label missing from the dict into NaN without any
    # warning, so a typo or an incomplete ordering would silently corrupt the
    # encoding. Fail loudly instead; genuinely missing values are left as NaN.
    desconocidos = set(df[columna].dropna().unique()) - set(ordinal_dict)
    if desconocidos:
        raise ValueError(
            f"Column {columna!r} contains labels missing from orden_valores: "
            f"{sorted(desconocidos, key=str)}"
        )

    df[columna + "map"] = df[columna].map(ordinal_dict)
    return df

In [64]:
# Category labels listed from lowest to highest, which defines the ordinal encoding.
orden_valores = [
    "Very Low",
    "Low",
    "Medium",
    "High",
    "Very High",
]

In [65]:
# Encode the WRI category first and look at a few random rows to check the result.
df = encoder_map(df=df, columna="wri_category", orden_valores=orden_valores)
df.sample(n=5)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,wri_categorymap
498,suriname,Very High,High,Medium,Medium,0.690337,18.12,-0.117912,-0.172949,0.041881,-0.203869,2014.0,0.690248,4
216,kenya,Low,Medium,Very High,Very High,0.249,10.69,1.248655,1.544399,1.000783,0.922052,2013.0,0.393285,3
272,argentina,Low,Low,Low,Medium,-0.778685,9.55,-0.621944,-0.514189,-0.698546,-0.537742,2013.0,0.347722,2
160,haiti,High,Very High,Very High,Very High,1.78107,16.26,1.768305,1.897406,1.281681,1.791425,2013.0,0.615907,5
493,mozambique,Medium,High,Very High,Very High,0.882634,12.73,1.614965,2.160613,0.917776,1.412746,2014.0,0.47482,4


In [66]:
# Encode the remaining categorical columns with the same category order.
for columna in ("susceptibility_category", "vulnerability_category"):
    df = encoder_map(df, columna, orden_valores)

In [67]:
# Random rows to verify the three encoded columns.
df.sample(n=5)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,wri_categorymap,susceptibility_categorymap,vulnerability_categorymap
4,gambia,Very High,Very High,High,High,2.417856,22.2,1.028584,0.884833,0.907151,1.122232,2011.0,0.853317,5,4,4
1700,bahrain,Very Low,Very Low,Low,Very Low,-1.431234,4.27,-0.614845,-1.114921,-0.276868,-0.27469,2016.0,0.136691,1,1,2
584,united kingdom,Medium,Low,Very Low,Very Low,-0.848038,11.6,-1.253049,-0.893827,-1.543895,-1.103594,2014.0,0.429656,2,1,1
892,austria,Medium,Low,Very Low,Very Low,-0.819667,13.6,-1.525652,-1.036888,-2.225885,-1.02699,2017.0,0.509592,2,1,1
1102,mali,High,Very High,Very High,Very High,1.428,15.68,1.455947,1.170336,1.219923,1.752401,2020.0,0.592726,5,5,5


---

### Outliers

---

In [68]:
# Distinct WRI category labels present among the outliers.
outliers.loc[:, "wri_category"].unique()

array(['Very High', 'High', 'Medium', 'Very Low'], dtype=object)

In [69]:
# Distinct vulnerability category labels present among the outliers.
outliers.loc[:, "vulnerability_category"].unique()

array(['High', 'Medium', 'Very High', 'Low', 'Very Low'], dtype=object)

In [70]:
# Distinct susceptibility category labels present among the outliers.
outliers.loc[:, "susceptibility_category"].unique()

array(['High', 'Medium', 'Very High', 'Low', 'Very Low'], dtype=object)

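Para las variables vulnerability_category y susceptibility_category podemos utilizar la misma función que hemos usado para el DF. En wri_category los outliers no contienen la categoría 'Low', por lo que hay que asegurarse de que el resto de categorías conserven en el *map* los mismos valores (1, 3, 4 y 5) que en el DF.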

In [71]:
# Encode the outliers' vulnerability and susceptibility categories with the
# same helper and category order used for the clean frame, then preview the result.
for columna in ("vulnerability_category", "susceptibility_category"):
    outliers = encoder_map(outliers, columna, orden_valores)
outliers.head(n=5)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,vulnerability_categorymap,susceptibility_categorymap
0,vanuatu,Very High,Very High,High,High,1.640675,56.33,0.801253,0.792708,0.541556,0.926242,2011.0,0.563758,4,4
1,tonga,Very High,Very High,Medium,Medium,1.29257,56.04,0.376459,0.030528,0.707655,0.185736,2011.0,0.560853,3,3
2,philippines,Very High,Very High,High,High,0.72511,45.09,0.552087,0.592868,0.773824,0.106661,2011.0,0.451167,4,4
3,solomon islands,Very High,Very High,Very High,High,0.628547,36.4,1.475212,1.440562,0.987862,1.731821,2011.0,0.364119,5,4
4,guatemala,Very High,Very High,High,High,0.315014,38.42,0.588423,0.627259,0.439602,0.589349,2011.0,0.384353,4,4


In [72]:
# 'Low' does not appear among the outliers' WRI categories, but the encoding
# must still match the one used for the clean frame. Reusing the shared helper
# and category order yields the same values (1, 3, 4, 5) for the labels that
# are present, and avoids a hand-written dictionary that would turn any
# 'Low' row into NaN.
outliers = encoder_map(outliers, "wri_category", orden_valores)

In [73]:
# Random rows to verify the three encoded columns in the outliers frame.
outliers.sample(n=5)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,vulnerability_categorymap,susceptibility_categorymap,wri_categorymap
106,japan,Very High,Very High,Very Low,Low,-0.568363,45.91,-1.576206,-1.054185,-2.241607,-0.832358,2017.0,0.459381,1,2,5
1,tonga,Very High,Very High,Medium,Medium,1.29257,56.04,0.376459,0.030528,0.707655,0.185736,2011.0,0.560853,3,3,5
122,costa rica,Very High,Very High,Low,Low,-0.103428,44.92,-0.768147,-0.879441,-0.167401,-1.209612,2019.0,0.449464,2,2,5
195,philippines,Very High,Very High,High,High,1.00884,52.46,0.289943,0.29915,0.648238,-0.226938,2016.0,0.524992,4,4,5
115,papua new guinea,Very High,Very High,Very High,Very High,0.469992,32.54,1.784939,2.494602,1.005417,1.660983,2019.0,0.325453,5,5,5


In [75]:
# Drop the raw exposure label from both frames. Reassigning (instead of
# inplace=True) together with errors="ignore" keeps this cell safe to re-run:
# a second execution is a no-op rather than a KeyError.
df = df.drop(columns="exposure_category", errors="ignore")
outliers = outliers.drop(columns="exposure_category", errors="ignore")

In [77]:
# Persist both frames with their encoded columns.
df.to_csv(path_or_buf="../datos/encoding.csv")
outliers.to_csv(path_or_buf="../datos/outliers_encoding.csv")