# Codifying context and category data to numeric data to work in ML

In [18]:
import pandas as pd

data = {"name": ["Mariana", "Ana", "Elsa", "Gustavo", "Pedro", "Raúl", "Carlos", "José", "Luis"],
       
       "saldo": [10000.00, 8000.00, 9000.00, 2000.00, 2100.00, 12000.00, 5000.0, 10000.00, 200.00],
       
       "pais": ["Argentina", "Bolivia", "Chile", "Colombia", "Costa Rica", "Ecuador", "Mexico", "Peru", "Peru"]}

datos = pd.DataFrame(data)
datos

Unnamed: 0,name,saldo,pais
0,Mariana,10000.0,Argentina
1,Ana,8000.0,Bolivia
2,Elsa,9000.0,Chile
3,Gustavo,2000.0,Colombia
4,Pedro,2100.0,Costa Rica
5,Raúl,12000.0,Ecuador
6,Carlos,5000.0,Mexico
7,José,10000.0,Peru
8,Luis,200.0,Peru


In [2]:
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    9 non-null      object 
 1   saldo   9 non-null      float64
 2   pais    9 non-null      object 
dtypes: float64(1), object(2)
memory usage: 344.0+ bytes


In [3]:
datos["pais"] = datos["pais"].astype("category")
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   name    9 non-null      object  
 1   saldo   9 non-null      float64 
 2   pais    9 non-null      category
dtypes: category(1), float64(1), object(1)
memory usage: 645.0+ bytes


In [4]:
datos

Unnamed: 0,name,saldo,pais
0,Mariana,10000.0,Argentina
1,Ana,8000.0,Bolivia
2,Elsa,9000.0,Chile
3,Gustavo,2000.0,Colombia
4,Pedro,2100.0,Costa Rica
5,Raúl,12000.0,Ecuador
6,Carlos,5000.0,Mexico
7,José,10000.0,Peru
8,Luis,200.0,Peru


### Error: Inappropiate codifying to replace category data to numeric

In [5]:
datos_sesgados = datos.copy()

reemplazos = {"Argentina":1,
              "Bolivia": 2,
              "Chile": 3,
              "Colombia": 4,
              "Costa Rica": 5,
              "Ecuador": 6,
              "Mexico": 7,
              "Peru": 8}
datos_sesgados["pais"].replace(reemplazos, inplace=True)
datos_sesgados

Unnamed: 0,name,saldo,pais
0,Mariana,10000.0,1
1,Ana,8000.0,2
2,Elsa,9000.0,3
3,Gustavo,2000.0,4
4,Pedro,2100.0,5
5,Raúl,12000.0,6
6,Carlos,5000.0,7
7,José,10000.0,8
8,Luis,200.0,8


It does not use, because will be an error of bias (sesgo)

### Codifying with one-hot category

In [12]:
from sklearn.preprocessing import OneHotEncoder

codificador = OneHotEncoder()
codificacion = codificador.fit_transform(datos[["pais"]]) # To create a sparse matrix - Matriz dispersa
print(type(codificacion))
print(codificacion)
print(codificacion.toarray())  # To see the matrix

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
  (6, 6)	1.0
  (7, 7)	1.0
  (8, 7)	1.0
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]


In [13]:
nuevas_cols = pd.DataFrame(codificacion.toarray(),
                          columns = codificador.categories_)
nuevas_cols

Unnamed: 0,Argentina,Bolivia,Chile,Colombia,Costa Rica,Ecuador,Mexico,Peru
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


This is the correct way to dont incorporate bias into the model

In [19]:
data = pd.concat([datos, nuevas_cols], axis="columns")
data

Unnamed: 0,name,saldo,pais,"(Argentina,)","(Bolivia,)","(Chile,)","(Colombia,)","(Costa Rica,)","(Ecuador,)","(Mexico,)","(Peru,)"
0,Mariana,10000.0,Argentina,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ana,8000.0,Bolivia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Elsa,9000.0,Chile,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Gustavo,2000.0,Colombia,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Pedro,2100.0,Costa Rica,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,Raúl,12000.0,Ecuador,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,Carlos,5000.0,Mexico,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,José,10000.0,Peru,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,Luis,200.0,Peru,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
data.drop("pais", axis="columns", inplace=True)
data

Unnamed: 0,name,saldo,"(Argentina,)","(Bolivia,)","(Chile,)","(Colombia,)","(Costa Rica,)","(Ecuador,)","(Mexico,)","(Peru,)"
0,Mariana,10000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ana,8000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Elsa,9000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Gustavo,2000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Pedro,2100.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,Raúl,12000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,Carlos,5000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,José,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,Luis,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


We are ready to input to model