<a href="https://colab.research.google.com/github/marciusdm/artigos/blob/main/encoders/ArtigoSobreEncoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importando os pacotes necessários

In [14]:
# importar os pacotes necessários
from sklearn import preprocessing
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# Utilizando o dataset do Titanic


In [15]:
# Load the modified Titanic dataset straight from its raw GitHub URL.
titanic_url = "https://raw.githubusercontent.com/marciusdm/datasets/main/titanic_modified.csv"
df = pd.read_csv(titanic_url)
# Preview the first rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,No,Third,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,Yes,First,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,712.833,C85,C
2,3,Yes,Third,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,Yes,First,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,No,Third,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Uma breve análise exploratória

In [None]:
# Count the missing values in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Frequency of each embarkation port.
df["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [None]:
# Inspect the categorical variables: number of distinct values and dtype per column.
unique_template = "\n\nValores únicos (por coluna):\n{}\n"
dtype_template = "\n\nTipos das colunas:\n{}"
print(unique_template.format(df.nunique()))
print(dtype_template.format(df.dtypes))




Valores únicos (por coluna):
PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           247
Cabin          147
Embarked         3
dtype: int64



Tipos das colunas:
PassengerId      int64
Survived         int64
Pclass          object
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare            object
Cabin           object
Embarked        object
dtype: object


# Eliminando as colunas desnecessárias



In [16]:
# Remove the columns this section marks as unnecessary.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# the cell idempotent: re-running it no longer raises KeyError once the
# columns are already gone.
df = df.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'], errors='ignore')


In [17]:
# Fill the gaps: Age receives the column median, Embarked receives 'S'
# (the most frequent port in the value_counts output shown earlier in the notebook).
median_age = df["Age"].median()
fill_values = {"Embarked": 'S', 'Age': median_age}
df = df.fillna(value=fill_values)


In [None]:
# Fraction of missing values per column.
# The mean of the boolean mask divides by the total number of rows. The earlier
# nulls / non-null-count ratio overstated the share of missing data and divided
# by zero for a column that is entirely null.
df.isna().mean()

Survived    0.0
Pclass      0.0
Sex         0.0
Age         0.0
SibSp       0.0
Parch       0.0
Fare        0.0
Embarked    0.0
dtype: float64

# A classe LabelEncoder
A classe `LabelEncoder` é utilizada para codificar a variável alvo. Neste dataset, ela substitui os valores 'No' e 'Yes' por 0 e 1, respectivamente.

In [None]:
# Encode the target column 'Survived' as integer labels.
le = preprocessing.LabelEncoder()
survived_codes = le.fit_transform(df["Survived"])
df["Survived"] = survived_codes
# Show the first rows with the encoded target.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,Third,male,22.0,1,0,7.25,S
1,1,First,female,38.0,1,0,712.833,C
2,1,Third,female,26.0,0,0,7.925,S
3,1,First,female,35.0,1,0,53.1,S
4,0,Third,male,35.0,0,0,8.05,S


# A classe OrdinalEncoder

In [None]:
# Explicit category order so the integer codes follow the class ranking
# (position in the list becomes the code: First=0, Second=1, Third=2).
class_order = ['First', 'Second', 'Third']
oe = preprocessing.OrdinalEncoder(categories=[class_order], dtype=np.int64)
pclass_codes = oe.fit_transform(df[["Pclass"]])
df[["Pclass"]] = pclass_codes
# Show the first five rows with the encoded class.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,No,2,male,22.0,1,0,7.25,S
1,Yes,0,female,38.0,1,0,712.833,C
2,Yes,2,female,26.0,0,0,7.925,S
3,Yes,0,female,35.0,1,0,53.1,S
4,No,2,male,35.0,0,0,8.05,S


# A classe OneHotEncoder

In [18]:
#df_dummy = df.copy()
ohe = preprocessing.OneHotEncoder( sparse_output=False)
ohe.set_output(transform='pandas')
#df[["Sex_female","Sex_male","Embarked_C", "Embarked_Q","Embarked_S"]]= ohe.fit_transform(df[["Sex","Embarked"]])
df_dummy = ohe.fit_transform(df[["Sex","Embarked"]])
df_dummy.head()


Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0


# O método pandas.get_dummies

In [12]:
# pandas alternative: replace Sex and Embarked with indicator columns while
# keeping every other column of df in the result.
dummy_columns = ['Sex', 'Embarked']
dummy_df = pd.get_dummies(data=df, columns=dummy_columns)
dummy_df.head()


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,No,Third,22.0,1,0,7.25,0,1,0,0,1
1,Yes,First,38.0,1,0,712.833,1,0,1,0,0
2,Yes,Third,26.0,0,0,7.925,1,0,0,0,1
3,Yes,First,35.0,1,0,53.1,1,0,0,0,1
4,No,Third,35.0,0,0,8.05,0,1,0,0,1


In [20]:
# Same encoding without set_output: fit_transform returns a NumPy array, so the
# DataFrame is assembled by hand from the generated feature names.
ohe = preprocessing.OneHotEncoder(sparse_output=False)
dados = ohe.fit_transform(df[["Sex", "Embarked"]])
# Reuse df's index so the encoded rows stay aligned with df, matching what
# set_output(transform='pandas') produces; the default RangeIndex would
# misalign on a later join/concat if df's rows had been filtered or reordered.
df1 = pd.DataFrame(dados, columns=ohe.get_feature_names_out(), index=df.index)
df1.head()

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0
