# Transformation de données catégoriques

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import statistics
import numpy as np
import scipy.stats
import seaborn as sns

# Lift pandas' cap on displayed rows so long frames are shown in full.
pd.set_option("display.max_rows", None)

# Read the Titanic passenger table from the working directory and preview it.
data = pd.read_csv("titanic.csv")
data.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


# Encoder 1: Custom

In [2]:
data.loc[data['Sex'] == 'male', 'Sex2'] = 0
data.loc[data['Sex'] == 'female', 'Sex2'] = 1

In [4]:
# Preview the first five rows (head's default) to check the Sex2 column.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex2
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.0
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1.0
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.0
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.0
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1.0


In [6]:
# Keep only the object-dtype columns of `data` as the categorical candidates.
# The explicit copy makes obj_df independent, so later edits to it leave
# `data` untouched.
obj_df = data.select_dtypes(include="object").copy()

obj_df.head(n=3)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Kelly, Mr. James",male,330911,,Q
1,"Wilkes, Mrs. James (Ellen Needs)",female,363272,,S
2,"Myles, Mr. Thomas Francis",male,240276,,Q


In [7]:
# Show the column labels of the object-dtype subset built above.
obj_df.columns

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

# Label Encoder

In [15]:
# Label-encode Sex through a pandas Categorical: each distinct label gets a
# small integer code (in the recorded output, female -> 0 and male -> 1).
obj_df["Sex3"] = pd.Categorical(obj_df["Sex"]).codes

obj_df.head(n=3)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Sex3
0,"Kelly, Mr. James",male,330911,,Q,1
1,"Wilkes, Mrs. James (Ellen Needs)",female,363272,,S,0
2,"Myles, Mr. Thomas Francis",male,240276,,Q,1


In [21]:
# Print a per-column summary (non-null counts and dtypes) of obj_df.
# NOTE(review): the recorded output lists Sex as int64, which suggests this
# cell was executed after the replace-map cell further down (execution counts
# 16 -> 21). On a fresh top-to-bottom run Sex would presumably still be an
# object column at this point - confirm with Restart & Run All.
obj_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      418 non-null    object
 1   Sex       418 non-null    int64 
 2   Ticket    418 non-null    object
 3   Cabin     91 non-null     object
 4   Embarked  418 non-null    object
 5   Sex3      418 non-null    int8  
dtypes: int64(1), int8(1), object(4)
memory usage: 16.9+ KB


# Label encoding with scikit-learn's LabelEncoder

In [22]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the Sex labels and store the resulting integer codes in a
# new Sex4 column (in the recorded output, female -> 0 and male -> 1).
# The fitted encoder is kept in `labelEncoder` for later use.
labelEncoder = LabelEncoder()
data["Sex4"] = labelEncoder.fit_transform(data["Sex"])

data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex4
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0


# Replace Map

In [16]:
# Manual mapping from each Sex label to the integer that should replace it.
replace_map = {'Sex': {'male': 1, 'female': 2}}

# Rebind obj_df to the frame returned by replace() instead of mutating it with
# inplace=True; the resulting values are the same, but the cell no longer
# relies on in-place mutation of a frame that earlier cells already displayed.
# The Sex column is still overwritten with the codes, so cells that read the
# original labels from obj_df["Sex"] must run before this one.
obj_df = obj_df.replace(replace_map)

In [23]:
# Preview the first four rows to see the Sex column after the replacement.
obj_df.head(n=4)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Sex3
0,"Kelly, Mr. James",1,330911,,Q,1
1,"Wilkes, Mrs. James (Ellen Needs)",2,363272,,S,0
2,"Myles, Mr. Thomas Francis",1,240276,,Q,1
3,"Wirz, Mr. Albert",1,315154,,S,1


In [None]:
# Generate the replace_map automatically

In [20]:
# Collect the distinct Sex labels in category order, then number them from 1
# to build the same nested {column: {label: code}} structure as a hand-written
# replace map. In the recorded output this yields female -> 1 and male -> 2.
labelsSex = data['Sex'].astype('category').cat.categories.tolist()
replace_map_2 = {
    'Sex': {label: code for code, label in enumerate(labelsSex, start=1)}
}
replace_map_2

{'Sex': {'female': 1, 'male': 2}}

# One Hot Encoder

In [27]:
# One-hot encode Sex: get_dummies returns a new frame in which the Sex column
# is replaced by one indicator column per label (Sex_female, Sex_male in the
# recorded output); `data` itself is left unchanged.
data2 = pd.get_dummies(data, columns=['Sex'], prefix=['Sex'])

data2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex4,Sex_female,Sex_male
0,892,0,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,,Q,1,0,1
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,,S,0,1,0
2,894,0,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,,Q,1,0,1
3,895,0,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,,S,1,0,1
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,,S,0,1,0


In [29]:
# Binarize Sex with scikit-learn's LabelBinarizer.

from sklearn.preprocessing import LabelBinarizer

# Fit on the Sex labels and get the binarized array in one call.
lb = LabelBinarizer()
lb_results = lb.fit_transform(data["Sex"])

# Wrap the array in a DataFrame with a single "Sex" column and print the first
# five rows as plain text.
lb_results_df = pd.DataFrame(data=lb_results, columns=["Sex"])

print(lb_results_df.head(5))

   Sex
0    1
1    0
2    1
3    1
4    0
