In [53]:
import pandas as pd

# Relative path to the HR employee-attrition CSV file
BASE = "../datasets/HR-Employee-Attrition.csv"

# Load the CSV into the DataFrame used by the rest of the notebook
df = pd.read_csv(BASE)

#### Limpando a base de colunas com valores que não variam e removendo a coluna de identificação (EmployeeNumber)

In [54]:
# Remove the identifier column and every column whose value never varies.
# errors="ignore" keeps this cell re-runnable: on a second execution
# "EmployeeNumber" is already gone and the drop becomes a no-op instead of
# raising KeyError.
features = df.drop(columns="EmployeeNumber", errors="ignore")

# Count distinct values for all columns at once; a column with at most one
# distinct value carries no information.
unique_counts = features.nunique()
constant_columns = unique_counts[unique_counts <= 1].index.tolist()

# Report which columns are being removed (one name per line)
for c in constant_columns:
    print(c)

# Drop all constant columns in a single call rather than rebinding `df`
# while looping over its own columns.
df = features.drop(columns=constant_columns)

EmployeeCount
Over18
StandardHours


#### Separando nossos dados por variáveis categóricas ordenadas e não ordenadas

In [55]:

# Categorical columns with no inherent order
columns_categorical_not_ordered = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime'
]

# Categorical columns whose levels are ordered.
# Each column is listed exactly once ('JobInvolvement' used to appear twice,
# which made any loop over this list process that column two times).
columns_categorical_ordered = [
    'Education',
    'EnvironmentSatisfaction',
    'JobSatisfaction',
    'JobInvolvement',
    'JobLevel',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'WorkLifeBalance'
]

target_column = ['Attrition']

# Every column that is neither categorical nor the target is treated as numeric.
# The exclusion set is built once so the membership test is not recomputed
# for every column of the DataFrame.
non_numeric_columns = set(
    columns_categorical_ordered + columns_categorical_not_ordered + target_column
)

numeric_columns = [
    column
    for column in df.columns
    if column not in non_numeric_columns
]

numeric_columns

#### Visualizando a variação das nossas colunas categóricas

In [56]:
# Show the distinct values of each categorical column, unordered ones first
categorical_columns = columns_categorical_not_ordered + columns_categorical_ordered
for column in categorical_columns:
    print(f"{column}: {df[column].unique()}")

BusinessTravel: ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Department: ['Sales' 'Research & Development' 'Human Resources']
EducationField: ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
Gender: ['Female' 'Male']
JobRole: ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
MaritalStatus: ['Single' 'Married' 'Divorced']
OverTime: ['Yes' 'No']
Education: [2 1 4 3 5]
EnvironmentSatisfaction: [2 3 4 1]
JobSatisfaction: [4 2 3 1]
JobInvolvement: [3 2 4 1]
JobLevel: [2 1 3 4 5]
PerformanceRating: [3 4]
RelationshipSatisfaction: [1 4 2 3]
StockOptionLevel: [0 1 3 2]
WorkLifeBalance: [1 3 2 4]
JobInvolvement: [3 2 4 1]


### Convertendo nossas colunas para o tipo "category" do pandas:

Ao converter colunas para o tipo category do pandas ganhamos benefícios, dentre eles performance. Utilizando o `.info()`, podemos ver que nosso DataFrame ocupa 356 KB antes da conversão.

In [57]:
# Print the DataFrame summary (columns, non-null counts, dtypes) before any
# column is converted to the "category" dtype.
# NOTE(review): without memory_usage="deep" pandas only estimates the memory
# of object columns, so the reported size is a lower bound — confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EnvironmentSatisfaction   1470 non-null   int64 
 9   Gender                    1470 non-null   object
 10  HourlyRate                1470 non-null   int64 
 11  JobInvolvement            1470 non-null   int64 
 12  JobLevel                  1470 non-null   int64 
 13  JobRole                   1470 non-null   object
 14  JobSatisfaction         

In [59]:
# Cast every unordered categorical column to the pandas "category" dtype
# with a single astype call driven by a column -> dtype mapping.
category_dtypes = {column: "category" for column in columns_categorical_not_ordered}
df = df.astype(category_dtypes)

In [60]:
# Print the DataFrame summary again; the converted columns are now listed
# with the "category" dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Age                       1470 non-null   int64   
 1   Attrition                 1470 non-null   object  
 2   BusinessTravel            1470 non-null   category
 3   DailyRate                 1470 non-null   int64   
 4   Department                1470 non-null   category
 5   DistanceFromHome          1470 non-null   int64   
 6   Education                 1470 non-null   int64   
 7   EducationField            1470 non-null   category
 8   EnvironmentSatisfaction   1470 non-null   int64   
 9   Gender                    1470 non-null   category
 10  HourlyRate                1470 non-null   int64   
 11  JobInvolvement            1470 non-null   int64   
 12  JobLevel                  1470 non-null   int64   
 13  JobRole                   1470 non-null   catego

Nosso DataFrame passou a ter 287 KB, uma economia de 69 KB. Em datasets de maior escala, isso é evidentemente um ganho ainda mais significativo.

Além da economia de armazenamento, o processo de conversão para o tipo "category" muda outras coisas na nossa base: uma vez que mudamos a tipagem, estamos instanciando outra classe do pandas, que nos dará novos atributos e métodos que não tínhamos antes no tipo "object":

In [77]:
# Walk through every column that has the "category" dtype.
# The `categories` property returns the distinct categories attached to a
# column; `codes` exposes the integer codes stored for its values.
for column in df.select_dtypes(include="category").columns:
    print(
        column,
        f"{column}.cat.categories: {df[column].cat.categories}",
        f"{column}.cat.codes: {df[column].cat.codes.unique()}",
        "",
        sep="\n",
    )

BusinessTravel
BusinessTravel.cat.categories: Index(['Non-Travel', 'Travel_Frequently', 'Travel_Rarely'], dtype='object')
BusinessTravel.cat.codes: [2 1 0]

Department
Department.cat.categories: Index(['Human Resources', 'Research & Development', 'Sales'], dtype='object')
Department.cat.codes: [2 1 0]

EducationField
EducationField.cat.categories: Index(['Human Resources', 'Life Sciences', 'Marketing', 'Medical', 'Other',
       'Technical Degree'],
      dtype='object')
EducationField.cat.codes: [1 4 3 2 5 0]

Gender
Gender.cat.categories: Index(['Female', 'Male'], dtype='object')
Gender.cat.codes: [0 1]

JobRole
JobRole.cat.categories: Index(['Healthcare Representative', 'Human Resources', 'Laboratory Technician',
       'Manager', 'Manufacturing Director', 'Research Director',
       'Research Scientist', 'Sales Executive', 'Sales Representative'],
      dtype='object')
JobRole.cat.codes: [7 6 2 4 0 3 8 5 1]

MaritalStatus
MaritalStatus.cat.categories: Index(['Divorced', 'Married', 

### Entendendo o motivo do pandas conseguir economia de memória com o tipo "category"

In [70]:
# Listing the values of the "OverTime" column shows the text labels "Yes" / "No"

df["OverTime"].head()

0    Yes
1     No
2    Yes
3    Yes
4     No
Name: OverTime, dtype: category
Categories (2, object): ['No', 'Yes']

In [72]:
# Viewed through the "codes" property, the same values appear as integers:
# "No" is stored as 0 and "Yes" as 1. The int8 codes take less storage space
# than the text labels.

df["OverTime"].cat.codes.head()

0    1
1    0
2    1
3    1
4    0
dtype: int8