#### Pre-procesamiento de datos: Variables Dummy (ONE HOT ENCODING)

In [2]:
import pandas as pd
import numpy as np

In [17]:
dfhr = pd.read_csv(filepath_or_buffer = "/home/pedro/asdf/HRDataset_v14.csv")
dfhr.head()

Unnamed: 0,Employee_Name,EmpID,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,Salary,...,ManagerName,ManagerID,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,LastPerformanceReview_Date,DaysLateLast30,Absences
0,"Adinolfi, Wilson K",10026,0,0,1,1,5,4,0,62506,...,Michael Albert,22.0,LinkedIn,Exceeds,4.6,5,0,1/17/2019,0,1
1,"Ait Sidi, Karthikeyan",10084,1,1,1,5,3,3,0,104437,...,Simon Roup,4.0,Indeed,Fully Meets,4.96,3,6,2/24/2016,0,17
2,"Akinkuolie, Sarah",10196,1,1,0,5,5,3,0,64955,...,Kissy Sullivan,20.0,LinkedIn,Fully Meets,3.02,3,0,5/15/2012,0,3
3,"Alagbe,Trina",10088,1,1,0,1,5,3,0,64991,...,Elijiah Gray,16.0,Indeed,Fully Meets,4.84,5,0,1/3/2019,0,15
4,"Anderson, Carol",10069,0,2,0,5,5,3,0,50825,...,Webster Butler,39.0,Google Search,Fully Meets,5.0,4,0,2/1/2016,0,2


In [22]:
# Creación de dataset con columnas específicas

dfhr1 = dfhr[["Employee_Name", "Position", "State", "Sex", "Salary", "DOB", "MaritalDesc", "CitizenDesc", 
              "Department", "PerformanceScore", "EngagementSurvey", "EmpSatisfaction"]]
dfhr1.head()

Unnamed: 0,Employee_Name,Position,State,Sex,Salary,DOB,MaritalDesc,CitizenDesc,Department,PerformanceScore,EngagementSurvey,EmpSatisfaction
0,"Adinolfi, Wilson K",Production Technician I,MA,M,62506,07/10/83,Single,US Citizen,Production,Exceeds,4.6,5
1,"Ait Sidi, Karthikeyan",Sr. DBA,MA,M,104437,05/05/75,Married,US Citizen,IT/IS,Fully Meets,4.96,3
2,"Akinkuolie, Sarah",Production Technician II,MA,F,64955,09/19/88,Married,US Citizen,Production,Fully Meets,3.02,3
3,"Alagbe,Trina",Production Technician I,MA,F,64991,09/27/88,Married,US Citizen,Production,Fully Meets,4.84,5
4,"Anderson, Carol",Production Technician I,MA,F,50825,09/08/89,Divorced,US Citizen,Production,Fully Meets,5.0,4


#### Codificación en ONE HOT ENCODING utilizando la columna de sexo

In [23]:
dummy_sex = pd.get_dummies(dfhr1["Sex"], prefix = "sex")

dummy_sex.head()

Unnamed: 0,sex_F,sex_M
0,0,1
1,0,1
2,1,0
3,1,0
4,1,0


In [25]:
# Utilizamos drop para eliminar la columna "Sex", ya que no la vamos a utilizar. 
# El parámetro axis = 1 es para especificar que es una columna.

dfhr1 = dfhr1.drop(["Sex"], axis = 1)

KeyError: "['Sex'] not found in axis"

In [26]:
# Ahora, añadimos las columnas dummies con .concat:

dfhr2 = pd.concat([dfhr1, dummy_sex], axis = 1)
dfhr2.head()

Unnamed: 0,Employee_Name,Position,State,Salary,DOB,MaritalDesc,CitizenDesc,Department,PerformanceScore,EngagementSurvey,EmpSatisfaction,sex_F,sex_M
0,"Adinolfi, Wilson K",Production Technician I,MA,62506,07/10/83,Single,US Citizen,Production,Exceeds,4.6,5,0,1
1,"Ait Sidi, Karthikeyan",Sr. DBA,MA,104437,05/05/75,Married,US Citizen,IT/IS,Fully Meets,4.96,3,0,1
2,"Akinkuolie, Sarah",Production Technician II,MA,64955,09/19/88,Married,US Citizen,Production,Fully Meets,3.02,3,1,0
3,"Alagbe,Trina",Production Technician I,MA,64991,09/27/88,Married,US Citizen,Production,Fully Meets,4.84,5,1,0
4,"Anderson, Carol",Production Technician I,MA,50825,09/08/89,Divorced,US Citizen,Production,Fully Meets,5.0,4,1,0


#### Otro ejemplo, pero automatizado. Esta vez, utilizando la columna de la ciudadanía

In [27]:
# Utilizar una función para crear dummies, que convierte una variable categórica en variables cuantitativas

def createdummies(df, varname):
    dummy = pd.get_dummies(df[varname], prefix = varname)
    df = df.drop(varname, axis = 1)
    df = pd.concat([df, dummy], axis = 1)
    return df

In [29]:
# Llamar a la función

createdummies(dfhr2, "CitizenDesc").head(10)

Unnamed: 0,Employee_Name,Position,State,Salary,DOB,MaritalDesc,Department,PerformanceScore,EngagementSurvey,EmpSatisfaction,sex_F,sex_M,CitizenDesc_Eligible NonCitizen,CitizenDesc_Non-Citizen,CitizenDesc_US Citizen
0,"Adinolfi, Wilson K",Production Technician I,MA,62506,07/10/83,Single,Production,Exceeds,4.6,5,0,1,0,0,1
1,"Ait Sidi, Karthikeyan",Sr. DBA,MA,104437,05/05/75,Married,IT/IS,Fully Meets,4.96,3,0,1,0,0,1
2,"Akinkuolie, Sarah",Production Technician II,MA,64955,09/19/88,Married,Production,Fully Meets,3.02,3,1,0,0,0,1
3,"Alagbe,Trina",Production Technician I,MA,64991,09/27/88,Married,Production,Fully Meets,4.84,5,1,0,0,0,1
4,"Anderson, Carol",Production Technician I,MA,50825,09/08/89,Divorced,Production,Fully Meets,5.0,4,1,0,0,0,1
5,"Anderson, Linda",Production Technician I,MA,57568,05/22/77,Single,Production,Exceeds,5.0,5,1,0,0,0,1
6,"Andreola, Colby",Software Engineer,MA,95660,05/24/79,Single,Software Engineering,Fully Meets,3.04,3,1,0,0,0,1
7,"Athwal, Sam",Production Technician I,MA,59365,02/18/83,Widowed,Production,Fully Meets,5.0,4,0,1,0,0,1
8,"Bachiochi, Linda",Production Technician I,MA,47837,02/11/70,Single,Production,Fully Meets,4.46,3,1,0,0,0,1
9,"Bacong, Alejandro",IT Support,MA,50178,01/07/88,Divorced,IT/IS,Fully Meets,5.0,5,0,1,0,0,1


In [30]:
# práctica mía con la columna de estado marital

dummies_marital = createdummies(dfhr1, "MaritalDesc")
dummies_marital.head(15)

Unnamed: 0,Employee_Name,Position,State,Salary,DOB,CitizenDesc,Department,PerformanceScore,EngagementSurvey,EmpSatisfaction,MaritalDesc_Divorced,MaritalDesc_Married,MaritalDesc_Separated,MaritalDesc_Single,MaritalDesc_Widowed
0,"Adinolfi, Wilson K",Production Technician I,MA,62506,07/10/83,US Citizen,Production,Exceeds,4.6,5,0,0,0,1,0
1,"Ait Sidi, Karthikeyan",Sr. DBA,MA,104437,05/05/75,US Citizen,IT/IS,Fully Meets,4.96,3,0,1,0,0,0
2,"Akinkuolie, Sarah",Production Technician II,MA,64955,09/19/88,US Citizen,Production,Fully Meets,3.02,3,0,1,0,0,0
3,"Alagbe,Trina",Production Technician I,MA,64991,09/27/88,US Citizen,Production,Fully Meets,4.84,5,0,1,0,0,0
4,"Anderson, Carol",Production Technician I,MA,50825,09/08/89,US Citizen,Production,Fully Meets,5.0,4,1,0,0,0,0
5,"Anderson, Linda",Production Technician I,MA,57568,05/22/77,US Citizen,Production,Exceeds,5.0,5,0,0,0,1,0
6,"Andreola, Colby",Software Engineer,MA,95660,05/24/79,US Citizen,Software Engineering,Fully Meets,3.04,3,0,0,0,1,0
7,"Athwal, Sam",Production Technician I,MA,59365,02/18/83,US Citizen,Production,Fully Meets,5.0,4,0,0,0,0,1
8,"Bachiochi, Linda",Production Technician I,MA,47837,02/11/70,US Citizen,Production,Fully Meets,4.46,3,0,0,0,1,0
9,"Bacong, Alejandro",IT Support,MA,50178,01/07/88,US Citizen,IT/IS,Fully Meets,5.0,5,1,0,0,0,0
