In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

In [59]:
df = pd.read_csv('../data/trabajo1.csv')
df.head(3)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60.0,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50.0,0


In [4]:
target = df["is_promoted"]
df_data = df.copy()
df_data.drop(columns = ["is_promoted", "employee_id"], inplace = True);

In [5]:
df_data.shape[1]

11

In [6]:
df.shape[1]

13

**Proporción de promociones**

In [7]:
target.value_counts(normalize = True)

0    0.91483
1    0.08517
Name: is_promoted, dtype: float64

No está balanceada. Para ver errores, utilizaremos esta función:

$$ error(y, y_{est}) = \beta * FP + FN $$

Donde:

   * `FP`: false positive. Predicción es *promociona* pero en realidad *no promociona*
   * `FN`: false negative. Predicción es *no promociona* pero en realidad *promociona*

Nos importan $\beta$ veces más los falsos positivos que los falsos negativos. Es decir, predecir que alguien promociona cuando en realidad NO promociona lo consideramos peor que predecir que alguien NO promociona cuando en realidad va a promocionar.

Queremos maximizar la sensibilidad del modelo para evitar los falsos positivos:

$$ Sensitivity = \frac{(TP)}{(TP + FN)} $$

**Encoding variables**

Categorical variables:
   * department
   * region
   * education (has nan)
   * gender
   * recruitment_channel

In [8]:
 from sklearn import preprocessing

In [74]:
def encodeCategoricalVariable(variable_to_encode, dataframe, binary):
    '''
    
    Encodes a variable from the main df and returns an array with the encoded variables.

            Parameters:
                    variable_to_encode (string): name of the column from main df to encode
                    dataframe (DataFrame): dataframe with all the information
                    binary (boolean): True if a binary encoding is to be performed. False for integer encoding

            Returns:
                    encoded_variable (array): array with numerical values from encoded column. It can be 2D array if binary encoding is used
    '''
    if binary:
        le = preprocessing.LabelBinarizer() # Creamos un label encoder
    else:
        le = preprocessing.LabelEncoder() # Creamos un label encoder
    
    le.fit(dataframe[variable_to_encode])    # Lo ajustamos a la variable en cuestión
    encoded_variable = le.transform(dataframe[variable_to_encode]) # Transformamos la variable 
    
    return encoded_variable


In [71]:
def includeEncodedVariablesInDataframe(variable_that_was_encoded, dataframe_where_to_include, encoded_array):
    '''
    
    Includes an encoded variable in the specified dataframe.

            Parameters:
                    variable_that_was_encoded (string): name of the column that was encoded
                    dataframe_where_to_include (DataFrame): df where to include the encoded variable
                    encoded_array (array): array with the encoding of the  variable. It can be binary or not. It accepts 2D and 1D arrays

            Returns:
                    df_to_return (DataFrame): dataframe_where_to_include with the encoded variables included
    '''    
    
    # The shape of the encoded array shape tuple is 2 if it is a binary encoded variable with several categories
    # binary_variable will be True if encoded_array is a 2D array. It will be False if it is a 1D array
    binary_variable = (len(encoded_array.shape) == 2)
    
    if binary_variable:
        
        # We obtain the number of binary values (number of columns to add to the dataframe)
        num_binary_values = encoded_array.shape[1]
        
        # We create the names for the columns
        list_of_names = [ variable_that_was_encoded + "_" + str(index) for index in range(num_binary_values)]
        
        # We create the dataframe
        binary_dataframe = pd.DataFrame(data = encoded_array, columns = list_of_names)
        
        
        # We combine the dataframes
        df_to_return = dataframe_where_to_include.join(binary_dataframe, how='outer')
        
        # Return the final df
        return df_to_return
        
    else:
        dataframe_where_to_include[variable_that_was_encoded] = encoded_array
        df_to_return = dataframe_where_to_include.copy()
        
        return df_to_return
        

Example of use:

In [72]:
df_data.head(3)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score
0,7,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0
1,4,region_22,Bachelor's,m,other,1,30,5.0,4,0,60.0
2,7,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50.0


In [78]:
encoded_variable = encodeCategoricalVariable("department", df_data, True)

In [80]:
includeEncodedVariablesInDataframe("department", df_data, encoded_variable).head(3)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,department_0,department_1,department_2,department_3,department_4,department_5,department_6,department_7,department_8
0,7,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0,0,0,0,0,0,0,0,1,0
1,4,region_22,Bachelor's,m,other,1,30,5.0,4,0,60.0,0,0,0,0,1,0,0,0,0
2,7,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50.0,0,0,0,0,0,0,0,1,0


We build arrays for automating the functions

In [82]:
variables_to_encode = ["department", "region", "gender", "recruitment_channel"]
binary_encodings = [True, False, False, True]

In [83]:
df_data_encoded = df_data.copy()

for i in range(len(variables_to_encode)):
    variable = variables_to_encode[i]
    binary_encoding = binary_encodings[i]
    
    encoded_variable = encodeCategoricalVariable(variable, df_data_encoded, True)
    df_data_encoded = includeEncodedVariablesInDataframe(variable, df_data_encoded, encoded_variable).copy()

In [85]:
df_data_encoded.head(3)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,...,region_28,region_29,region_30,region_31,region_32,region_33,gender_0,recruitment_channel_0,recruitment_channel_1,recruitment_channel_2
0,7,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,...,0,0,0,1,0,0,0,0,0,1
1,4,region_22,Bachelor's,m,other,1,30,5.0,4,0,...,0,0,0,0,0,0,1,1,0,0
2,7,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,...,0,0,0,0,0,0,1,0,0,1


**Separación en test/train**

In [33]:
X_train, X_test, y_train, y_test = train_test_split(df_data,target,test_size = 0.33,random_state = 123) # Random state es para repetir siempre el mismo resultado

In [36]:
df.isna().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
awards_won                 0
avg_training_score      2560
is_promoted                0
dtype: int64