In [1]:

# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
import scipy.stats as stats
from scipy.stats import chi2_contingency, ttest_ind

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('../archivos/ABCcorporation2024.csv', index_col=[0])
df.head()

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,roledepartament,remotework
0,52,no,non-travel,2015.722222,,6,3,,1,1,M,,3,5,research director,3,,16280.83,42330.17,7,no,13,3.0,3,full time,0,,5,3.0,20,,15,15,1972,195370.0,,
1,53,no,non-travel,2063.388889,,1,4,life sciences,2,3,M,,2,5,manager,3,,16665.83,43331.17,0,,14,3.0,1,,1,34.0,5,3.0,33,,11,9,1971,199990.0,,si
2,43,no,travel_rarely,1984.253968,research & development,4,2,technical degree,3,3,M,,3,5,manager,4,married,16026.67,41669.33,1,no,11,3.0,4,,0,22.0,3,,22,,11,15,1981,192320.0,manager - research & development,si
3,48,no,travel_rarely,1771.404762,,2,4,medical,4,1,F,,3,4,research director,3,married,14307.5,37199.5,3,,19,3.0,2,full time,2,,2,,20,,5,6,1976,171690.0,,
4,47,no,non-travel,1582.771346,,3,3,technical degree,5,1,F,,4,4,sales executive,1,divorced,12783.92,33238.2,2,no,12,3.0,4,,1,,5,3.0,19,,2,8,1977,153407.04,,no


In [3]:
# Informacion general 
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       1614 non-null   int64  
 1   attrition                 1614 non-null   object 
 2   businesstravel            1614 non-null   object 
 3   dailyrate                 1614 non-null   float64
 4   department                302 non-null    object 
 5   distancefromhome          1614 non-null   int64  
 6   education                 1614 non-null   int64  
 7   educationfield            869 non-null    object 
 8   employeenumber            1614 non-null   int64  
 9   environmentsatisfaction   1614 non-null   int64  
 10  gender                    1614 non-null   object 
 11  hourlyrate                404 non-null    float64
 12  jobinvolvement            1614 non-null   int64  
 13  joblevel                  1614 non-null   int64  
 14  jobrole      

In [None]:
# Nulos
nulos= df.isnull().sum()

# % Nulos
print(f'% Nulos: {(nulos/df.shape[0]*100).round(2)}')

% Nulos: age                          0.00
attrition                    0.00
businesstravel               0.00
dailyrate                    0.00
department                  81.29
distancefromhome             0.00
education                    0.00
educationfield              46.16
employeenumber               0.00
environmentsatisfaction      0.00
gender                       0.00
hourlyrate                  74.97
jobinvolvement               0.00
joblevel                     0.00
jobrole                      0.00
jobsatisfaction              0.00
maritalstatus               40.33
monthlyincome                0.00
monthlyrate                  0.00
numcompaniesworked           0.00
overtime                    41.88
percentsalaryhike            0.00
performancerating           12.08
relationshipsatisfaction     0.00
standardhours               20.94
stockoptionlevel             0.00
totalworkingyears           32.59
trainingtimeslastyear        0.00
worklifebalance              6.69
years

In [6]:
# principales estadisticos var numericas 
display(df.describe().T.round(2))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1614.0,37.92,9.1,19.0,31.0,37.0,44.0,61.0
dailyrate,1614.0,669.63,472.35,104.1,290.04,556.26,967.31,2063.39
distancefromhome,1614.0,11.14,10.45,1.0,2.0,8.0,17.0,49.0
education,1614.0,2.93,1.02,1.0,2.0,3.0,4.0,5.0
employeenumber,1614.0,807.5,466.07,1.0,404.25,807.5,1210.75,1614.0
environmentsatisfaction,1614.0,2.69,1.11,1.0,2.0,3.0,4.0,4.0
hourlyrate,404.0,83.04,57.55,13.01,36.25,69.53,114.16,255.96
jobinvolvement,1614.0,2.74,0.71,1.0,2.0,3.0,3.0,4.0
joblevel,1614.0,2.07,1.1,1.0,1.0,2.0,3.0,5.0
jobsatisfaction,1614.0,2.74,1.11,1.0,2.0,3.0,4.0,4.0


In [7]:
# principales estadisticos var categoricas
display(df.describe(include = "object").T.round(2))

Unnamed: 0,count,unique,top,freq
attrition,1614,2,no,1355
businesstravel,1614,3,non-travel,863
department,302,3,research & development,196
educationfield,869,6,life sciences,349
gender,1614,2,M,971
jobrole,1614,9,sales executive,369
maritalstatus,963,3,married,439
overtime,938,2,no,682
standardhours,1276,2,part time,888
roledepartament,302,11,sales executive - sales,69


# Hipótesis 1: Relación entre la satisfacción en el trabajo y la rotación de empleados ✨
- EnvironmentSatisfaction 
Ver la relacion entre ambas (lineal o no lineal) -> grafico de regresion y correlacion de pearson (lineal) o correlacion de spearman (no lineal) para cuantificar la relación entre ambas

# Hipótesis 4: Relación entre la edad/genero-la rotación de empleados ✨
Ver distribucion de la rotacion de empleados por genero e incluir la edad
