In [1]:
# Data handling
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualisation
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution tests (Shapiro-Wilk, Kolmogorov-Smirnov): these check normality / goodness of fit, not linearity
# ------------------------------------------------------------------------------
from scipy.stats import shapiro, kstest

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when a DataFrame is displayed

# Warning handling
# NOTE(review): this filter silences every warning for the whole session — consider narrowing it to specific categories.
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the cleaned dataset and preview its first five rows (five is the default for head()).
df = pd.read_csv('../files/raw_data_limpio.csv')

df.head()

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,51,no,non-travel,2015.72,research & development,6,3,unknown,1,1,m,251.965,3,5,research director,3,unknown,16280.83,42330.17,7,no,13,3.0,3,full time,0,,5,3.0,20,15,15,1972,195370.0,yes
1,52,no,non-travel,2063.39,unknown,1,4,life sciences,2,3,m,257.92375,2,5,manager,3,unknown,16665.833333,43331.17,0,unknown,14,3.0,1,full time,1,34.0,5,3.0,33,11,9,1971,199990.0,yes
2,42,no,travel_rarely,1984.25,unknown,4,2,technical degree,3,3,m,248.03125,3,5,manager,4,married,16026.666667,41669.33,1,no,11,3.0,4,full time,0,22.0,3,3.0,22,11,15,1981,192320.0,yes
3,47,no,travel_rarely,1771.4,research & development,2,4,medical,4,1,f,221.425,3,4,research director,3,married,14307.5,37199.5,3,unknown,19,3.0,2,full time,2,,2,3.0,20,5,6,1976,171690.0,no
4,46,no,non-travel,1582.77,sales,3,3,technical degree,5,1,f,197.84625,4,4,sales executive,1,divorced,12783.92,33238.2,2,no,12,3.0,4,full time,1,,5,3.0,19,2,8,1977,153407.04,no


# Resumen

## Notas importantes 2º EDA: 

~~Las columnas de `age` y `attrition` no aparecen en el csv.~~

~~Porcentaje de valores nulos elevados en las columnas: `department`, `educationfield`, `hourlyrate`, `overtime`.~~

- ~~Volver a introducir columnas `age` y `attrition`.~~
- ~~Transformar datos en columna `age`.~~
- ~~`distancefromhome`, valores aún en negativo.~~
- ~~`monthlyincome`, `monthlyrate`, `performancerating`, `totalworkingyears`, `worklifebalance`, `salary` ahora solo dan valores nulos.~~
- ~~¿Los nulos de `overtime` deberían rellenarse (`fillna`) con 'no' o con 'unknown'?~~
- ~~Limpiar/sacar información de `roledepartment` porque está destinada a eliminación.~~
- ~~`remotework`: datos aún sin unificar.~~
- ~~`employeecount` solo se especifica que se ha eliminado en el índice.~~
- ~~`employeenumber` solo se especifica que se ha actualizado en el índice.~~
- ~~`maritalstatus`: no se especifica que se han actualizado los valores nulos a 'unknown'.~~
- ~~`over18`, `yearsincurrentrole`, `sameasmonthlyincome`, `numberchildren`: no se especifica que la columna ha sido eliminada.~~
- ~~Otras columnas a revisar en la memoria: `environmentsatisfaction`, `gender`, `hourlyrate`, `jobinvolvement`, `joblevel`, `jobsatisfaction`, `relationshipsatisfaction`, `standardhours`, `numcompaniesworked`.~~




In [3]:
# Confirm that 'employeecount', 'sameasmonthlyincome', 'numberchildren', 'over18' and 'yearsincurrentrole' have been dropped:

df.columns


# Resolved: 'age' and 'attrition' are both present in the column list recorded for this cell.

# Resolved: no 'roledepartment' column appears in the recorded column list, matching the project log ("memoria") that marks it as removed.

Index(['age', 'attrition', 'businesstravel', 'dailyrate', 'department',
       'distancefromhome', 'education', 'educationfield', 'employeenumber',
       'environmentsatisfaction', 'gender', 'hourlyrate', 'jobinvolvement',
       'joblevel', 'jobrole', 'jobsatisfaction', 'maritalstatus',
       'monthlyincome', 'monthlyrate', 'numcompaniesworked', 'overtime',
       'percentsalaryhike', 'performancerating', 'relationshipsatisfaction',
       'standardhours', 'stockoptionlevel', 'totalworkingyears',
       'trainingtimeslastyear', 'worklifebalance', 'yearsatcompany',
       'yearssincelastpromotion', 'yearswithcurrmanager', 'datebirth',
       'salary', 'remotework'],
      dtype='object')

- Comprobación columna 'age': 

In [4]:
# Resolved: 'age' is present in the loaded data (an earlier note said it was missing from the csv).
# The recorded output shows integer ages between 18 and 60, so no transformation seems to be pending — TODO confirm.
df['age'].unique()

array([51, 52, 42, 47, 46, 48, 59, 41, 56, 38, 55, 40, 58, 35, 45, 33, 36,
       34, 53, 43, 60, 32, 37, 49, 39, 50, 44, 30, 29, 31, 54, 57, 27, 28,
       26, 25, 24, 23, 22, 21, 20, 19, 18])

- Comprobación columna 'businesstravel': 

In [5]:
# Distinct values of 'businesstravel'. All correct; the project log ("memoria") was already up to date.

df['businesstravel'].unique()

array(['non-travel', 'travel_rarely', 'travel_frequently'], dtype=object)

- Comprobación columna 'dailyrate': 

In [6]:
# Distinct values of 'dailyrate'. All correct; the project log ("memoria") was already up to date.

df['dailyrate'].unique()

array([2015.72, 2063.39, 1984.25, 1771.4 , 1582.77, 1771.92, 1032.49,
        556.26, 1712.18, 1973.98, 2060.71, 1382.75, 1362.52, 2016.55,
       1861.37, 1985.7 , 1108.92, 1728.38,  639.79, 2047.5 ,  412.8 ,
       1077.87, 1657.4 ,  331.19, 1059.19, 1080.75,  635.76,  487.09,
        290.04,  476.15, 1749.74, 1063.32,  557.76,  608.94, 1119.96,
        610.17,  685.7 , 1441.66, 1374.29,  689.93,  488.53, 1431.24,
       1964.96, 1652.34,  991.82, 1187.54,  444.27, 1758.71,  522.79,
        380.92, 2047.71,  238.13,  592.94, 1075.29,  654.85,  242.25,
        316.95, 1288.65,  827.46, 1760.98,  922.69, 1988.38,  575.4 ,
        277.64,  763.8 ,  910.31,  369.26, 1834.34, 1879.12, 1400.8 ,
       1979.92, 1766.66, 1979.61, 1032.57,  824.16,  730.79,  487.29,
        351.52,  302.2 , 1241.29, 1591.67,  528.87,  992.44,  516.9 ,
       1963.72, 1073.02,  288.17,  585.41, 1980.64,  867.9 ,  345.43,
        547.75,  789.7 , 1321.46,  679.51,  259.9 , 1764.18, 1353.65,
        323.04,  458

- Comprobación columna 'department': 

In [7]:
# Distinct values of 'department'. Standardisation and spacing are correct; the project log ("memoria") was already up to date.

df['department'].unique()

array(['research & development', 'unknown', 'sales', 'human resources'],
      dtype=object)

In [8]:
# Frequency of each 'department' value (NaN excluded by default).
df['department'].value_counts()


department
research & development    987
sales                     459
unknown                   111
human resources            57
Name: count, dtype: int64

In [9]:
# Frequency of each 'department' value with NaN included (dropna=False), to expose the number of nulls.
# The recorded output has no NaN row; an 'unknown' category holds 111 rows.

df['department'].value_counts(dropna=False)

department
research & development    987
sales                     459
unknown                   111
human resources            57
Name: count, dtype: int64

In [10]:
# Share (%) of each 'department' value, NaN included, rounded to two decimals
# (used to gauge the proportion of nulls, as suggested by PILI).

df['department'].value_counts(dropna=False, normalize=True).mul(100).round(2)

department
research & development    61.15
sales                     28.44
unknown                    6.88
human resources            3.53
Name: proportion, dtype: float64

- Comprobación columna 'distancefromhome': 

In [11]:
# Distinct values of 'distancefromhome'. An earlier note said some values were still negative;
# the recorded output lists only positive integers (1 to 49), so that issue appears resolved.

df['distancefromhome'].unique()

array([ 6,  1,  4,  2,  3, 22, 25,  9,  7, 23, 10, 12, 14, 13, 15,  8, 42,
       28, 37,  5, 16, 35, 26, 24, 29, 17, 21, 18, 30, 27, 20, 31, 39, 11,
       19, 33, 34, 46, 36, 45, 47, 32, 41, 49, 48, 38, 43, 40, 44])

- Comprobación columna 'education': 

In [12]:
# Distinct values of 'education'. All correct; the project log ("memoria") was already up to date.

df['education'].unique()

array([3, 4, 2, 1, 5])

- Comprobación columna 'educationfield': 

In [13]:
# Distinct values of 'educationfield'. Standardisation and spacing are correct; the project log ("memoria") was already up to date.

df['educationfield'].unique()

array(['unknown', 'life sciences', 'technical degree', 'medical', 'other',
       'marketing', 'human resources'], dtype=object)

In [14]:
# Frequency of each 'educationfield' value (NaN excluded by default).
df['educationfield'].value_counts()


educationfield
unknown             745
life sciences       349
medical             276
marketing           104
technical degree     69
other                59
human resources      12
Name: count, dtype: int64

In [15]:
# Frequency of each 'educationfield' value with NaN included (dropna=False), to expose the number of nulls.
# The recorded output has no NaN row; 'unknown' is the most frequent category with 745 rows.

df['educationfield'].value_counts(dropna=False)

educationfield
unknown             745
life sciences       349
medical             276
marketing           104
technical degree     69
other                59
human resources      12
Name: count, dtype: int64

In [16]:
# Share (%) of each 'educationfield' value, NaN included, rounded to two decimals
# (used to gauge the proportion of nulls, as suggested by PILI).

df['educationfield'].value_counts(dropna=False, normalize=True).mul(100).round(2)

educationfield
unknown             46.16
life sciences       21.62
medical             17.10
marketing            6.44
technical degree     4.28
other                3.66
human resources      0.74
Name: proportion, dtype: float64

- Comprobación columna 'employeecount': 

In [17]:
# Confirm that 'employeecount' was dropped; the project log ("memoria") did not yet record this removal.
# An executable assertion replaces the commented-out lookup so the check actually runs
# and fails loudly if the column ever reappears.

assert 'employeecount' not in df.columns, "'employeecount' should have been dropped"

- Comprobación columna 'employeenumber': 

In [18]:
# Distinct values of 'employeenumber'. All correct, zero duplicates; the project log ("memoria") had not been updated.

df['employeenumber'].unique()

array([   1,    2,    3, ..., 1612, 1613, 1614], shape=(1614,))

In [19]:
# Occurrences per employee number; every count should be 1 if the identifier is unique.
df['employeenumber'].value_counts()

employeenumber
1       1
2       1
3       1
4       1
5       1
       ..
1610    1
1611    1
1612    1
1613    1
1614    1
Name: count, Length: 1614, dtype: int64

In [20]:
# Number of rows in the dataset, to compare against the number of distinct employee numbers.
df.shape[0]

1614

In [21]:
# Count rows whose 'employeenumber' repeats an earlier row (0 means the identifier is unique).
df['employeenumber'].duplicated().sum()

np.int64(0)

- Comprobación columna 'environmentsatisfaction': 

In [22]:
# Distinct values of 'environmentsatisfaction'. Everything looks correct; the project log ("memoria") had not been updated.

df['environmentsatisfaction'].unique()

array([1, 3, 4, 2, 5])

- Comprobación columna 'gender': 

In [23]:
# Distinct values of 'gender'. All correct; the project log ("memoria") had not been updated.

df['gender'].unique()

array(['m', 'f'], dtype=object)

- Comprobación columna 'hourlyrate': 

In [24]:
# Distinct values of 'hourlyrate'. Original note: rounding is correct but many nulls remain; project log ("memoria") not updated.
# NOTE(review): the recorded value_counts(dropna=False) output for this column shows no NaN among its
# most frequent values, so the "many nulls" remark may be outdated — confirm.

df['hourlyrate'].unique()

array([251.965  , 257.92375, 248.03125, 221.425  , 197.84625, 221.49   ,
       129.06125,  69.53   , 214.0225 , 246.7475 , 257.58875, 172.84   ,
       170.315  , 252.06875, 232.67125, 248.2125 ,  69.5325 , 138.615  ,
       216.05   ,  79.97   , 129.06   , 255.9375 ,  51.6    , 134.73375,
       246.75   , 207.17   ,  41.39875, 132.39875, 135.09375,  79.47   ,
        60.89   ,  36.25   ,  59.51875, 218.7175 ,  36.255  , 132.915  ,
        69.72   ,  76.1175 , 140.     ,  76.27125,  85.7125 , 180.2075 ,
       171.78625,  86.24   , 197.85   ,  61.07   , 178.905  , 245.62   ,
       206.5425 , 123.98   , 148.44   ,  55.53375, 219.83875,  65.34875,
        47.615  , 255.96   ,  29.76625,  74.1175 , 134.41   ,  81.85625,
        30.28125,  39.61875, 161.08125, 103.4325 , 220.1225 , 115.33625,
       248.5475 ,  71.93   ,  34.705  ,  95.475  , 113.79   ,  46.1575 ,
       229.2925 , 234.89   , 175.1    , 247.49   , 220.83   , 247.45125,
       129.07125, 103.02   ,  91.35   ,  60.91125, 

In [25]:
# Frequency of each 'hourlyrate' value (NaN excluded by default).
df['hourlyrate'].value_counts()

hourlyrate
69.53250     243
36.25500     220
129.06125    101
36.25000      75
69.53000      74
            ... 
247.48000      1
143.19375      1
67.42000       1
34.82000       1
139.86625      1
Name: count, Length: 707, dtype: int64

In [26]:
# Frequency of each 'hourlyrate' value with NaN included (dropna=False).
# The display is truncated, so a NaN row would only be visible if it were among the first or last five entries.
df['hourlyrate'].value_counts(dropna=False)

hourlyrate
69.53250     243
36.25500     220
129.06125    101
36.25000      75
69.53000      74
            ... 
247.48000      1
143.19375      1
67.42000       1
34.82000       1
139.86625      1
Name: count, Length: 707, dtype: int64

In [27]:
# Share (%) of each 'hourlyrate' value, NaN included, rounded to two decimals
# (used to gauge the proportion of nulls, as suggested by PILI).

df['hourlyrate'].value_counts(dropna=False, normalize=True).mul(100).round(2)

hourlyrate
69.53250     15.06
36.25500     13.63
129.06125     6.26
36.25000      4.65
69.53000      4.58
             ...  
247.48000     0.06
143.19375     0.06
67.42000      0.06
34.82000      0.06
139.86625     0.06
Name: proportion, Length: 707, dtype: float64

- Comprobación columna 'jobinvolvement': 

In [28]:
# Distinct values of 'jobinvolvement'. All correct; the project log ("memoria") had not been updated.

df['jobinvolvement'].unique()

array([3, 2, 4, 1])

- Comprobación columna 'joblevel': 

In [29]:
# Distinct values of 'joblevel'. All correct; the project log ("memoria") had not been updated.

df['joblevel'].unique()

array([5, 4, 3, 2, 1])

- Comprobación columna 'jobrole': 

In [30]:
# Distinct values of 'jobrole'. Standardisation and spacing are correct; the project log ("memoria") is up to date.

df['jobrole'].unique()

array(['research director', 'manager', 'sales executive',
       'manufacturing director', 'research scientist',
       'healthcare representative', 'laboratory technician',
       'sales representative', 'human resources'], dtype=object)

- Comprobación columna 'jobsatisfaction': 

In [31]:
# Distinct values of 'jobsatisfaction'. All correct; the project log ("memoria") had not been updated.

df['jobsatisfaction'].unique()

array([3, 4, 1, 2])

- Comprobación columna 'maritalstatus': 

In [32]:
# Distinct values of 'maritalstatus'. Nulls were replaced by 'unknown'; standardisation and spacing are correct.
# The project log ("memoria") had not been updated: it does not mention that the nulls were replaced.

df['maritalstatus'].unique()

array(['unknown', 'married', 'divorced', 'single'], dtype=object)

In [33]:
# Frequency of each 'maritalstatus' value with NaN included (dropna=False).
df['maritalstatus'].value_counts(dropna=False)

maritalstatus
unknown     651
married     439
single      325
divorced    199
Name: count, dtype: int64

- Comprobación columna 'monthlyincome': 

In [34]:
# Distinct values of 'monthlyincome'. An earlier note said the column returned only nulls;
# the recorded (truncated) output shows numeric values, so that no longer holds.

df['monthlyincome'].unique()

array([16280.83      , 16665.83333333, 16026.66666667, 14307.5       ,
       12783.92      , 14311.67      ,  8339.32      ,  4492.8425    ,
       13829.17      , 15943.72      , 16644.17      , 11168.33333333,
       11005.        , 16287.5       , 15034.17      , 16038.33      ,
        4492.84      ,  8956.67      , 13960.        ,  5167.5       ,
       16537.5       ,  3334.17      ,  8705.83      , 13386.66666667,
        2675.        ,  8555.        ,  8729.17      ,  5135.        ,
        3934.17      ,  2342.59416667,  3845.83      , 14132.5       ,
        8588.33      ,  4505.        ,  4918.33      ,  9045.83333333,
        4928.33      ,  5538.33      , 11644.16666667, 11100.        ,
        5572.5       ,  3945.83333333, 11560.        , 15870.83      ,
       13345.83      ,  8010.83      ,  9591.66666667,  3588.33      ,
       14205.        ,  4222.5       ,  3076.67      , 16539.16666667,
        1923.33333333, 12783.9225    ,  2342.59      , 15943.71833333,
      

In [35]:
# Frequency of each 'monthlyincome' value with NaN included (dropna=False).
# The display is truncated, so a NaN row would only be visible if it were among the first or last five entries.
df['monthlyincome'].value_counts(dropna=False)

monthlyincome
4492.840000     224
2342.590000     219
8339.320000     139
4492.842500      93
2342.594167      76
               ... 
1846.666667       1
3536.666667       1
1940.833333       1
3578.330000       1
13960.000000      1
Name: count, Length: 715, dtype: int64

In [36]:
# Share (%) of each 'monthlyincome' value, NaN included, rounded to two decimals
# (used to gauge the proportion of nulls, as suggested by PILI).

df['monthlyincome'].value_counts(dropna=False, normalize=True).mul(100).round(2)

monthlyincome
4492.840000     13.88
2342.590000     13.57
8339.320000      8.61
4492.842500      5.76
2342.594167      4.71
                ...  
1846.666667      0.06
3536.666667      0.06
1940.833333      0.06
3578.330000      0.06
13960.000000     0.06
Name: proportion, Length: 715, dtype: float64

- Comprobación columna 'monthlyrate': 

In [37]:
# Distinct values of 'monthlyrate'. An earlier note said the column returned only nulls;
# the recorded (truncated) output shows numeric values, so that no longer holds.

df['monthlyrate'].unique()

array([42330.17, 43331.17, 41669.33, 37199.5 , 33238.2 , 37210.33,
       21682.23, 11681.39, 35955.83, 41453.67, 43274.83, 29037.67,
       28613.  , 42347.5 , 39088.83, 41699.67, 23287.33, 36296.  ,
       13435.5 , 42997.5 ,  8668.83, 22635.17, 34805.33,  6955.  ,
       22243.  , 22695.83, 13351.  , 10228.83,  6090.75,  9999.17,
       36744.5 , 22329.67, 11713.  , 12787.67, 23519.17, 12813.67,
       14399.67, 30274.83, 28860.  , 14488.5 , 10259.17, 30056.  ,
       41264.17, 34699.17, 20828.17, 24938.33,  9329.67, 36933.  ,
       10978.5 ,  7999.33, 43001.83,  5000.67, 12451.83, 22581.  ,
       13751.83,  5087.33,  6656.  , 27061.67, 17376.67, 36980.67,
       19376.5 , 41756.  , 12083.5 ,  5830.5 , 16039.83, 19116.5 ,
        7754.5 , 38521.17, 39461.5 , 29416.83, 41578.33, 37099.83,
       41571.83, 21684.  , 17307.33, 15346.5 , 10233.17,  7381.83,
        6346.17, 26067.17, 33425.17, 11106.33, 20841.17, 10855.  ,
       41238.17, 22533.33,  6051.5 , 12293.67, 41593.5 , 18226

In [38]:
# Frequency of each 'monthlyrate' value with NaN included (dropna=False).
# The display is truncated, so a NaN row would only be visible if it were among the first or last five entries.
df['monthlyrate'].value_counts(dropna=False)

monthlyrate
11681.39    317
6090.75     295
21682.23    139
33238.20     55
41453.67     37
           ... 
11160.50      1
6203.17       1
6192.33       1
2446.17       1
11713.00      1
Name: count, Length: 673, dtype: int64

In [39]:
# Share (%) of each 'monthlyrate' value, NaN included, rounded to two decimals
# (used to gauge the proportion of nulls, as suggested by PILI).

df['monthlyrate'].value_counts(dropna=False, normalize=True).mul(100).round(2)

monthlyrate
11681.39    19.64
6090.75     18.28
21682.23     8.61
33238.20     3.41
41453.67     2.29
            ...  
11160.50     0.06
6203.17      0.06
6192.33      0.06
2446.17      0.06
11713.00     0.06
Name: proportion, Length: 673, dtype: float64

- Comprobación columna 'numcompaniesworked': 

In [40]:
# Distinct values of 'numcompaniesworked'. All correct; the project log ("memoria") had not been updated.

df['numcompaniesworked'].unique()

array([7, 0, 1, 3, 2, 4, 8, 9, 5, 6])

- Comprobación columna 'over18': 

In [41]:
# Confirm that 'over18' was dropped; the project log ("memoria") did not yet record this removal.
# An executable assertion replaces the commented-out lookup so the check actually runs
# and fails loudly if the column ever reappears.

assert 'over18' not in df.columns, "'over18' should have been dropped"

- Comprobación columna 'overtime': 

In [42]:
# Distinct values of 'overtime'. Open question: check with PILI how the missing values should be handled.

df['overtime'].unique()

array(['no', 'unknown', 'yes'], dtype=object)

In [43]:
# Frequency of each 'overtime' value (NaN excluded by default).
df['overtime'].value_counts()

overtime
no         682
unknown    676
yes        256
Name: count, dtype: int64

In [44]:
# Frequency of each 'overtime' value with NaN included (dropna=False).
# The recorded output has no NaN row; an 'unknown' category holds 676 rows.
df['overtime'].value_counts(dropna=False)

overtime
no         682
unknown    676
yes        256
Name: count, dtype: int64

In [45]:
# Share (%) of each 'overtime' value, NaN included, rounded to two decimals
# (used to gauge the proportion of nulls, as suggested by PILI).

df['overtime'].value_counts(dropna=False, normalize=True).mul(100).round(2)

overtime
no         42.26
unknown    41.88
yes        15.86
Name: proportion, dtype: float64

- Comprobación columna 'percentsalaryhike': 

In [46]:
# Distinct values of 'percentsalaryhike'. All correct; the project log ("memoria") is up to date.

df['percentsalaryhike'].unique()

array([13, 14, 11, 19, 12, 25, 16, 17, 22, 23, 20, 15, 21, 24, 18])

- Comprobación columna 'performancerating': 

In [47]:
# Distinct values of 'performancerating'. An earlier note said the column returned only nulls;
# the recorded output shows 3.0, 4.0 and NaN, so the column is only partly missing.

df['performancerating'].unique()

array([ 3.,  4., nan])

In [48]:
# Frequency of each 'performancerating' value (NaN excluded by default).
df['performancerating'].value_counts()

performancerating
3.0    1205
4.0     214
Name: count, dtype: int64

In [49]:
# Frequency of each 'performancerating' value with NaN included (dropna=False), to expose the number of nulls.
df['performancerating'].value_counts(dropna=False)

performancerating
3.0    1205
4.0     214
NaN     195
Name: count, dtype: int64

In [50]:
# Share (%) of each 'performancerating' value, NaN included, rounded to two decimals
# (used to gauge the proportion of nulls, as suggested by PILI).

df['performancerating'].value_counts(dropna=False, normalize=True).mul(100).round(2)

performancerating
3.0    74.66
4.0    13.26
NaN    12.08
Name: proportion, dtype: float64

- Comprobación columna 'relationshipsatisfaction': 

In [51]:
# Distinct values of 'relationshipsatisfaction'. All correct; the project log ("memoria") had not been updated.

df['relationshipsatisfaction'].unique()

array([3, 1, 4, 2])

- Comprobación columna 'standardhours': 

In [52]:
# Distinct values of 'standardhours'. Standardisation is correct and nulls have been removed; the project log ("memoria") had not been updated.

df['standardhours'].unique()

array(['full time', 'part time'], dtype=object)

- Comprobación columna 'stockoptionlevel': 

In [53]:
# Distinct values of 'stockoptionlevel'. All correct; the project log ("memoria") is up to date.

df['stockoptionlevel'].unique()

array([0, 1, 2, 3])

- Comprobación columna 'totalworkingyears': 

In [54]:
# Distinct values of 'totalworkingyears'. An earlier note said only nulls appeared;
# the recorded output shows NaN alongside valid values from 0 to 40, so the column is only partly missing.

df['totalworkingyears'].unique()

array([nan, 34., 22., 28., 20., 21., 33., 40., 18., 25., 15., 17., 26.,
       16., 24., 14., 23., 27., 19., 11., 38., 37., 13., 12., 29., 10.,
       36., 35.,  9., 31., 32.,  8.,  7., 30.,  6.,  5.,  4.,  3.,  2.,
        1.,  0.])

- Comprobación columna 'trainingtimeslastyear': 

In [55]:
# Distinct values of 'trainingtimeslastyear'. All correct; the project log ("memoria") is up to date.

df['trainingtimeslastyear'].unique()

array([5, 3, 2, 0, 1, 4, 6])

- Comprobación columna 'worklifebalance': 

In [56]:
# Distinct values of 'worklifebalance'. An earlier note said only nulls appeared;
# the recorded output shows the values 1.0 to 4.0 and no NaN, so that issue appears resolved.

df['worklifebalance'].unique()

array([3., 2., 4., 1.])

- Comprobación columna 'yearsatcompany': 

In [57]:
# Distinct values of 'yearsatcompany'. All correct; the project log ("memoria") is up to date.

df['yearsatcompany'].unique()

array([20, 33, 22, 19, 21, 18, 24, 31, 26, 16, 23, 15, 17, 32, 14, 13, 25,
       12, 11, 37, 40, 36, 27, 29, 10,  9, 30,  8,  7, 34,  6,  5,  4,  2,
        3,  1,  0])

- Comprobación columna 'yearsincurrentrole': 

In [58]:
# Confirm that 'yearsincurrentrole' was dropped; the project log ("memoria") did not yet record this removal.
# An executable assertion replaces the commented-out lookup so the check actually runs
# and fails loudly if the column ever reappears.

assert 'yearsincurrentrole' not in df.columns, "'yearsincurrentrole' should have been dropped"

- Comprobación columna 'yearssincelastpromotion': 

In [59]:
# Distinct values of 'yearssincelastpromotion'. All correct; the project log ("memoria") was already up to date.

df['yearssincelastpromotion'].unique()

array([15, 11,  5,  2,  4,  7,  0,  1, 13, 14,  8, 12,  3,  6, 10,  9])

- Comprobación columna 'yearswithcurrmanager': 

In [60]:
# Distinct values of 'yearswithcurrmanager'. All correct; the project log ("memoria") was already up to date.

df['yearswithcurrmanager'].unique()

array([15,  9,  6,  8,  7, 11, 10, 12,  4,  0,  5, 17,  2, 14,  1, 13,  3,
       16])

- Comprobación columna 'sameasmonthlyincome': 

In [61]:
# Confirm that 'sameasmonthlyincome' was dropped; the project log ("memoria") did not yet record this removal.
# An executable assertion replaces the commented-out lookup so the check actually runs
# and fails loudly if the column ever reappears.

assert 'sameasmonthlyincome' not in df.columns, "'sameasmonthlyincome' should have been dropped"

- Comprobación columna 'datebirth': 

In [62]:
# Distinct values of 'datebirth'. All correct; the project log ("memoria") was already up to date.

df['datebirth'].unique()

array([1972, 1971, 1981, 1976, 1977, 1975, 1964, 1982, 1967, 1985, 1968,
       1983, 1965, 1988, 1978, 1990, 1987, 1989, 1970, 1980, 1963, 1991,
       1986, 1974, 1984, 1973, 1979, 1993, 1994, 1992, 1969, 1966, 1996,
       1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005])

- Comprobación columna 'salary': 

In [63]:
# Distinct values of 'salary'. An earlier note said only nulls appeared;
# the recorded (truncated) output shows numeric values, so that no longer holds.

df['salary'].unique()

array([ 195370.  ,  199990.  ,  192320.  ,  171690.  ,  153407.04,
        171740.04,  100071.84,   53914.11,  165950.  ,  191324.64,
        199730.  ,  134020.  ,  132060.  ,  195450.  ,  180410.  ,
        192460.  ,   53914.08,  107480.  ,  167520.  ,   62010.  ,
        198450.  ,   40010.  ,  104470.  ,  191324.62,  160640.  ,
         32100.  ,  102660.  ,  104750.  ,   61620.  ,   47210.  ,
        153407.07,   28111.13,   46150.  ,  169590.  ,  103059.96,
         54060.  ,   59020.  ,  108550.  ,   59140.  ,   66460.  ,
        139730.  ,  133200.  ,   66870.  ,   47350.  ,  138720.  ,
        190450.  ,  160149.96,   96130.  ,  115100.  ,   43059.96,
        170460.  ,   50670.  ,   36920.  ,  198470.  ,   23080.  ,
         57470.  ,  104220.  ,   63470.  ,   23480.  ,   30720.  ,
        124900.  ,   80200.  ,  170680.  ,   89430.  ,  192720.  ,
         55770.  ,   26910.  ,   74030.  ,   88230.  ,   35790.  ,
        177790.  ,  182130.  ,  135770.  ,  191900.  ,  171230

- Comprobación columna 'roledepartament': 

In [64]:
# 'roledepartament' was scheduled for removal; confirm it is gone.
# The notebook spells the column both 'roledepartament' and 'roledepartment', so both spellings are checked.
# An executable assertion replaces the commented-out lookup so the check actually runs.

assert not {'roledepartament', 'roledepartment'} & set(df.columns), "role/department column should have been dropped"

- Comprobación columna 'numberchildren': 

In [65]:
# Confirm that 'numberchildren' was dropped; the project log ("memoria") did not yet record this removal.
# An executable assertion replaces the commented-out lookup so the check actually runs
# and fails loudly if the column ever reappears.

assert 'numberchildren' not in df.columns, "'numberchildren' should have been dropped"

- Comprobación columna 'remotework': 

In [66]:
# Distinct values of 'remotework'. An earlier note said the values were not yet unified;
# the recorded output shows only 'yes' and 'no', so that issue appears resolved.

df['remotework'].unique()

array(['yes', 'no'], dtype=object)