In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar la normalidad de las variables (tests de Shapiro-Wilk y Kolmogórov-Smirnov)
# ------------------------------------------------------------------------------
from scipy.stats import shapiro, kstest

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw dataset. index_col=0 was removed on the assumption that the file has no index column.
# NOTE(review): later displays of this frame show an "Unnamed: 0" column, which suggests the CSV
# does carry a saved index — confirm whether index_col=0 should be restored.
df = pd.read_csv("raw_data.csv")

In [3]:
# FALTA - limpiar/extraer la información de `roledepartament` antes de eliminar la columna (está destinada a eliminación)

**Transformamos: quitamos $ y cambiamos "," por "."**


`sameasmonthlyincome`, `salary`, `monthlyincome`, `monthlyrate`, `performancerating`, `worklifebalance`, `totalworkingyears`, `yearsincurrentrole`

In [4]:
lista_errores = []  # raw values that could not be parsed as a number


def cambiar_comas(dato):
    """Convert a currency-like value to ``float``.

    Strings have every "$" removed and every "," replaced by "." before
    being parsed. Values that are already numeric are returned as ``float``
    unchanged, so re-running the cell (or a column that pandas already
    parsed as numeric) does not wipe out valid data.

    Parameters
    ----------
    dato : str, int, float, NumPy number or None
        Raw cell value.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the value is missing or cannot
        be parsed.

    Notes
    -----
    Only values that are present but unparseable are appended to the
    module-level ``lista_errores``; missing values (``None`` / NaN) are not
    treated as errors.
    """
    if dato is None:
        return np.nan
    try:
        if isinstance(dato, str):
            # Drop the currency symbol first, then switch decimal comma to decimal point.
            return float(dato.replace("$", "").replace(",", "."))
        # bool is a subclass of int; it is not a meaningful amount, so it is logged as an error below.
        if isinstance(dato, (int, float, np.number)) and not isinstance(dato, bool):
            return float(dato)  # NaN passes through as NaN
    except (ValueError, OverflowError):
        pass
    lista_errores.append(dato)
    return np.nan


In [5]:
# Columns that are passed through cambiar_comas ("$" removed, "," -> ".") to become floats.
lista_col = [
    "sameasmonthlyincome",
    "salary",
    "monthlyincome",
    "monthlyrate",
    "performancerating",
    "worklifebalance",
    "totalworkingyears",
    "yearsincurrentrole",
]

In [6]:
# Convert each listed column element by element; the name is printed first as a progress trace.
for columna in lista_col:
    print(columna)
    convertida = df[columna].apply(cambiar_comas)
    df[columna] = convertida

sameasmonthlyincome
salary
monthlyincome
monthlyrate
performancerating
worklifebalance
totalworkingyears
yearsincurrentrole


In [7]:
# Show the converted columns; reuses lista_col instead of repeating the same eight names by hand.
df[lista_col]

Unnamed: 0,sameasmonthlyincome,salary,monthlyincome,monthlyrate,performancerating,worklifebalance,totalworkingyears,yearsincurrentrole
0,16280.83,195370.00,16280.83,42330.17,3.0,3.0,,
1,,199990.00,,43331.17,3.0,3.0,34.0,
2,,192320.00,,41669.33,3.0,,22.0,
3,14307.50,171690.00,14307.50,37199.50,3.0,,,
4,12783.92,,12783.92,33238.20,3.0,3.0,,
...,...,...,...,...,...,...,...,...
1673,3949.17,,3949.17,10267.83,3.0,3.0,,
1674,15943.72,191324.62,15943.72,41453.67,3.0,3.0,27.0,
1675,,28111.13,,6090.75,3.0,3.0,6.0,
1676,8339.32,100071.84,8339.32,21682.23,,3.0,,


**Transformamos: redondeamos a 2 decimales `hourlyrate`, `dailyrate`**

In [8]:
def redondear_dos_decimales(valor):
    """Return ``valor`` as a float rounded to two decimal places.

    Parameters
    ----------
    valor : object
        Anything ``float()`` accepts (number or numeric string).

    Returns
    -------
    float
        The rounded number, or ``np.nan`` when ``valor`` cannot be converted.
    """
    try:
        return round(float(valor), 2)
    # Only conversion failures map to NaN; a bare except would also swallow KeyboardInterrupt.
    except (TypeError, ValueError, OverflowError):
        return np.nan


In [9]:
# Rate columns that are rounded to two decimals.
columnas_a_redondear = [
    "hourlyrate",
    "dailyrate",
]

In [10]:
# Round each rate column element by element; the name is printed first as a progress trace.
for columna in columnas_a_redondear:
    print(columna)
    redondeada = df[columna].apply(redondear_dos_decimales)
    df[columna] = redondeada

hourlyrate
dailyrate


In [11]:
# Display the two rounded rate columns (all rows).
df.loc[:, ["dailyrate", "hourlyrate"]]

Unnamed: 0,dailyrate,hourlyrate
0,2015.72,
1,2063.39,
2,1984.25,
3,1771.40,
4,1582.77,
...,...,...
1673,488.94,
1674,1973.98,
1675,290.04,
1676,1032.49,


**Rellenar Rates que tienen NaN: `hourlyrate` (lo que se cobra al cliente), `salary` y `monthlyincome` (lo que gana el trabajador)**


In [12]:
# Number of missing values in monthlyrate.
pd.isna(df["monthlyrate"]).sum()

np.int64(0)

In [13]:
# Distinct monthlyrate values, in order of first appearance.
pd.unique(df["monthlyrate"])

array([42330.17, 43331.17, 41669.33, 37199.5 , 33238.2 , 37210.33,
       21682.23, 11681.39, 35955.83, 41453.67, 43274.83, 29037.67,
       28613.  , 42347.5 , 39088.83, 41699.67, 23287.33, 36296.  ,
       13435.5 , 42997.5 ,  8668.83, 22635.17, 34805.33,  6955.  ,
       22243.  , 22695.83, 13351.  , 10228.83,  6090.75,  9999.17,
       36744.5 , 22329.67, 11713.  , 12787.67, 23519.17, 12813.67,
       14399.67, 30274.83, 28860.  , 14488.5 , 10259.17, 30056.  ,
       41264.17, 34699.17, 20828.17, 24938.33,  9329.67, 36933.  ,
       10978.5 ,  7999.33, 43001.83,  5000.67, 12451.83, 22581.  ,
       13751.83,  5087.33,  6656.  , 27061.67, 17376.67, 36980.67,
       19376.5 , 41756.  , 12083.5 ,  5830.5 , 16039.83, 19116.5 ,
        7754.5 , 38521.17, 39461.5 , 29416.83, 41578.33, 37099.83,
       41571.83, 21684.  , 17307.33, 15346.5 , 10233.17,  7381.83,
        6346.17, 26067.17, 33425.17, 11106.33, 20841.17, 10855.  ,
       41238.17, 22533.33,  6051.5 , 12293.67, 41593.5 , 18226

In [14]:
# Number of missing values in monthlyincome.
pd.isna(df["monthlyincome"]).sum()

np.int64(489)

In [15]:
# Number of missing values in salary.
pd.isna(df["salary"]).sum()

np.int64(285)

In [16]:
# Frame dimensions as (rows, columns); no rows or columns have been dropped up to this point.
df.shape

(1678, 42)

In [17]:
# Random sample of the rate/income columns to eyeball how they relate to each other.
# random_state pins the sample so the same rows are shown after Restart & Run All.
df[["hourlyrate", "dailyrate", "monthlyrate", "salary", "monthlyincome"]].sample(20, random_state=42)

Unnamed: 0,hourlyrate,dailyrate,monthlyrate,salary,monthlyincome
1231,36.25,290.04,6090.75,28111.13,2342.59
32,246.75,1973.98,41453.67,191324.62,15943.72
388,197.85,1582.77,33238.2,153407.07,12783.92
1246,,1032.49,21682.23,,8339.32
761,,290.04,6090.75,28111.13,2342.59
132,,290.04,6090.75,28111.13,2342.59
327,,556.26,11681.39,53914.11,
921,,2013.25,42278.17,,16260.83
54,,556.26,11681.39,,4492.84
320,,865.84,18182.67,83920.0,


**Transformamos: modificamos columnas a minúscula**


`department`, `educationfield`, `attrition`, `jobrole`, `maritalstatus`, `over18`, `overtime`, `standardhours`, `roledepartament`, `remotework`

In [18]:
# Text columns whose string values are lower-cased.
cambios_a_minuscula = [
    "department",
    "educationfield",
    "attrition",
    "jobrole",
    "maritalstatus",
    "over18",
    "overtime",
    "standardhours",
    "roledepartament",
    "remotework",
]

In [19]:
# Lower-case string values only; anything that is not a string (e.g. NaN) is left untouched.
# isinstance is the idiomatic type check and also accepts str subclasses.
for col in cambios_a_minuscula:
    df[col] = df[col].apply(lambda dato: dato.lower() if isinstance(dato, str) else dato)

In [20]:
# Show the lower-cased columns; reuses cambios_a_minuscula instead of repeating the ten names by hand.
df[cambios_a_minuscula]

Unnamed: 0,department,educationfield,attrition,jobrole,maritalstatus,over18,overtime,standardhours,roledepartament,remotework
0,,,no,research director,,y,no,full time,,yes
1,,life sciences,no,manager,,,,,,1
2,research & development,technical degree,no,manager,married,,no,,manager - research & development,1
3,,medical,no,research director,married,y,,full time,,false
4,,technical degree,no,sales executive,divorced,y,no,,,0
...,...,...,...,...,...,...,...,...,...,...
1673,,medical,no,research scientist,single,,,,,yes
1674,,,no,manager,married,y,no,full time,,false
1675,,,no,research scientist,,,no,part time,,false
1676,,life sciences,no,manufacturing director,divorced,,yes,part time,,yes


**Transformamos: columna `maritalstatus`, 1 dato mal escrito**

In [21]:
# Correct the misspelled category; every other value is left as it is.
correcciones = {"marreid": "married"}
df["maritalstatus"] = df["maritalstatus"].replace(correcciones)

In [22]:
# Frequency of each marital status; missing values are excluded from the count.
df["maritalstatus"].value_counts(dropna=True)

maritalstatus
married     455
single      343
divorced    205
Name: count, dtype: int64

**Transformamos: valores NaN por 'non-travel' en columna ``businesstravel`` basados en la información que nos ha proporcionado nuestro enlace con el proyecto**

In [23]:
# Missing businesstravel values are recorded as "non-travel".
sin_dato = df["businesstravel"].isna()
df["businesstravel"] = df["businesstravel"].mask(sin_dato, "non-travel")

In [24]:
# Frequency of each businesstravel category after filling the gaps.
df["businesstravel"].value_counts(dropna=True)

businesstravel
non-travel           894
travel_rarely        616
travel_frequently    168
Name: count, dtype: int64

**Transformamos: espaciado sobrante en los valores de las columnas ``department``, ``jobrole`` y ``educationfield``**

In [25]:
# Distinct values before the whitespace clean-up (a separator follows the first column only).
print(df["department"].unique())
print('-------------')
for columna in ("jobrole", "educationfield"):
    print(df[columna].unique())

[nan ' research & development ' ' sales ' ' human resources ']
-------------
[' research director ' ' manager ' ' sales executive '
 ' manufacturing director ' ' research scientist '
 ' healthcare representative ' ' laboratory technician '
 ' sales representative ' ' human resources ']
[nan 'life sciences' 'technical degree' 'medical' 'other' 'marketing'
 'human resources']


In [26]:
def sin_espaciado_extra(series):
    """Trim each string in ``series`` and collapse inner whitespace runs to one space.

    Parameters
    ----------
    series : pandas.Series
        Series accessed through its ``.str`` accessor.

    Returns
    -------
    pandas.Series
        A new Series with the cleaned strings.
    """
    recortada = series.str.strip()
    compactada = recortada.str.replace(r'\s+', ' ', regex=True)
    return compactada

columnas_a_limpiar = ["department", "jobrole", "educationfield"]

# Listed columns that are not present in the frame are skipped instead of raising KeyError.
for nombre in (c for c in columnas_a_limpiar if c in df.columns):
    df[nombre] = sin_espaciado_extra(df[nombre])


In [27]:
# Distinct values after the whitespace clean-up, with a separator between columns.
columnas_revisadas = ("department", "jobrole", "educationfield")
for posicion, columna in enumerate(columnas_revisadas):
    if posicion:
        print('-------------')
    print(df[columna].unique())

# The role/department column is not cleaned because it is slated for removal.
# NOTE(review): the column is named `roledepartament` in this frame and the drop cell
# below does not list it — confirm whether it should be dropped.

[nan 'research & development' 'sales' 'human resources']
-------------
['research director' 'manager' 'sales executive' 'manufacturing director'
 'research scientist' 'healthcare representative' 'laboratory technician'
 'sales representative' 'human resources']
-------------
[nan 'life sciences' 'technical degree' 'medical' 'other' 'marketing'
 'human resources']


**DROP Columns - `employeecount`, `sameasmonthlyincome`,`numberchildren`, `over18`, `yearsincurrentrole`**

In [28]:
# Remove the columns that are not kept in the cleaned CSV.
columnas_descartadas = [
    "employeecount",
    "sameasmonthlyincome",
    "numberchildren",
    "over18",
    "yearsincurrentrole",
]
df = df.drop(columns=columnas_descartadas)

In [29]:
# Preview the first two rows of the frame after the column drop.
df.head(2)

Unnamed: 0.1,Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,roledepartament,remotework
0,0,51,no,non-travel,2015.72,,6,3,,1,1,0,,3,5,research director,3,,16280.83,42330.17,7,no,13,3.0,3,full time,0,,5,3.0,20,15,15,1972,195370.0,,yes
1,1,52,no,non-travel,2063.39,,1,4,life sciences,2,3,0,,2,5,manager,3,,,43331.17,0,,14,3.0,1,,1,34.0,5,3.0,33,11,9,1971,199990.0,,1


KEEP en BBDD pero no en CSV: 
- numberchildren (todos nan, pero puede ser interesante para la empresa en el futuro)

**DROP Duplicate Rows - `employeenumber` - keep first, reset index**

Duplicados borrados: guardamos solo el primero y reseteamos el índice del df.

In [30]:
# Keep the first row for each employeenumber and renumber the index from 0.
# NOTE(review): missing employeenumber values, if any, count as duplicates of each other — confirm none exist.
repetidos = df.duplicated(subset="employeenumber", keep="first")
df = df.loc[~repetidos].reset_index(drop=True)

In [31]:
# Sanity check: evaluates to False when no employeenumber is repeated.
df.duplicated(subset=["employeenumber"]).any()

np.False_

**REPLACE `gender` - 0 = m, 1 = f**

In [32]:
# Recode the numeric gender flags as letters: 0 -> "m", 1 -> "f"; other values are left unchanged.
codigos = [0, 1]
etiquetas = ["m", "f"]
df["gender"] = df["gender"].replace(codigos, etiquetas)

**REPLACE `maritalstatus`, `overtime` - NaN por "unknown"**

In [33]:
# Number of missing marital-status values before they are filled.
pd.isna(df["maritalstatus"]).sum()

np.int64(651)

In [34]:
# Missing marital status and overtime are recorded explicitly as "unknown".
rellenos = dict.fromkeys(["maritalstatus", "overtime"], "unknown")
df = df.fillna(value=rellenos)

**FILLNA `standardhours` - NaN = full time (dice Pili)**

In [35]:
# Missing standardhours values are recorded as "full time".
sin_jornada = df["standardhours"].isna()
df["standardhours"] = df["standardhours"].mask(sin_jornada, "full time")

**GUARDAR datos limpios a .csv nuevo "raw_data_limpio_3.csv"**

In [36]:
# Write the cleaned frame to disk without the pandas index.
df.to_csv(path_or_buf="raw_data_limpio_3.csv", index=False)

In [None]:
df.shape
# raw_data.csv as loaded in this notebook: 1678 rows, 42 columns (see the earlier df.shape output)
# raw_data_limpio_3.csv (the file written by the previous cell): 1614 rows, 37 columns

(1614, 37)