In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# ------------------------------------------------------------------------------
from scipy.stats import shapiro, kstest

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("raw_data_limpio.csv", index_col= 0) ## de cara a la exportación para no duplicar el index fantasma

In [None]:
# QUEDA - si `overtime` deberia ser fillna a 'no' o 'unknown' -- esta abajo en replace comentado
# FALTA - limpiar/sacar informacion de `roledepartment` porque está destinada a eliminación

**Transformamos: quitamos $ y cambiamos "," por "."**


`sameasmonthlyincome`, `salary`, `monthlyincome`, `monthlyrate`, `performancerating`, `worklifebalance`, `totalworkingyears`, `yearsincurrentrole`

In [4]:
lista_errores = []

def cambiar_comas(dato):
    try:
        # Primero quitamos el símbolo $, luego cambiamos coma por punto
        dato_limpio = dato.replace("$", "").replace(",", ".")
        return float(dato_limpio)
    except:
        lista_errores.append(dato)
        return np.nan


In [5]:
lista_col = ["sameasmonthlyincome",  "salary", "monthlyincome", "monthlyrate", "performancerating", "worklifebalance", "totalworkingyears", "yearsincurrentrole"]

In [6]:
for col in lista_col:
    print(col)
    df[col] = df[col].apply(cambiar_comas)

sameasmonthlyincome
salary
monthlyincome
monthlyrate
performancerating
worklifebalance
totalworkingyears
yearsincurrentrole


In [7]:
df[["sameasmonthlyincome",  "salary", "monthlyincome", "monthlyrate", "performancerating", "worklifebalance", "totalworkingyears", "yearsincurrentrole"]]

Unnamed: 0_level_0,sameasmonthlyincome,salary,monthlyincome,monthlyrate,performancerating,worklifebalance,totalworkingyears,yearsincurrentrole
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
51,,,,,,,,
52,,,,,,,,
42,,,,,,,,
47,,,,,,,,
46,,,,,,,,
...,...,...,...,...,...,...,...,...
43,,,,,,,,
47,,,,,,,,
29,,,,,,,,
47,,,,,,,,


**Transformamos: redondeamos a 2 decimales `hourlyrate`, `dailyrate`**

In [8]:
def redondear_dos_decimales(valor):
    try:
        return round(float(valor), 2)
    except:
        return np.nan 


In [9]:
columnas_a_redondear = ["hourlyrate", "dailyrate"]

In [10]:
for col in columnas_a_redondear:
    print(col)
    df[col] = df[col].apply(redondear_dos_decimales)

hourlyrate
dailyrate


In [11]:
df[["dailyrate", "hourlyrate"]]

Unnamed: 0_level_0,dailyrate,hourlyrate
age,Unnamed: 1_level_1,Unnamed: 2_level_1
51,2015.72,
52,2063.39,
42,1984.25,
47,1771.40,
46,1582.77,
...,...,...
43,488.94,
47,1973.98,
29,290.04,
47,1032.49,


**Transformamos: modificamos columnas a minuscula**


`"department`, `educationfield`, `attrition`, `jobrole`, `maritalstatus`, `over18`, `overtime`, `standardhours`, `roledepartament`, `remotework`

In [12]:
cambios_a_minuscula = ["department", "educationfield", "attrition", "jobrole", "maritalstatus", "over18", "overtime", "standardhours", "roledepartament", "remotework"]

In [13]:
for col in cambios_a_minuscula:
    df[col] = df[col].apply(lambda dato: dato.lower() if type(dato) == str else dato)

In [14]:
df[["department", "educationfield", "attrition", "jobrole", "maritalstatus", "over18", "overtime", "standardhours", "roledepartament", "remotework"]]

Unnamed: 0_level_0,department,educationfield,attrition,jobrole,maritalstatus,over18,overtime,standardhours,roledepartament,remotework
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
51,,,no,research director,,y,no,full time,,yes
52,,life sciences,no,manager,,,,,,1
42,research & development,technical degree,no,manager,married,,no,,manager - research & development,1
47,,medical,no,research director,married,y,,full time,,false
46,,technical degree,no,sales executive,divorced,y,no,,,0
...,...,...,...,...,...,...,...,...,...,...
43,,medical,no,research scientist,single,,,,,yes
47,,,no,manager,married,y,no,full time,,false
29,,,no,research scientist,,,no,part time,,false
47,,life sciences,no,manufacturing director,divorced,,yes,part time,,yes


**Transformamos: columna `maritalstatus` 1 dato mal escritos**

In [15]:
df["maritalstatus"].value_counts()

maritalstatus
married     455
single      343
divorced    205
Name: count, dtype: int64

In [16]:
df["maritalstatus"] = df["maritalstatus"].replace("marreid", "married")

In [17]:
df["maritalstatus"].value_counts()

maritalstatus
married     455
single      343
divorced    205
Name: count, dtype: int64

**Transformamos: valores NaN por 'non-travel' en columna ``businesstravel`` basados en la información que nos ha proporcionado nuestro enlace con el proyecto**

In [18]:
df["businesstravel"] = df["businesstravel"].fillna("non-travel")

In [19]:
df['businesstravel'].value_counts()

businesstravel
non-travel           894
travel_rarely        616
travel_frequently    168
Name: count, dtype: int64

**Transformamos: espaciado sobrante en valores de la columna ``department``, ``jobrole`` y ``educationfield``**

In [20]:
print(df['department'].unique())
print('-------------')
print(df['jobrole'].unique())
print(df['educationfield'].unique())

[nan 'research & development' 'sales' 'human resources']
-------------
['research director' 'manager' 'sales executive' 'manufacturing director'
 'research scientist' 'healthcare representative' 'laboratory technician'
 'sales representative' 'human resources']
[nan 'life sciences' 'technical degree' 'medical' 'other' 'marketing'
 'human resources']


In [21]:
def sin_espaciado_extra(series):
    return series.str.strip().str.replace(r'\s+', ' ', regex=True)

columnas_a_limpiar = ['department', 'jobrole', 'educationfield']  # Ejemplo de columnas

for c in columnas_a_limpiar:

    if c in df.columns:  
        df[c] = sin_espaciado_extra(df[c])


In [22]:
print(df['department'].unique())
print('-------------')
print(df['jobrole'].unique())
print('-------------')
print(df['educationfield'].unique())

# no nos hace falta limpiar 'roledepartment' porque está destinada a eliminación

[nan 'research & development' 'sales' 'human resources']
-------------
['research director' 'manager' 'sales executive' 'manufacturing director'
 'research scientist' 'healthcare representative' 'laboratory technician'
 'sales representative' 'human resources']
-------------
[nan 'life sciences' 'technical degree' 'medical' 'other' 'marketing'
 'human resources']


**DROP Columns - `employeecount`, `sameasmonthlyincome`,`numberchildren`, `over18`, `yearsincurrentrole`**

In [23]:
df = df.drop(['employeecount', 'sameasmonthlyincome', 'numberchildren', 'over18', 'yearsincurrentrole'], axis=1)

In [24]:
df.head(2)

Unnamed: 0_level_0,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,roledepartament,remotework
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
51,no,non-travel,2015.72,,6,3,,1,1,0,,3,5,research director,3,,,,7,no,13,,3,full time,0,,5,,20,15,15,1972,,,yes
52,no,non-travel,2063.39,,1,4,life sciences,2,3,0,,2,5,manager,3,,,,0,,14,,1,,1,,5,,33,11,9,1971,,,1


KEEP en BBDD pero no en CSV: 
- numberchildren (todos nan, pero puede ser interesante para la empresa en el futuro)

**DROP Duplicate Rows - `employeenumber` - keep first, reset index**

Duplicados borrados, quardamos solo el primero, reset indice del df.

In [25]:
df = df.drop_duplicates(subset='employeenumber', keep='first').reset_index(drop=True)

In [26]:
df['employeenumber'].duplicated().any()

np.False_

**REPLACE `genders` - 0 = m, 1 = f**

In [27]:
df['gender'] = df['gender'].replace({0: 'm', 1: 'f'})

**REPLACE `maritalstatus` - NaN por "unknown"**

In [28]:
df['maritalstatus'].isna().sum()

np.int64(651)

In [29]:
df['maritalstatus'] = df['maritalstatus'].fillna('unknown')

**REPLACE `overtime` TBD - NaN por "unknown" o "no"**

In [30]:
# df['overtime'] = df['overtime'].fillna('unknown' OR 'no')

In [31]:
# df['overtime'].isna().sum()

**FILLNA `standardhours` - NaN = full time (dice Pili)**

In [32]:
df['standardhours'] = df['standardhours'].fillna('full time')

**GUARDAR datos limpios a .csv nuevo "raw_data_limpio.csv"**

In [33]:
df.to_csv("raw_data_limpio_2.csv", index=False)

In [None]:
df.shape
# raw_data.csv original: 1678 rows, 41 columns
# raw_data_limpio_2.csv: 1614 rows, 35 columns

(1614, 35)