In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames (no truncation)

In [59]:
df = pd.read_csv("raw_data.csv", index_col= 0) ## use the first CSV column as the index so the phantom index column is not duplicated on export

**Transformamos: quitamos $ y cambiamos "," por "."**


In [60]:
# Values that cambiar_comas could not convert, kept for later inspection.
lista_errores = []

def cambiar_comas(dato):
    """Convert a currency-like cell value to float.

    Strings have the "$" symbol removed and the decimal comma replaced by a
    point before conversion (e.g. "$16280,83" -> 16280.83). Values that are
    already numeric are converted with float() directly instead of being
    discarded, and missing values (NaN) pass through as NaN.

    Args:
        dato: Raw cell value (string, number, NaN or None).

    Returns:
        The value as a float, or np.nan when it cannot be converted. Only
        values that genuinely fail to convert are appended to the
        module-level list lista_errores.
    """
    try:
        if isinstance(dato, str):
            # Strip the currency symbol first, then switch decimal comma to point.
            dato_limpio = dato.replace("$", "").replace(",", ".")
        else:
            # Already numeric (or NaN): nothing to clean, just cast.
            dato_limpio = dato
        return float(dato_limpio)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are handled here; KeyboardInterrupt and
        # other unrelated exceptions are allowed to propagate.
        lista_errores.append(dato)
        return np.nan


In [61]:
# Columns that are passed through cambiar_comas (strip "$", decimal comma -> point, cast to float).
lista_col = [
    "sameasmonthlyincome",
    "salary",
    "monthlyincome",
    "monthlyrate",
    "performancerating",
    "worklifebalance",
    "totalworkingyears",
    "yearsincurrentrole",
]

In [62]:
# Convert every listed column element-wise, printing the column name as a progress trace.
for col in lista_col:
    print(col)
    df[col] = df[col].map(cambiar_comas)

sameasmonthlyincome
salary
monthlyincome
monthlyrate
performancerating
worklifebalance
totalworkingyears
yearsincurrentrole


In [63]:
# Display the converted columns (same names and order as lista_col) to check the result.
df[lista_col]

Unnamed: 0,sameasmonthlyincome,salary,monthlyincome,monthlyrate,performancerating,worklifebalance,totalworkingyears,yearsincurrentrole
0,16280.83,195370.00,16280.83,42330.17,3.0,3.0,,
1,,199990.00,,43331.17,3.0,3.0,34.0,
2,,192320.00,,41669.33,3.0,,22.0,
3,14307.50,171690.00,14307.50,37199.50,3.0,,,
4,12783.92,,12783.92,33238.20,3.0,3.0,,
...,...,...,...,...,...,...,...,...
1673,3949.17,,3949.17,10267.83,3.0,3.0,,
1674,15943.72,191324.62,15943.72,41453.67,3.0,3.0,27.0,
1675,,28111.13,,6090.75,3.0,3.0,6.0,
1676,8339.32,100071.84,8339.32,21682.23,,3.0,,


**Transformamos: redondeamos a 2 decimales**

In [64]:
def redondear_dos_decimales(valor):
    """Cast a value to float and round it to two decimal places.

    Args:
        valor: Raw cell value (number or numeric string).

    Returns:
        The rounded float, or np.nan when the value cannot be converted
        to float (e.g. None or a non-numeric string).
    """
    try:
        return round(float(valor), 2)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to NaN; KeyboardInterrupt and other
        # unrelated exceptions are no longer swallowed.
        return np.nan


In [65]:
# Rate columns that get rounded to two decimals.
columnas_a_redondear = [
    "hourlyrate",
    "dailyrate",
]

In [66]:
# Round every listed column element-wise, printing the column name as a progress trace.
for col in columnas_a_redondear:
    print(col)
    df[col] = df[col].map(redondear_dos_decimales)

hourlyrate
dailyrate


In [67]:
# Display the rounded rate columns to check the result.
# NOTE(review): every hourlyrate value in the rows shown is NaN. redondear_dos_decimales
# maps anything float() rejects to NaN, so confirm the raw hourlyrate values were really
# missing and not, for example, strings that needed cleaning first.
df[["dailyrate", "hourlyrate"]]

Unnamed: 0,dailyrate,hourlyrate
0,2015.72,
1,2063.39,
2,1984.25,
3,1771.40,
4,1582.77,
...,...,...
1673,488.94,
1674,1973.98,
1675,290.04,
1676,1032.49,


**Transformamos: modificamos columnas a minúscula**


In [68]:
# Text columns whose string values are converted to lower case.
cambios_a_minuscula = [
    "department",
    "educationfield",
    "attrition",
    "jobrole",
    "maritalstatus",
    "over18",
    "overtime",
    "standardhours",
    "roledepartament",
    "remotework",
]

In [69]:
# Lower-case only the string cells; NaN and any other non-string value is left untouched.
for col in cambios_a_minuscula:
    # isinstance (instead of an exact type comparison) is the idiomatic check
    # and also covers str subclasses.
    df[col] = df[col].apply(lambda dato: dato.lower() if isinstance(dato, str) else dato)

In [70]:
# Display the lower-cased columns (same names and order as cambios_a_minuscula) to check the result.
df[cambios_a_minuscula]

Unnamed: 0,department,educationfield,attrition,jobrole,maritalstatus,over18,overtime,standardhours,roledepartament,remotework
0,,,no,research director,,y,no,full time,,yes
1,,life sciences,no,manager,,,,,,1
2,research & development,technical degree,no,manager,married,,no,,manager - research & development,1
3,,medical,no,research director,married,y,,full time,,false
4,,technical degree,no,sales executive,divorced,y,no,,,0
...,...,...,...,...,...,...,...,...,...,...
1673,,medical,no,research scientist,single,,,,,yes
1674,,,no,manager,married,y,no,full time,,false
1675,,,no,research scientist,,,no,part time,,false
1676,,life sciences,no,manufacturing director,divorced,,yes,part time,,yes


**Transformamos: columna "maritalstatus", corregimos 1 valor mal escrito ("marreid" → "married")**

In [71]:
# Count each marital-status label to spot misspelled categories.
df["maritalstatus"].value_counts()

maritalstatus
married     419
single      343
divorced    205
marreid      36
Name: count, dtype: int64

In [72]:
# Fold the misspelled label into the correct category.
df["maritalstatus"] = df["maritalstatus"].replace({"marreid": "married"})

In [73]:
# Re-count the labels to confirm the misspelled category is gone.
df["maritalstatus"].value_counts()

maritalstatus
married     455
single      343
divorced    205
Name: count, dtype: int64

**Transformamos: valores NaN por 'non-travel' en columna ``businesstravel`` basados en la información que nos ha proporcionado nuestro enlace con el proyecto**

In [74]:
# Missing businesstravel values are filled with "non-travel", as indicated by the project contact.
df["businesstravel"] = df["businesstravel"].fillna("non-travel")

In [75]:
# Count the categories to verify the result of filling the missing values.
df['businesstravel'].value_counts()

businesstravel
non-travel           894
travel_rarely        616
travel_frequently    168
Name: count, dtype: int64

**Transformamos: eliminamos el espaciado sobrante en los valores de las columnas ``department``, ``jobrole`` y ``educationfield``**

In [77]:
# Print the distinct raw values of each text column to reveal stray surrounding spaces.
print(df['department'].unique())
print('-------------')
print(df['jobrole'].unique())
print(df['educationfield'].unique())

[nan ' research & development ' ' sales ' ' human resources ']
-------------
[' research director ' ' manager ' ' sales executive '
 ' manufacturing director ' ' research scientist '
 ' healthcare representative ' ' laboratory technician '
 ' sales representative ' ' human resources ']
[nan 'life sciences' 'technical degree' 'medical' 'other' 'marketing'
 'human resources']


In [78]:
def sin_espaciado_extra(series):
    """Return the string Series without leading/trailing whitespace and with
    every internal run of whitespace collapsed to a single space."""
    recortada = series.str.strip()
    compactada = recortada.str.replace(r'\s+', ' ', regex=True)
    return compactada

# Columns whose text values get whitespace-normalised (example selection).
columnas_a_limpiar = ['department', 'jobrole', 'educationfield']

for c in columnas_a_limpiar:
    # Names that are not present in the frame are skipped.
    if c not in df.columns:
        continue
    df[c] = sin_espaciado_extra(df[c])


In [79]:
# Print the distinct values again to confirm the whitespace clean-up.
print(df['department'].unique())
print('-------------')
print(df['jobrole'].unique())
print('-------------')
print(df['educationfield'].unique())

# 'roledepartament' does not need cleaning because it is slated for removal

[nan 'research & development' 'sales' 'human resources']
-------------
['research director' 'manager' 'sales executive' 'manufacturing director'
 'research scientist' 'healthcare representative' 'laboratory technician'
 'sales representative' 'human resources']
-------------
[nan 'life sciences' 'technical degree' 'medical' 'other' 'marketing'
 'human resources']


**Guardar datos limpios a .csv nuevo "raw_data_limpio.csv"**

In [80]:
# index=False keeps the DataFrame index out of the exported file.
df.to_csv("raw_data_limpio.csv", index=False)